from src.dataUtil.parseData import splitArray
from src.dataUtil.dataReader import queryData, queryMutableData
%load_ext autoreload
%autoreload 2
"""
Data queried is a dictionary with all the necessary fields stored in arrays
"""
dataQueried = \
{
    'imdb user rating':[], 
    'metascore': [], 
    'number of imdb user votes':[], 
    'producers': [],
    'awards':[]
}
"""
To get the data, define the diictionary and query
the data with queryData. This will fill the dictionary
with the necessary data.
"""
data = queryData("movie_corpus_kaggle_dataset/movie_meta_data.csv")
"""
To get number of rewards, run the function below
"""
def getRewardsNum(dataQueried):
    return splitArray(dataQueried['awards'], " ")
"""
To get number of producers, run the function below
"""
def getProducersNum(dataQueried):
    return splitArray(dataQueried['producers'], ", ")

def parseQueriedData(data):
    ret = {}
    for d in data:
        ret[d] = {}
        for k in data[d]:
            if k == 'producers':
                ret[d][k] = splitArray(data[d][k], ", ")
            elif k == 'awards':
                ret[d][k] = splitArray(data[d][k], " ")
            else:
                ret[d][k] = float(data[d][k])
    return ret
    
new_data = queryMutableData("movie_corpus_kaggle_dataset/movie_meta_data.csv", dataQueried, key='title')
new_data = parseQueriedData(new_data)


print(data['A Night at the Roxbury'])

{'imdb_rating': 6, 'metascore': 26, 'imdb_votes': 56537, 'producers': 6, 'awards': 1}


import pickle

with open('data/tmdb_5000_dataset/tmdb_data.pickle', 'rb') as handle:
    tmdb_data = pickle.load(handle)

with open('data/revenue/actor_data.pickle', 'rb') as handle:
    actor_data = pickle.load(handle)

with open('data/revenue/director_data.pickle', 'rb') as handle:
    director_data = pickle.load(handle)

print(tmdb_data['10 Days in a Madhouse'])
print(actor_data['Aamir Khan'])
print(director_data['Christopher Nolan'])

{'movie_id': '345003', 'movie_title': '10 Days in a Madhouse', 'star_cast': ['Caroline Barry', 'Christopher Lambert', 'Kelly LeBrock', 'Julia Chantrey', 'Alexandra Callas', 'Natalia Davidenko', 'Katie Singleton', 'Jessa Campbell', 'Andi Morrow', 'Everette Scott Ortiz', 'Saskia Larsen', 'Talya Mar', 'Susan Goforth', 'David Lee Garver', 'Bob Olin', 'Darlene Sellers', 'Monique Robbins', 'Kaitlin Otoole', 'Darrell Salk', 'Gwyn LaRee', 'Michael Swanson', 'Corrina Cornforth'], 'budget': 1200000, 'overview': "Nellie Bly, a 23 year-old reporter for Joseph Pulitzer, goes undercover in the notorious Blackwell's Island women's insane asylum in order to expose corruption, abuse and murder.", 'popularity_score': 0.489271, 'production_companies': [], 'revenue': 0}
['4', '$106,716,335']
['13', '$381,193,202']


star_count = 0
in_db = 0
w_star = 0
movies = 0
temp_data = data.copy()
for i in temp_data:
    if i in tmdb_data:
        movies += 1
        data[i]['budget'] = tmdb_data[i]['budget'] 
        data[i]['revenue'] = tmdb_data[i]['revenue']
        data[i]['popularity_score'] = tmdb_data[i]['popularity_score']
        data[i]['overview'] = tmdb_data[i]['overview']

        highest_star = 0
        for star in tmdb_data[i]['star_cast']:
            star_count += 1
            if star in actor_data:
                in_db += 1
                avg_salary = int(actor_data[star][1].replace(',', '').replace('$', ''))
                num_movies = int(actor_data[star][0])
                highest_star = int(max(highest_star, avg_salary / num_movies))
        if highest_star == 0:
            w_star += 1
            del data[i]

        else:
            data[i]['highest_star_revenue'] = highest_star

    else:
        del data[i]

print(f"Number of movies in both datasets:{movies}")
print(f"Stars in movie dataset:{star_count}\nSubset of stars present in actor dataset:{in_db}")
print(f"Number of movies without a single actor in dataset: {w_star}")
print(f"Usable data : {len(data)}")

Number of movies in both datasets:1356
Stars in movie dataset:39216
Subset of stars present in actor dataset:11665
Number of movies without a single actor in dataset: 9
Usable data : 1347


print(data['2012'])

{'imdb_rating': 5, 'metascore': 49, 'imdb_votes': 350359, 'producers': 10, 'awards': 1, 'budget': 200000000, 'revenue': 769653595, 'popularity_score': 45.274225, 'overview': 'Dr. Adrian Helmsley, part of a worldwide geophysical team investigating the effect on the earth of radiation from unprecedented solar storms, learns that the earth\'s core is heating up. He warns U.S. President Thomas Wilson that the crust of the earth is becoming unstable and that without proper preparations for saving a fraction of the world\'s population, the entire race is doomed. Meanwhile, writer Jackson Curtis stumbles on the same information. While the world\'s leaders race to build "arks" to escape the impending cataclysm, Curtis struggles to find a way to save his family. Meanwhile, volcanic eruptions and earthquakes of unprecedented strength wreak havoc around the world.', 'highest_star_revenue': 83508259}


import numpy as np
from src.PCA import PCA

movieNames = list(data.keys())
dataArr = [data[movieName] for movieName in movieNames]

xLabels = ['producers', 'budget', 'highest_star_revenue']
yLabels = [a for a in dataArr[0].keys() if a not in xLabels and a != 'overview']

xFeatures = np.array([[d[a] for a in xLabels] for d in dataArr])
yFeatures = np.array([[d[a] for a in yLabels] for d in dataArr])
print(yLabels)
print(yFeatures)

print("3 features:")
print(xFeatures[:3])
#print(yFeatures[:10])

reducedXFeatures, xModel = PCA.reduceDimension(xFeatures, 2)
print()
print("2 reduced features:")
print(reducedXFeatures[:3])

print()
print("3 features (approximated from reduced features):")
print(PCA.returnDimension(reducedXFeatures, xModel)[:3])

['imdb_rating', 'metascore', 'imdb_votes', 'awards', 'revenue', 'popularity_score']
[[6.00000000e+00 2.60000000e+01 5.65370000e+04 1.00000000e+00
  3.03311650e+07 1.27092270e+01]
 [6.00000000e+00 4.00000000e+01 1.29220000e+04 1.00000000e+00
  0.00000000e+00 5.77872400e+00]
 [8.00000000e+00 6.90000000e+01 1.25114400e+06 1.00000000e+00
  1.51955791e+09 1.44448633e+02]
 ...
 [6.00000000e+00 4.20000000e+01 2.15514000e+05 1.00000000e+00
  1.03039258e+08 5.89913880e+01]
 [7.00000000e+00 6.10000000e+01 5.92815000e+05 4.00000000e+00
  7.09709780e+08 2.54684930e+01]
 [6.00000000e+00 3.60000000e+01 5.18710000e+04 1.00000000e+00
  3.42272980e+07 2.48211380e+01]]
3 features:
[[        6  17000000  27508687]
 [        3         0  89620264]
 [        8 220000000  70613951]]

2 reduced features:
[[-5.74412908e+07 -1.95736152e+07]
 [ 2.53931357e+06 -4.30081003e+07]
 [ 6.76287806e+06  1.77770978e+08]]

3 features (approximated from reduced features):
[[7.27982453e+00 1.70000000e+07 2.75086870e+07]
 [7.05731520e+00 5.21540642e-08 8.96202640e+07]
 [9.32942022e+00 2.20000000e+08 7.06139510e+07]]


#import numpy as np
#from sklearn.linear_model import LinearRegression
from src.linearRegression import linearRegression

#REDUCED FEATURE LEARNING (Vittorio Corbo 2022 [brought to you from Tokyo japan, kawai XD])

X = reducedXFeatures
y = yFeatures

#print(y.shape)

#global learning
print("global learning:")
reg = linearRegression.linearRegression(X, y)
print()
#select feature learning
for i in range(y.shape[1]):
    print("label:",i,yLabels[i])
    y_prime = yFeatures[:,i]
    reg = linearRegression.linearRegression(X, y_prime)
    print()

global learning:
R^2 score: 0.14801742877413732

label: 0 imdb_rating
R^2 score: 0.0031848902798871093

label: 1 metascore
R^2 score: 0.008595016025802593

label: 2 imdb_votes
R^2 score: 0.11691434914755117

label: 3 awards
R^2 score: 0.07466758239374383

label: 4 revenue
R^2 score: 0.5078484708793156

label: 5 popularity_score
R^2 score: 0.1768942639184985


#Using non-Reduced Features
X = xFeatures
y = yFeatures

#print(y.shape)

#global learning
print("global learning:")
reg = linearRegression.linearRegression(X, y)
print()
#select feature learning
for i in range(y.shape[1]):
    print("label:",i,yLabels[i])
    y_prime = yFeatures[:,i]
    reg = linearRegression.linearRegression(X, y_prime)
    print()

global learning:
R^2 score: 0.15004604407682512

label: 0 imdb_rating
R^2 score: 0.0033368656498697913

label: 1 metascore
R^2 score: 0.011772564300144506

label: 2 imdb_votes
R^2 score: 0.11787092665256937

label: 3 awards
R^2 score: 0.07617886955216091

label: 4 revenue
R^2 score: 0.5116216387074242

label: 5 popularity_score
R^2 score: 0.1794953995987596


import matplotlib.pyplot as plt
import sklearn.metrics as metrics
X = PCA.returnDimension(reducedXFeatures, xModel)

y = yFeatures

N = np.shape(X)[0]
numTest = int(0.1 * N)

reg = linearRegression.linearRegression(X[numTest + 1:, 0:1], y[numTest + 1:,4])
pred = reg.predict(X[:numTest, 0:1])
rmse = metrics.mean_squared_error(y[ :numTest,4],pred, squared = False)
print("Normalized RMSE:", rmse/(np.max(pred) - np.min(pred)))
print("Mean Squared Error:", metrics.mean_squared_error(y[ :numTest,4],pred, squared = True))
print("Average revenue per producers", reg.coef_)
line = plt.plot(X[:numTest, 0:1], pred)
thing = plt.scatter(X[numTest + 1:, 0:1],y[numTest + 1:,4], color="green")
plt.legend()
plt.show()

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.

R^2 score: 0.4236563610684021
Normalized RMSE: 0.29445077965965505
Mean Squared Error: 5.4919648213861304e+16
Average revenue per producers [2.99456329e+08]


reg = linearRegression.linearRegression(X[numTest + 1:, 1:2], y[numTest + 1:,4])
pred = reg.predict(X[:numTest, 1:2])
rmse = metrics.mean_squared_error(y[ :numTest,4],pred, squared = False)
print("Normalized RMSE:", rmse/(np.max(pred) - np.min(pred)))
print("Mean Squared Error:", metrics.mean_squared_error(y[ :numTest,4],pred, squared = True))
print("Average number of revenue per $1 in budget", reg.coef_)

lines =plt.plot(X[:numTest, 1:2], pred)
plt.scatter(X[numTest + 1:, 1:2],y[numTest + 1:,4], color="green")

plt.show()

R^2 score: 0.4912014598650998
Normalized RMSE: 0.290780439097314
Mean Squared Error: 4.779225354820937e+16
Average number of revenue per $1 in budget [3.17223404]


reg = linearRegression.linearRegression(X[numTest + 1:, 2:], y[numTest + 1:,4])
pred = reg.predict(X[:numTest, 2:])
rmse = metrics.mean_squared_error(y[ :numTest,4],pred, squared = False)
print("Normalized RMSE:", rmse/(np.max(pred) - np.min(pred)))
print("Mean Squared Error:", metrics.mean_squared_error(y[ :numTest,4],pred, squared = True))
print("Ratio of Revenue earned: highest Paid Actor", reg.coef_)
plt.plot(X[:numTest, 2:], pred, color="green")
plt.scatter(X[numTest + 1:, 2:],y[numTest + 1:,4])
plt.show()

R^2 score: 0.10973223694468126
Normalized RMSE: 0.6573184514449396
Mean Squared Error: 7.482833295461304e+16
Ratio of Revenue earned: highest Paid Actor [0.48788192]


from src.randomForest import randomForest
from sklearn import metrics

X = xFeatures
y = yFeatures

print(y.shape)
print(X.shape)
print(xLabels)
print(yLabels)

for i in range(y.shape[1]):
    print(f"Training model to predict {yLabels[i]}")
    reg = randomForest.randomForest(X, y[:,i])
    print('\n')

(1347, 6)
(1347, 3)
['producers', 'budget', 'highest_star_revenue']
['imdb_rating', 'metascore', 'imdb_votes', 'awards', 'revenue', 'popularity_score']
Training model to predict imdb_rating
Training Score : 0.8502414953524309
Test score: -0.042033993840185246


Training model to predict metascore
Training Score : 0.858252230563187
Test score: 0.16993996835752878


Training model to predict imdb_votes
Training Score : 0.8578800992679925
Test score: 0.057844445677811995


Training model to predict awards
Training Score : 0.8530424851208551
Test score: -0.08107812499999967


Training model to predict revenue
Training Score : 0.9247816748852608
Test score: 0.6086262053343992


Training model to predict popularity_score
Training Score : 0.8542878747255873
Test score: 0.11783838849212125


#revenue usng forest regressor and plotting first 50

from sklearn.model_selection import train_test_split

print(f"Training model to predict {yLabels[4]}")
reg = randomForest.randomForest(X, y[:,4])
print('\n')

x_train, x_test, y_train, y_test = train_test_split(xFeatures, yFeatures, random_state=1, test_size = 0.1)

y_rev = y_test[:, 4]

pred_rev = reg.predict(x_test)

# print(y_imdb)
# print(pred_imdb)

plt.figure()
plt.plot(y_rev[0:50], color = 'blue')
plt.plot(pred_rev[0:50], color = 'red')
plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
plt.show()

print("Revenue Normalized MSE:", metrics.mean_squared_error(y_rev, pred_rev, squared=False))
print("Revenue R^2 Error:", metrics.r2_score(y_rev, pred_rev))
print("Revenue Normalized MSE:", metrics.mean_squared_error(y_rev, pred_rev, squared=False) /(np.max(pred_rev) - np.min(pred_rev)))

Training model to predict revenue
Training Score : 0.9214863716632893
Test score: 0.6171737156621906

Revenue Normalized MSE: 128728640.45357291
Revenue R^2 Error: 0.6171737156621906
Revenue Normalized MSE: 0.11627703683729285


#metascore rating usng forest regressor and plotting next 50

from sklearn.model_selection import train_test_split

print(f"Training model to predict {yLabels[1]}")
reg = randomForest.randomForest(X, y[:,1])
print('\n')

x_train, x_test, y_train, y_test = train_test_split(xFeatures, yFeatures, random_state=1, test_size = 0.1)

y_meta = y_test[:, 1]

pred_meta = reg.predict(x_test)

plt.figure()
plt.plot(y_meta[0:50], color = 'blue')
plt.plot(pred_meta[0:50], color = 'red')
plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
plt.show()

print("Metascore Normalized MSE:", metrics.mean_squared_error(y_meta, pred_meta, squared=False))
print("Metascore R^2 Error:", metrics.r2_score(y_meta, pred_meta))
print("Metascore Normalized MSE:", metrics.mean_squared_error(y_meta, pred_meta, squared=False) /(np.max(pred_meta) - np.min(pred_meta)))

Training model to predict metascore
Training Score : 0.8603002819441713
Test score: 0.12442883561610174

Metascore Normalized MSE: 21.154185066359457
Metascore R^2 Error: 0.12442883561610174
Metascore Normalized MSE: 0.2392736688876762


#imdb usng forest regressor and plotting  50
from sklearn.model_selection import train_test_split

print(f"Training model to predict {yLabels[0]}")
reg = randomForest.randomForest(X, y[:,0])
print('\n')

x_train, x_test, y_train, y_test = train_test_split(xFeatures, yFeatures, random_state=1, test_size = 0.1)

y_imdb = y_test[:, 0]

pred_imdb = reg.predict(x_test)

plt.figure()
plt.plot(y_meta[0:50], color = 'blue')
plt.plot(pred_meta[0:50], color = 'red')
plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
plt.show()

print("IMDB Normalized MSE:", metrics.mean_squared_error(y_imdb, pred_imdb, squared=False))
print("IMDB R^2 Error:", metrics.r2_score(y_imdb, pred_imdb))
print("IMDB Normalized MSE:", metrics.mean_squared_error(y_imdb, pred_imdb, squared=False) /(np.max(pred_imdb) - np.min(pred_imdb)))

Training model to predict imdb_rating
Training Score : 0.8474086168833846
Test score: -0.05514071807786558

IMDB Normalized MSE: 1.3042981569811891
IMDB R^2 Error: -0.05514071807786558
IMDB Normalized MSE: 0.1952542151169445


from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

embedder = SentenceTransformer('all-MiniLM-L6-v2')

print(len(data))

temp_x = [i for i in data if data[i]['revenue'] > 10000000]

print(len(temp_x))

x_train = list(temp_x)[:int(len(temp_x) * 0.8)]
x_test = list(temp_x)[int(len(temp_x) * 0.8):]

y_train = [data[key]['revenue'] for key in x_train]
y_test = [data[key]['revenue'] for key in x_test]

print(f"training data : {len(x_train), len(y_train)}, testing data : {len(x_test), len(y_test)}")

e_map = {}

try:
    with open('sentence_encoding.pickle', 'rb') as handle:
        e_map = pickle.load(handle)

except:
    for i in x_train:
        e_map[i] = embedder.encode([data[i]['overview']])[0]

    with open('sentence_encoding.pickle', 'wb') as handle:
        pickle.dump(e_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Perform kmean clu stering
num_clusters = 30
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(list(e_map.values()))
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(list(e_map.keys())[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

1347
1107
training data : (885, 885), testing data : (222, 222)
Cluster  1
['The World Is Not Enough', 'The Abyss', 'Avatar', 'Deep Rising', 'The Hunt for Red October', 'Master and Commander: The Far Side of the World', "Ocean's Twelve", 'Paul', 'Pirates of the Caribbean: The Curse of the Black Pearl', 'The Rock', 'Serenity', 'Sphere', 'Three Kings', 'Sanctum', 'Ghost Ship']

Cluster  2
['The Deep End of the Ocean', 'The Exorcist', 'Practical Magic', 'A Nightmare on Elm Street 5: The Dream Child', 'Bad Teacher', 'Annabelle', 'The Box', 'Carrie', 'Case 39', 'Changeling', 'Colombiana', 'Coraline', 'Cruel Intentions', 'Date Night', 'Drag Me to Hell', 'Drive Angry', 'Eastern Promises', 'Easy A', 'Erin Brockovich', 'Eternal Sunshine of the Spotless Mind', 'The Fault in Our Stars', 'Final Destination 2', 'Gothika', 'The Grudge', 'Hanna', "Jennifer's Body", 'Juno', 'The Long Kiss Goodnight', 'The Next Three Days', "One Flew Over the Cuckoo's Nest", 'Peggy Sue Got Married', 'Precious', 'Prom Night', 'The Roommate', 'Stir of Echoes', 'The Rage: Carrie 2', 'What Lies Beneath', 'When a Stranger Calls', 'End of Days', 'The Boss', 'The Conspirator', 'The Debt', 'The Howling', 'The Phantom of the Opera', 'Labor Day', 'Duplex', 'Flightplan', 'Good Luck Chuck', 'A Nightmare on Elm Street', 'Philomena', 'The Scarlet Letter', "Winter's Bone", 'Atonement', "Pan's Labyrinth"]

Cluster  3
['The Avengers', 'Malcolm X', 'Hollow Man', 'Ali', 'Apocalypse Now', 'Austin Powers: The Spy Who Shagged Me', 'The Black Dahlia', 'Broken Arrow', 'Confessions of a Dangerous Mind', 'Dances with Wolves', 'Despicable Me 2', 'G.I. Joe: The Rise of Cobra', 'Hard Rain', 'Hellboy', 'The Ides of March', 'Inglourious Basterds', 'Kung Fu Panda', 'The Last Samurai', 'Lord of War', 'The Losers', 'Mad Max 2: The Road Warrior', 'The Manchurian Candidate', 'The Mask', 'Public Enemies', 'Straight Outta Compton', 'Valkyrie', 'Wild Wild West', 'xXx', 'Boyz n the Hood', 'J. Edgar', 'The Naked Gun 2Â½: The Smell of Fear', 'Patton', 'Rambo: First Blood Part II', 'RED', 'Iron Man', 'American Outlaws', 'Butch Cassidy and the Sundance Kid', 'Courage Under Fire', 'The Four Feathers', 'Glory Road', 'The Thin Red Line', 'The Incredible Hulk', '47 Ronin', 'Despicable Me']

Cluster  4
['A Night at the Roxbury', 'The Change-Up', 'Gremlins', 'Hall Pass', 'The Hangover', 'Happy Feet', 'High Fidelity', 'Horrible Bosses', 'Legend', 'Scott Pilgrim vs. the World', 'Speed Racer', 'This Is 40', 'Disturbing Behavior', "A Hard Day's Night", 'Talladega Nights: The Ballad of Ricky Bobby', 'The Flintstones', 'Poltergeist', 'Stuart Little 2']

Cluster  5
['The Blair Witch Project', 'AlienÂ³', 'The Boxtrolls', 'Chronicle', 'Ex Machina', 'Fantastic Four', 'The Fifth Element', 'Final Destination', 'I Am Number Four', 'Monte Carlo', 'Moonrise Kingdom', 'Mud', 'The Pacifier', 'Pandorum', 'Panic Room', 'White Squall', 'Wild Hogs', 'As Above, So Below', 'Scooby-Doo', 'Sorority Row', 'The Crazies', 'The Fog', 'The Mist', 'Timeline', 'The Faculty', 'Jeepers Creepers', 'E.T. the Extra-Terrestrial']

Cluster  6
['Fantasia 2000', 'The Princess Bride', '12 Years a Slave', 'Aladdin', 'Anastasia', 'Antz', 'The Lord of the Rings: The Fellowship of the Ring', 'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe', 'Clash of the Titans', 'Conan the Barbarian', 'Dragonslayer', 'Frozen', 'Hellboy II: The Golden Army', 'How to Train Your Dragon 2', 'How to Train Your Dragon', 'Into the Woods', 'The Lord of the Rings: The Return of the King', 'The Lord of the Rings: The Two Towers', 'Mulan', 'The Prestige', 'Queen of the Damned', 'Rise of the Guardians', 'Shrek the Third', 'Shrek', 'Snow White and the Huntsman', 'The Hobbit: An Unexpected Journey', 'Thor', 'The Three Musketeers', 'The Wizard of Oz', 'Alice in Wonderland', 'Big Trouble in Little China', 'Percy Jackson & the Olympians: The Lightning Thief', 'Prince of Persia: The Sands of Time', 'The NeverEnding Story', 'The Smurfs', "The Sorcerer's Apprentice", 'Time Bandits', 'Your Highness', 'Beowulf', 'Highlander: The Final Dimension', 'Highlander: Endgame', 'Kingdom of Heaven', 'The Lion King', 'Chocolat']

Cluster  7
['The Addams Family', 'Resident Evil: Afterlife', 'Batman Returns', 'Beasts of the Southern Wild', 'Fantastic Mr. Fox', 'Finding Nemo', 'Frankenweenie', 'Gremlins 2: The New Batch', 'The Haunting', 'Hellraiser', 'Indiana Jones and the Last Crusade', 'Land of the Dead', 'ParaNorman', 'Pet Sematary', 'Priest', 'Psycho', 'Red Riding Hood', 'TMNT', 'Witness', '10 Cloverfield Lane', 'Indiana Jones and the Kingdom of the Crystal Skull', 'Night at the Museum: Battle of the Smithsonian', 'Paranormal Activity', 'We Bought a Zoo', 'Beetlejuice', 'Batman Forever']

Cluster  8
['A Nightmare on Elm Street 3: Dream Warriors', 'A Nightmare on Elm Street 4: The Dream Master', "Freddy's Dead: The Final Nightmare", 'Halloween', 'Halloween: Resurrection', 'Blade: Trinity', 'Blade', 'Dawn of the Dead', 'The Evil Dead', 'Fright Night', 'Insidious', 'Super 8', '1408', '30 Days of Night', 'Black Christmas', 'Frailty', 'Blade II', 'Halloween 4: The Return of Michael Myers', 'Twilight Zone: The Movie', 'House on Haunted Hill', 'Scary Movie 2', 'The Sixth Sense', 'Let Me In']

Cluster  9
['Amour', 'Blue Valentine', 'Cars 2', 'Chasing Amy', 'The Deer Hunter', 'Dumb and Dumber', 'Love & Basketball', "My Best Friend's Wedding", 'Orphan', 'Runaway Bride', "Something's Gotta Give", 'Water for Elephants', 'Unfaithful', 'Lars and the Real Girl', 'Alfie', 'Catwoman', 'Pearl Harbor', 'The Wedding Date', 'When Harry Met Sally...', 'Enough Said']

Cluster  10
['Alien', 'Jason X', 'King Kong', '2012', 'District 9', 'Alien: Resurrection', 'Aliens', 'Angels & Demons', 'Armageddon', 'The Book of Eli', 'The Croods', 'The Day the Earth Stood Still', 'Dune', 'Edward Scissorhands', 'Flash Gordon', 'Gattaca', 'Gravity', 'The Hills Have Eyes', "The Hitchhiker's Guide to the Galaxy", 'Raiders of the Lost Ark', 'Indiana Jones and the Temple of Doom', 'Interstellar', 'The Island', 'Legion', 'Life of Pi', 'Lost in Space', 'Mission to Mars', 'The Postman', 'Prometheus', 'The Road', "Schindler's List", 'Signs', 'Starship Troopers', 'The Theory of Everything', 'The Thing', 'Year One', 'Escape from the Planet of the Apes', 'In Bruges', 'Piranha 3D', 'Resident Evil: Extinction', 'Skyline', 'The Mothman Prophecies', 'The Time Machine', 'Planet of the Apes', 'Beneath the Planet of the Apes', 'Children of Men', 'Close Encounters of the Third Kind', 'Cloud Atlas', "Ender's Game", 'The English Patient', 'The Island of Dr. Moreau', 'Out of Africa', 'Seven Years in Tibet', 'Space Cowboys', 'Battlefield Earth', 'Elysium']

Cluster  11
['The Running Man', 'The American', 'Assassins', 'Blade Runner', 'Crank', 'The Crow', 'The Dark Knight Rises', 'Deadpool', 'Django Unchained', 'Flight', 'Gangs of New York', 'Get Carter', 'The Ghost and the Darkness', 'Gladiator', 'Good Will Hunting', 'JFK', 'The Life of David Gale', 'Machete', 'Meet Joe Black', 'Ninja Assassin', 'No Country for Old Men', 'Phone Booth', "Pirates of the Caribbean: Dead Man's Chest", 'The Revenant', 'Robin Hood: Prince of Thieves', 'Saw', 'Scream 2', 'Scream 3', 'Se7en', 'Suspect Zero', 'Wanted', 'Watchmen', 'The Punisher', 'Urban Legend', 'Zodiac', '3:10 to Yuma', 'Hero', 'Kill Bill: Vol. 2', 'Kill Bill: Vol. 1', "Monster's Ball", 'Vantage Point', 'Batman & Robin', 'Batman Begins', 'Edge of Tomorrow']

Cluster  12
['The Crying Game', 'The Constant Gardener', 'A Few Good Men', 'Barry Lyndon', 'The Bodyguard', 'Cradle 2 the Grave', 'Fair Game', 'The Fugitive', 'Liar Liar', 'Lone Star', 'Man on Fire', 'The Men Who Stare at Goats', 'Mission: Impossible', 'Out of Sight', 'Sherlock Holmes', 'Sicario', 'The Silence of the Lambs', 'Skyfall', 'Source Code', 'Tombstone', 'Whiteout', 'The Bourne Supremacy', 'From Hell', 'Payback', 'Salt', 'Thank You for Smoking', 'The Guard', 'The Jackal', 'Unforgiven', 'White House Down', '16 Blocks', 'The Adventures of Ford Fairlane', 'Enemy of the State', 'Goldfinger', 'I Think I Love My Wife', 'In the Valley of Elah', 'John Q', 'Minority Report', 'Mission: Impossible II', 'Street Kings', 'Prisoners', 'Training Day', 'Fast Five']

Cluster  13
['127 Hours', 'The Adjustment Bureau', 'Amadeus', 'American Beauty', 'Big Fish', 'Big', 'Boyhood', 'The Butterfly Effect', 'The Curious Case of Benjamin Button', 'Dark City', 'Forrest Gump', 'The Game', 'Ghost Rider', 'Her', 'Hot Tub Time Machine', 'How to Lose Friends & Alienate People', 'The Invention of Lying', 'The Jacket', 'Larry Crowne', 'The Perks of Being a Wallflower', 'Rise of the Planet of the Apes', 'Room', 'The Secret Life of Walter Mitty', 'Unbreakable', 'Up', 'A Christmas Carol', 'About Time', 'Back to the Future Part II', 'Click', 'The Village', 'Barbershop', 'Elf', 'Frequency', 'Hook', 'Hustle & Flow', "Mr. Holland's Opus", 'Rent', 'Stranger Than Fiction', 'A Christmas Story', 'Back to the Future']

Cluster  14
['The Big Lebowski', 'The Life Aquatic with Steve Zissou', '42', 'A Serious Man', 'American Hustle', 'As Good as It Gets', 'Being John Malkovich', 'Burn After Reading', 'The Devil Wears Prada', 'The Doors', 'The Elephant Man', 'Fast Times at Ridgemont High', 'Funny People', 'Get on Up', 'Groundhog Day', 'Harold & Kumar Go to White Castle', 'The Insider', 'Jay and Silent Bob Strike Back', 'Jerry Maguire', 'Man on the Moon', 'Moneyball', 'My Week with Marilyn', 'The Pianist', 'The Producers', 'Syriana', 'The Ugly Truth', 'Up in the Air', 'Cop Land', 'Diary of a Wimpy Kid', 'Sausage Party', 'Scrooged', 'Last Action Hero', 'Looney Tunes: Back in Action', 'Quartet', 'All That Jazz']

Cluster  15
['15 Minutes', '30 Minutes or Less', 'Absolute Power', 'Bad Santa', 'Black Rain', 'Cellular', 'The Departed', 'Die Hard', 'Donnie Brasco', 'Drive', 'Eagle Eye', 'Entrapment', 'From Dusk Till Dawn', 'Hostage', 'Insomnia', 'The Italian Job', 'Jackie Brown', 'L.A. Confidential', 'Law Abiding Citizen', 'Looper', 'Max Payne', 'Mirrors', "Ocean's Eleven", 'Office Space', 'Pineapple Express', 'Reindeer Games', 'The Relic', 'Ronin', 'Snatch', 'Identity', 'Ishtar', 'Taken', 'The Town', 'Tower Heist', 'Collateral', 'Fun with Dick and Jane', 'Inside Man', 'The Place Beyond the Pines', 'Seven Psychopaths', 'Speed', 'True Romance']

Cluster  16
['Chill Factor', 'Airplane!', 'Argo', 'The Boondock Saints II: All Saints Day', 'Collateral Damage', 'Die Hard 2', 'Do the Right Thing', 'Escape from L.A.', 'Escape from New York', 'Face/Off', 'Jaws 2', 'Jaws', 'The Kingdom', 'Men in Black', 'Oblivion', 'Predator', 'Rush Hour 2', 'The Siege', 'Thirteen Days', 'Thunderbirds', 'Tomorrow Never Dies', 'Unknown', 'Con Air', 'Predator 2', 'Rendition', 'The Expendables', 'The Happening', 'Black Hawk Down', 'Air Force One', 'Conspiracy Theory', 'Executive Decision', 'Good Night, and Good Luck.', 'Munich', 'Olympus Has Fallen', 'Outbreak', 'Resident Evil', 'The X Files', 'Bridge of Spies', 'Chain Reaction']

Cluster  17
['17 Again', 'American History X', 'The Blind Side', 'The Bounty Hunter', 'Breakdown', 'The Cable Guy', 'Cast Away', "Cirque du Freak: The Vampire's Assistant", 'Disturbia', 'Extract', 'I Love You Phillip Morris', 'Milk', 'Next Friday', 'Observe and Report', 'Semi-Pro', "She's Out of My League", 'War of the Worlds', 'Wild Things', 'The Wrestler', 'Days of Thunder', '8 Mile', 'Ace Ventura: Pet Detective', 'Anchorman: The Legend of Ron Burgundy', 'The Girl Next Door', 'Elizabethtown', 'The Karate Kid', 'Little Black Book', 'Mean Girls', 'What Women Want', 'Any Given Sunday', 'Crazy Heart']

Cluster  18
['Halloween II', 'The American President', 'Anna Karenina', 'Cold Mountain', 'Crazy, Stupid, Love.', 'The Descendants', 'Garden State', "It's Complicated", 'Lost in Translation', 'Major League', 'The Proposal', 'Silver Linings Playbook', 'Walking Tall', 'Yes Man', 'Scream 4', 'The Bridges of Madison County', 'Young Adult', 'About Schmidt', 'Cheaper by the Dozen', 'The Best Exotic Marigold Hotel']

Cluster  19
['American Psycho', 'Arbitrage', "The Devil's Advocate", 'Glengarry Glen Ross', 'The Good Girl', 'Inception', 'The Informant!', 'The Lincoln Lawyer', 'The Mechanic', 'Philadelphia', 'The Social Network', 'The Verdict', 'Wall Street: Money Never Sleeps', 'Wall Street', 'The Wolf of Wall Street', 'Repo Men', 'The Apartment', 'Boiler Room', 'Casino Royale', 'Network']

Cluster  20
['8MM', 'Antitrust', 'Commando', 'Fight Club', 'Gamer', 'The Girl with the Dragon Tattoo', 'Margin Call', 'The Matrix Reloaded', 'The Matrix', 'Point Break', 'Surrogates', 'Swordfish', 'Total Recall', 'Live Free or Die Hard', 'The Matrix Revolutions', 'WarGames', 'The Thirteenth Floor', 'From Russia with Love', 'Hitman', 'The Ninth Gate', 'You Only Live Twice', 'The Saint', 'Untraceable']

Cluster  21
['An Education', 'Annie Hall', 'Big Eyes', 'Black Swan', 'Bridesmaids', 'Burlesque', "Charlie's Angels", 'Drop Dead Gorgeous', 'Enough', 'The Help', 'The Imaginarium of Doctor Parnassus', 'The Kids Are All Right', 'Legally Blonde', 'Notting Hill', 'Pretty Woman', 'Rush', 'Saving Mr. Banks', 'Shakespeare in Love', 'Titanic', 'The Tourist', 'Secretariat', 'The House Bunny', 'The Sound of Music', "The Time Traveler's Wife", 'The Young Victoria', 'AmÃ©lie', 'Gosford Park', 'Almost Famous', 'Girl with a Pearl Earring', 'Jaws: The Revenge', 'Little Miss Sunshine', 'A Mighty Heart', 'Some Like It Hot', "Coal Miner's Daughter", 'For Colored Girls']

Cluster  22
['Jason Goes to Hell: The Final Friday', 'Friday the 13th Part 2', 'Friday the 13th Part VI: Jason Lives', 'Friday the 13th: The Final Chapter', 'The Bourne Identity', 'The Bourne Ultimatum', 'Buried', 'Cliffhanger', "National Lampoon's Vacation", 'Friday the 13th Part VIII: Jason Takes Manhattan', 'Ghost', 'The Green Mile', 'I Still Know What You Did Last Summer', 'The Martian', 'The Master', 'The Reader', 'Saving Private Ryan', 'Sling Blade', 'Robin Hood', "The General's Daughter", 'Halloween: The Curse of Michael Myers', 'Dear John', 'Secret Window', 'Sweet November']

Cluster  23
['The Godfather', 'The Godfather: Part II', 'Stigmata', 'American Gangster', 'Analyze That', 'The Fighter', 'The French Connection', 'From Here to Eternity', 'Only God Forgives', 'Pulp Fiction', 'Raging Bull', 'Rocky', 'Scarface', 'The Shawshank Redemption', 'The Talented Mr. Ripley', 'War Horse', 'Warrior', 'We Own the Night', 'Born on the Fourth of July', "The Devil's Own", 'Million Dollar Baby', 'Rocky Balboa']

Cluster  24
['Napoleon Dynamite', '10 Things I Hate About You', 'All About Steve', 'American Graffiti', "He's Just Not That Into You", 'No Strings Attached', 'Superbad', 'Tin Cup', 'Sex Drive', 'The Switch', "Valentine's Day", 'Notes on a Scandal', 'Before Sunset', "There's Something About Mary", 'Before Midnight']

Cluster  25
['Spider-Man', 'Austin Powers: International Man of Mystery', 'Hancock', 'I, Robot', 'The Lego Movie', "Logan's Run", 'Megamind', 'Men in Black 3', 'Small Soldiers', 'Star Wars: Episode II - Attack of the Clones', 'Star Wars: Episode I - The Phantom Menace', 'Terminator 2: Judgment Day', 'Terminator Salvation', 'The Terminator', 'After Earth', 'Green Lantern', 'Terminator 3: Rise of the Machines', 'X-Men Origins: Wolverine', 'K-PAX', 'RoboCop', 'Star Wars: Episode III - Revenge of the Sith', 'Superman II', 'Superman III', 'Superman IV: The Quest for Peace', 'Superman']

Cluster  26
['The Lost World: Jurassic Park', 'Jurassic Park', 'Lake Placid', 'Jurassic World', 'Night at the Museum', 'Super Mario Bros.', 'Monsters, Inc.', 'Doctor Zhivago', 'Jurassic Park III']

Cluster  27
['Angel Eyes', 'Babel', 'Crash', 'House of 1000 Corpses', 'Intolerable Cruelty', 'Magnolia', 'Marley & Me', 'Midnight in Paris', 'Pride & Prejudice', 'Revolutionary Road', 'Romeo + Juliet', 'Sense and Sensibility', 'Spanglish', '21 Grams', 'Another Year', 'The Horse Whisperer', 'Casablanca', 'The Fisher King', 'Little Children', 'Memoirs of a Geisha', 'A Perfect Getaway', 'Vicky Cristina Barcelona', "Rosemary's Baby", 'Far from Heaven']

Cluster  28
['A Most Violent Year', 'Alone in the Dark', 'Analyze This', 'Boogie Nights', 'Bruce Almighty', 'Casino', 'Catch Me If You Can', 'Dallas Buyers Club', 'Fear and Loathing in Las Vegas', 'Fruitvale Station', 'Go', 'The Great Gatsby', 'Grosse Pointe Blank', 'Into the Wild', 'Knocked Up', 'Les MisÃ©rables', 'Midnight Cowboy', 'Nine', 'Taxi Driver', 'Trainspotting', 'Vanilla Sky', 'The Terminal', 'This Is the End', '25th Hour', 'Blast from the Past', 'The Diving Bell and the Butterfly', "Get Rich or Die Tryin'", 'A Good Year', 'The Lost Weekend', 'Leaving Las Vegas']

Cluster  29
['Spaceballs', 'Event Horizon', 'Pitch Black', 'Star Trek: First Contact', 'Star Trek: Generations', 'Star Trek II: The Wrath of Khan', 'Star Trek: Nemesis', 'Star Trek', 'Planet 51', 'Star Trek VI: The Undiscovered Country', 'Independence Day', 'Star Trek III: The Search for Spock', 'Star Trek IV: The Voyage Home', 'Star Trek V: The Final Frontier', 'Star Trek: Insurrection', 'Star Trek: The Motion Picture']

Cluster  30
['Braveheart', 'The Last of the Mohicans', 'The Messenger: The Story of Joan of Arc', 'Anonymous', 'Dead Poets Society', 'Dogma', 'Elizabeth: The Golden Age', 'Gandhi', 'G.I. Jane', 'Invictus', "The King's Speech", 'Lincoln', 'The Patriot', 'The Queen', 'Traffic', '28 Weeks Later', 'Exorcist: The Beginning', 'Lawrence of Arabia', 'Letters from Iwo Jima', 'Troy', 'V for Vendetta', 'The Birth of a Nation']


from collections import defaultdict

mappings = defaultdict(dict)

for i in range(len(clustered_sentences)):
    for movie in clustered_sentences[i]:
        for label in yLabels:
            mappings[label][i] = mappings[label].get(i, 0) + data[movie][label]
    for label in yLabels:
        mappings[label][i] = mappings[label][i]/len(clustered_sentences[i])


for label in yLabels:
    print(f"{label}")
    print({k: v for k, v in sorted(mappings[label].items(), key=lambda item: item[1], reverse=True)})

imdb_rating
{22: 7.2272727272727275, 29: 6.954545454545454, 18: 6.95, 26: 6.75, 20: 6.6571428571428575, 14: 6.585365853658536, 10: 6.568181818181818, 12: 6.55, 0: 6.533333333333333, 5: 6.5227272727272725, 19: 6.521739130434782, 11: 6.511627906976744, 13: 6.485714285714286, 27: 6.466666666666667, 17: 6.45, 28: 6.4375, 2: 6.363636363636363, 6: 6.346153846153846, 23: 6.333333333333333, 25: 6.333333333333333, 21: 6.291666666666667, 24: 6.24, 15: 6.153846153846154, 9: 6.017857142857143, 16: 6.0, 1: 5.962962962962963, 8: 5.9, 4: 5.888888888888889, 7: 5.608695652173913, 3: 5.555555555555555}
metascore
{22: 71.77272727272727, 29: 70.31818181818181, 18: 69.1, 26: 68.0, 13: 65.4, 20: 65.4, 17: 63.55, 23: 62.8, 27: 61.03333333333333, 6: 60.46153846153846, 5: 60.45454545454545, 12: 59.825, 2: 59.22727272727273, 11: 58.93023255813954, 8: 57.3, 14: 56.53658536585366, 28: 56.3125, 0: 55.8, 24: 55.76, 9: 55.375, 1: 55.2037037037037, 15: 54.17948717948718, 10: 53.45454545454545, 16: 53.45161290322581, 19: 52.30434782608695, 25: 50.77777777777778, 21: 49.625, 3: 49.22222222222222, 7: 48.73913043478261, 4: 47.851851851851855}
imdb_votes
{22: 515949.7272727273, 10: 424739.5227272727, 25: 376706.77777777775, 24: 364022.88, 5: 362801.0681818182, 12: 340546.85, 18: 338513.45, 19: 321424.9130434783, 0: 297774.13333333336, 9: 296324.28571428574, 29: 272464.7727272727, 14: 268751.46341463417, 27: 260319.76666666666, 2: 255879.06818181818, 21: 252417.625, 11: 232421.97674418605, 6: 222940.65384615384, 20: 217075.25714285715, 13: 196429.34285714285, 15: 194200.10256410256, 17: 187720.85, 16: 183627.7741935484, 23: 171595.66666666666, 1: 161614.37037037036, 26: 160571.625, 3: 160286.88888888888, 28: 156789.125, 4: 156419.44444444444, 7: 149241.60869565216, 8: 125400.95}
awards
{0: 1.8, 5: 1.6136363636363635, 24: 1.48, 29: 1.4090909090909092, 15: 1.3846153846153846, 9: 1.375, 21: 1.375, 26: 1.375, 28: 1.375, 16: 1.3548387096774193, 25: 1.3333333333333333, 20: 1.3142857142857143, 18: 1.3, 27: 1.3, 4: 1.2962962962962963, 10: 1.2727272727272727, 7: 1.2608695652173914, 11: 1.2093023255813953, 2: 1.2045454545454546, 12: 1.175, 8: 1.15, 1: 1.1481481481481481, 14: 1.146341463414634, 19: 1.1304347826086956, 6: 1.1153846153846154, 13: 1.0285714285714285, 3: 1.0, 17: 1.0, 22: 1.0, 23: 1.0}
revenue
{25: 484269498.0, 0: 363300703.6666667, 5: 359467640.5, 24: 355977710.12, 2: 225819281.5, 10: 219592406.0, 9: 202689694.0357143, 6: 193954937.3846154, 11: 191145326.4651163, 20: 174853658.68571427, 12: 169447673.375, 19: 166906361.6956522, 21: 164709107.875, 15: 161173959.3846154, 28: 156486793.0625, 18: 149940884.95, 29: 141779915.04545453, 8: 140247419.55, 3: 136053405.1111111, 17: 133302507.7, 4: 126560091.8888889, 16: 119018465.5483871, 14: 111575817.92682926, 27: 110594605.1, 22: 106671979.9090909, 7: 100859043.6521739, 23: 100777022.4, 13: 96225646.51428571, 1: 95796662.4074074, 26: 85588361.41666667}
popularity_score
{25: 73.13332877777779, 10: 60.13670738636363, 0: 59.66313553333334, 5: 59.543805613636344, 9: 54.64372623214286, 22: 51.05131840909092, 24: 50.51996491999999, 12: 44.20950965, 19: 42.762771304347815, 2: 42.76051827272727, 6: 42.55347638461538, 18: 40.282002799999994, 21: 37.529143375, 4: 37.10236970370371, 14: 36.50008470731707, 15: 34.89498387179487, 27: 34.42673523333334, 11: 34.29388260465117, 29: 33.43361127272727, 3: 31.32293883333333, 7: 30.41337647826087, 1: 29.85630872222223, 20: 29.72496308571429, 13: 28.964541828571424, 16: 28.11072774193549, 28: 28.00309025, 17: 27.899624450000005, 23: 26.56532166666667, 26: 23.837679458333337, 8: 21.318576399999998}


rmse = 0

import math
import numpy as np
from collections import defaultdict

pred = defaultdict(list)

fields = defaultdict(int)

for i in x_test:
    movie_encoding = embedder.encode([data[i]['overview']])[0]
    cluster = clustering_model.predict([movie_encoding])[0]
    for label in yLabels:
        fields[label] += math.sqrt((mappings[label][cluster] - data[i][label])**2 / len(x_test))
        pred[label].append(mappings[label][cluster])

for label in fields:
    print(f"{label} NRMSE")
    print(fields[label] / (max([data[i][label] for i in data]) - min([data[i][label] for i in data])))
    print("\n")

imdb_rating NRMSE
1.2036487916723635


metascore NRMSE
2.572173657582724


imdb_votes NRMSE
1.1581800552443429


awards NRMSE
1.2196819745150511


revenue NRMSE
0.8028939854929531


popularity_score NRMSE
0.4950142758816381


import matplotlib.pyplot as plt

line = plt.plot([data[i]['revenue'] for i in x_test]) #[i for i in range(len(x_test))],
plt.plot(pred['revenue'], color="red") #[i for i in range(len(x_test))]
plt.show()


line = plt.plot([i for i in range(len(x_test))], [data[i]['imdb_rating'] for i in x_test])
plt.plot([i for i in range(len(x_test))], [pred['imdb_rating'][i] for i in range(len(x_test))], color="red")
plt.show()


line = plt.plot([i for i in range(len(x_test))], [data[i]['popularity_score'] for i in x_test])
plt.plot([i for i in range(len(x_test))], pred['popularity_score'], color="red")
plt.show()


line = plt.plot([i for i in range(len(x_test))], [data[i]['metascore'] for i in x_test])
plt.plot([i for i in range(len(x_test))], [pred['metascore'][i] for i in range(len(x_test))], color="red")
plt.show()


line = plt.plot([i for i in range(len(x_test))], [data[i]['awards'] for i in x_test])
plt.plot([i for i in range(len(x_test))], [pred['awards'][i] for i in range(len(x_test))], color="red")
plt.show()


line = plt.plot([i for i in range(len(x_test))], [data[i]['imdb_votes'] for i in x_test])
plt.plot([i for i in range(len(x_test))], [pred['imdb_votes'][i] for i in range(len(x_test))], color="red")
plt.show()