Movie Predictor
To predict the success of the movie, we will be doing PCA and linear regression.
Getting data from kaggle dataset movie_meta_data.csv
from src.dataUtil.parseData import splitArray
from src.dataUtil.dataReader import queryData, queryMutableData
%load_ext autoreload
%autoreload 2
"""
Data queried is a dictionary with all the necessary fields stored in arrays
"""
# Template of the CSV columns of interest: every column name maps to its own
# empty list.
dataQueried = {
    column: []
    for column in (
        'imdb user rating',
        'metascore',
        'number of imdb user votes',
        'producers',
        'awards',
    )
}

# Load the Kaggle movie metadata. queryData receives only the CSV path here;
# the dataQueried template is the one handed to queryMutableData.
data = queryData("movie_corpus_kaggle_dataset/movie_meta_data.csv")
"""
To get the number of awards, run the function below
"""
def getRewardsNum(dataQueried):
    """Run splitArray over the 'awards' column with a single-space delimiter.

    The notebook describes the result as the number of awards; the exact
    return shape is whatever splitArray produces.
    """
    awards_column = dataQueried['awards']
    return splitArray(awards_column, " ")
"""
To get number of producers, run the function below
"""
def getProducersNum(dataQueried):
    """Run splitArray over the 'producers' column with ", " as the delimiter.

    The notebook describes the result as the number of producers; the exact
    return shape is whatever splitArray produces.
    """
    producers_column = dataQueried['producers']
    return splitArray(producers_column, ", ")
def parseQueriedData(data):
    """Return a parsed copy of a two-level mapping of raw record fields.

    For every outer key, each inner field is converted:
    'producers' is passed to splitArray with ", " as the delimiter,
    'awards' is passed to splitArray with " " as the delimiter, and
    every other field is cast to float. The input mapping is not modified.
    """
    delimiters = {'producers': ", ", 'awards': " "}
    parsed = {}
    for name, record in data.items():
        converted = {}
        for field, raw in record.items():
            if field in delimiters:
                converted[field] = splitArray(raw, delimiters[field])
            else:
                converted[field] = float(raw)
        parsed[name] = converted
    return parsed
# Query the same CSV again, this time keyed by title and restricted to the
# columns named in dataQueried, then convert the raw fields.
raw_records = queryMutableData(
    "movie_corpus_kaggle_dataset/movie_meta_data.csv",
    dataQueried,
    key='title',
)
new_data = parseQueriedData(raw_records)

# Show one record of the dictionary returned by queryData.
print(data['A Night at the Roxbury'])
{'imdb_rating': 6, 'metascore': 26, 'imdb_votes': 56537, 'producers': 6, 'awards': 1}
Get Data from TMDB 5000 dataset:
import pickle
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as stream:
        return pickle.load(stream)


# The three lookup tables used below: per-movie TMDB data plus per-actor and
# per-director entries.
tmdb_data = _load_pickle('data/tmdb_5000_dataset/tmdb_data.pickle')
actor_data = _load_pickle('data/revenue/actor_data.pickle')
director_data = _load_pickle('data/revenue/director_data.pickle')

# One sample entry from each table.
print(tmdb_data['10 Days in a Madhouse'])
print(actor_data['Aamir Khan'])
print(director_data['Christopher Nolan'])
{'movie_id': '345003', 'movie_title': '10 Days in a Madhouse', 'star_cast': ['Caroline Barry', 'Christopher Lambert', 'Kelly LeBrock', 'Julia Chantrey', 'Alexandra Callas', 'Natalia Davidenko', 'Katie Singleton', 'Jessa Campbell', 'Andi Morrow', 'Everette Scott Ortiz', 'Saskia Larsen', 'Talya Mar', 'Susan Goforth', 'David Lee Garver', 'Bob Olin', 'Darlene Sellers', 'Monique Robbins', 'Kaitlin Otoole', 'Darrell Salk', 'Gwyn LaRee', 'Michael Swanson', 'Corrina Cornforth'], 'budget': 1200000, 'overview': "Nellie Bly, a 23 year-old reporter for Joseph Pulitzer, goes undercover in the notorious Blackwell's Island women's insane asylum in order to expose corruption, abuse and murder.", 'popularity_score': 0.489271, 'production_companies': [], 'revenue': 0} ['4', '$106,716,335'] ['13', '$381,193,202']
Join/Merge both Datasets:
# Tallies reported once the merge is finished.
star_count = 0  # cast members seen across all matched movies
in_db = 0       # cast members that also have an entry in actor_data
w_star = 0      # matched movies for which no cast member yielded a value
movies = 0      # titles present in both datasets

# Walk a shallow copy so that entries can be deleted from `data` on the way.
temp_data = data.copy()
for title in temp_data:
    if title not in tmdb_data:
        del data[title]
        continue

    movies += 1
    tmdb_entry = tmdb_data[title]
    record = data[title]
    # Insertion order matters: later cells derive label order from these keys.
    for field in ('budget', 'revenue', 'popularity_score', 'overview'):
        record[field] = tmdb_entry[field]

    highest_star = 0
    for star in tmdb_entry['star_cast']:
        star_count += 1
        if star not in actor_data:
            continue
        in_db += 1
        stats = actor_data[star]
        # stats looks like ['4', '$106,716,335']: presumably a film count and
        # a dollar total -- verify against the actor dataset.
        dollars = int(stats[1].replace(',', '').replace('$', ''))
        film_count = int(stats[0])
        highest_star = int(max(highest_star, dollars / film_count))

    if highest_star == 0:
        # No cast member contributed a value: the movie is unusable.
        w_star += 1
        del data[title]
    else:
        record['highest_star_revenue'] = highest_star

print(f"Number of movies in both datasets:{movies}")
print(f"Stars in movie dataset:{star_count}\nSubset of stars present in actor dataset:{in_db}")
print(f"Number of movies without a single actor in dataset: {w_star}")
print(f"Usable data : {len(data)}")
Number of movies in both datasets:1356 Stars in movie dataset:39216 Subset of stars present in actor dataset:11665 Number of movies without a single actor in dataset: 9 Usable data : 1347
data: X fields - producers, budget, highest_star_revenue Y fields - imdb_rating, metascore, imdb_votes, awards, popularity_score, revenue
Printing Data:
# Spot-check one merged record: it should now carry the TMDB fields and
# 'highest_star_revenue' next to the original Kaggle fields.
print(data['2012'])
{'imdb_rating': 5, 'metascore': 49, 'imdb_votes': 350359, 'producers': 10, 'awards': 1, 'budget': 200000000, 'revenue': 769653595, 'popularity_score': 45.274225, 'overview': 'Dr. Adrian Helmsley, part of a worldwide geophysical team investigating the effect on the earth of radiation from unprecedented solar storms, learns that the earth\'s core is heating up. He warns U.S. President Thomas Wilson that the crust of the earth is becoming unstable and that without proper preparations for saving a fraction of the world\'s population, the entire race is doomed. Meanwhile, writer Jackson Curtis stumbles on the same information. While the world\'s leaders race to build "arks" to escape the impending cataclysm, Curtis struggles to find a way to save his family. Meanwhile, volcanic eruptions and earthquakes of unprecedented strength wreak havoc around the world.', 'highest_star_revenue': 83508259}
Reducing and Returning Features
The below code block applies PCA onto our data set by reducing 3 features to 2 to find out which Y feature is the easiest factor to predict in movie success.
import numpy as np
from src.PCA import PCA
# Freeze the iteration order of the merged records.
movieNames = list(data)
dataArr = [data[name] for name in movieNames]

# Inputs are the three X columns; every remaining key except the free-text
# overview becomes a Y column.
xLabels = ['producers', 'budget', 'highest_star_revenue']
yLabels = [
    key
    for key in dataArr[0]
    if not (key in xLabels or key == 'overview')
]

xFeatures = np.array([[row[label] for label in xLabels] for row in dataArr])
yFeatures = np.array([[row[label] for label in yLabels] for row in dataArr])
print(yLabels)
print(yFeatures)

print("3 features:")
print(xFeatures[:3])

# Project the three X columns onto two components.
reducedXFeatures, xModel = PCA.reduceDimension(xFeatures, 2)
print()
print("2 reduced features:")
print(reducedXFeatures[:3])

# Map the projection back to three columns to see how much was lost.
print()
print("3 features (approximated from reduced features):")
restored = PCA.returnDimension(reducedXFeatures, xModel)
print(restored[:3])
['imdb_rating', 'metascore', 'imdb_votes', 'awards', 'revenue', 'popularity_score'] [[6.00000000e+00 2.60000000e+01 5.65370000e+04 1.00000000e+00 3.03311650e+07 1.27092270e+01] [6.00000000e+00 4.00000000e+01 1.29220000e+04 1.00000000e+00 0.00000000e+00 5.77872400e+00] [8.00000000e+00 6.90000000e+01 1.25114400e+06 1.00000000e+00 1.51955791e+09 1.44448633e+02] ... [6.00000000e+00 4.20000000e+01 2.15514000e+05 1.00000000e+00 1.03039258e+08 5.89913880e+01] [7.00000000e+00 6.10000000e+01 5.92815000e+05 4.00000000e+00 7.09709780e+08 2.54684930e+01] [6.00000000e+00 3.60000000e+01 5.18710000e+04 1.00000000e+00 3.42272980e+07 2.48211380e+01]] 3 features: [[ 6 17000000 27508687] [ 3 0 89620264] [ 8 220000000 70613951]] 2 reduced features: [[-5.74412908e+07 -1.95736152e+07] [ 2.53931357e+06 -4.30081003e+07] [ 6.76287806e+06 1.77770978e+08]] 3 features (approximated from reduced features): [[7.27982453e+00 1.70000000e+07 2.75086870e+07] [7.05731520e+00 5.21540642e-08 8.96202640e+07] [9.32942022e+00 2.20000000e+08 7.06139510e+07]]
Now we need to find which Y feature is the easiest to predict from our X features. We will use the linear regression R^2 score from sklearn as a quick check of which feature in Y is the easiest factor to predict in movie success. We will test the data with and without PCA in order to make sure that this dimensionality reduction does not impact our indicator's predictability.
#import numpy as np
#from sklearn.linear_model import LinearRegression
from src.linearRegression import linearRegression
#REDUCED FEATURE LEARNING (Vittorio Corbo 2022 [brought to you from Tokyo japan, kawai XD])
# Inputs are the two PCA components; targets are all Y columns.
X, y = reducedXFeatures, yFeatures

# One fit against every target column at once.
print("global learning:")
reg = linearRegression.linearRegression(X, y)
print()

# One fit per individual target column.
for col in range(y.shape[1]):
    print("label:", col, yLabels[col])
    y_prime = yFeatures[:, col]
    reg = linearRegression.linearRegression(X, y_prime)
    print()
global learning: R^2 score: 0.14801742877413732 label: 0 imdb_rating R^2 score: 0.0031848902798871093 label: 1 metascore R^2 score: 0.008595016025802593 label: 2 imdb_votes R^2 score: 0.11691434914755117 label: 3 awards R^2 score: 0.07466758239374383 label: 4 revenue R^2 score: 0.5078484708793156 label: 5 popularity_score R^2 score: 0.1768942639184985
Since revenue has the highest score, revenue looks to be the easiest factor to predict in movie success for the given y values. By best indicator of success, we mean the Y feature that our X features predict most accurately.
The below code block tests the dataset without PCA to find out which Y feature is the best indicator of a movie's success.
# Same experiment as with the PCA components, but on the three original
# X columns.
X, y = xFeatures, yFeatures

# One fit against every target column at once.
print("global learning:")
reg = linearRegression.linearRegression(X, y)
print()

# One fit per individual target column.
for col in range(y.shape[1]):
    print("label:", col, yLabels[col])
    y_prime = yFeatures[:, col]
    reg = linearRegression.linearRegression(X, y_prime)
    print()
global learning: R^2 score: 0.15004604407682512 label: 0 imdb_rating R^2 score: 0.0033368656498697913 label: 1 metascore R^2 score: 0.011772564300144506 label: 2 imdb_votes R^2 score: 0.11787092665256937 label: 3 awards R^2 score: 0.07617886955216091 label: 4 revenue R^2 score: 0.5116216387074242 label: 5 popularity_score R^2 score: 0.1794953995987596
Again, revenue has the highest score, so revenue looks to be the easiest factor to predict in movie success for the given y values. Therefore, we will be proceeding with movie revenue as our output (y-axis).
We now want a model that explores the relationship between our xfeatures, producers, budget, and highest star revenue, and the movie's revenue. We would want to know for any given x feature, how much movie revenue it would produce.
For this we will be doing linear regression to find how accurate and strong the relationship between each of the X features and movie revenue is, and how much of each X feature's input is necessary to produce a given revenue.
To test the accuracy of each relationship, we will be using the R^2 score (defined in the regression object's score method), the normalized RMSE, and the MSE. We will reserve the first 10% of the data set as the test data set and the rest as the training data set.
First, we will be testing number of producers.
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
# Regress revenue on the number of producers alone.
# X holds the three features reconstructed from the 2-component projection.
X = PCA.returnDimension(reducedXFeatures, xModel)
y = yFeatures
N = np.shape(X)[0]
numTest = int(0.1 * N)  # the first 10% of the rows are held out for testing

# Train on every remaining row. The slice used to begin at numTest + 1, which
# left row numTest out of both the training set and the test set.
# Column 0 of X is 'producers'; column 4 of y is 'revenue'.
reg = linearRegression.linearRegression(X[numTest:, 0:1], y[numTest:, 4])
pred = reg.predict(X[:numTest, 0:1])

# RMSE is taken as sqrt(MSE) so the cell does not rely on the `squared`
# keyword of mean_squared_error, which recent scikit-learn releases deprecate.
mse = metrics.mean_squared_error(y[:numTest, 4], pred)
rmse = np.sqrt(mse)
# NOTE(review): the RMSE is scaled by the spread of the predictions rather
# than of the observed revenues -- confirm this is the intended normalisation.
print("Normalized RMSE:", rmse / (np.max(pred) - np.min(pred)))
print("Mean Squared Error:", mse)
print("Average revenue per producers", reg.coef_)

# Both artists are labelled so that plt.legend() has entries to display.
line = plt.plot(X[:numTest, 0:1], pred, label="predicted revenue (test rows)")
thing = plt.scatter(
    X[numTest:, 0:1],
    y[numTest:, 4],
    color="green",
    label="actual revenue (training rows)",
)
plt.legend()
plt.show()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
R^2 score: 0.4236563610684021 Normalized RMSE: 0.29445077965965505 Mean Squared Error: 5.4919648213861304e+16 Average revenue per producers [2.99456329e+08]
Now we will do the budget feature in X
# Regress revenue on the budget alone (column 1 of X; column 4 of y is
# 'revenue'). X, y and numTest come from the producers cell.
# The training slice used to begin at numTest + 1, which left row numTest out
# of both the training set and the test set.
reg = linearRegression.linearRegression(X[numTest:, 1:2], y[numTest:, 4])
pred = reg.predict(X[:numTest, 1:2])

# RMSE is taken as sqrt(MSE) so the cell does not rely on the `squared`
# keyword of mean_squared_error, which recent scikit-learn releases deprecate.
mse = metrics.mean_squared_error(y[:numTest, 4], pred)
rmse = np.sqrt(mse)
# NOTE(review): normalised by the spread of the predictions, not of the
# observed revenues -- confirm this is intended.
print("Normalized RMSE:", rmse / (np.max(pred) - np.min(pred)))
print("Mean Squared Error:", mse)
print("Average number of revenue per $1 in budget", reg.coef_)

# Fitted line over the test rows, drawn on top of the training rows.
lines = plt.plot(X[:numTest, 1:2], pred)
plt.scatter(X[numTest:, 1:2], y[numTest:, 4], color="green")
plt.show()
R^2 score: 0.4912014598650998 Normalized RMSE: 0.290780439097314 Mean Squared Error: 4.779225354820937e+16 Average number of revenue per $1 in budget [3.17223404]
Finally, we will do the highest paid actor in a movie.
# Regress revenue on 'highest_star_revenue' alone (last column of X; column 4
# of y is 'revenue'). X, y and numTest come from the producers cell.
# The training slice used to begin at numTest + 1, which left row numTest out
# of both the training set and the test set.
reg = linearRegression.linearRegression(X[numTest:, 2:], y[numTest:, 4])
pred = reg.predict(X[:numTest, 2:])

# RMSE is taken as sqrt(MSE) so the cell does not rely on the `squared`
# keyword of mean_squared_error, which recent scikit-learn releases deprecate.
mse = metrics.mean_squared_error(y[:numTest, 4], pred)
rmse = np.sqrt(mse)
# NOTE(review): normalised by the spread of the predictions, not of the
# observed revenues -- confirm this is intended.
print("Normalized RMSE:", rmse / (np.max(pred) - np.min(pred)))
print("Mean Squared Error:", mse)
print("Ratio of Revenue earned: highest Paid Actor", reg.coef_)

# Fitted line over the test rows, drawn on top of the training rows.
plt.plot(X[:numTest, 2:], pred, color="green")
plt.scatter(X[numTest:, 2:], y[numTest:, 4])
plt.show()
R^2 score: 0.10973223694468126 Normalized RMSE: 0.6573184514449396 Mean Squared Error: 7.482833295461304e+16 Ratio of Revenue earned: highest Paid Actor [0.48788192]
Analysis
From what we learned, revenue is the easiest factor to predict in the success of the movie. After conducting linear regression, we find that budget and number of producers have a decent accuracy in predicting movie revenue. Their normalized RMSE is around 0.29, which is reasonably good. Their MSEs are 5.49e16 for producers and 4.78e16 for budget. The producer MSE is decent considering the lowest producer value is 6.5, but the MSE for budget is questionable because it seems many budgets are lower than that value. For this, the normalized RMSE may be more useful. Regardless, it looks like both of them are fairly accurate in predicting movie revenue. The R^2 scores for both of them are OK as well, considering they both have scores around 0.5 and the max score is 1. This means that a good proportion of movie revenue's variance is explained by both budget and number of producers.
In comparison, max actor income is not very accurate in predicting movie revenue. Its normalized RMSE is 0.657, which is bad compared to our other features, all of which have values of around 0.29. The MSE values also show this. It has 7.48e16 when the highest value is around 1.1e9, which is significantly higher. This means max actor income isn't very accurate. Its R^2 score is 0.1097, which means the maximum actor income does not explain the variance for movie revenue very well.
Now knowing which features are accurate, we want to know how much of each resource to put in to get a certain revenue from a movie. We get that from the linear regression's coefficients. It turns out $3.17 is earned in revenue per $1 of budget, and $2.99e08 is gained per producer. This allows us to know how much each $1 in the budget is worth and how much each producer is worth, which will be helpful in determining movie success.
Random Forest:
Another method we can use to predict a movie's success is a random forest regressor. A random forest regressor does not overfit and can handle missing data. Furthermore, it is fast and efficient, and hyperparameters do not need to be tuned. This allows it to work with large datasets, meaning we can effectively add more data without losing too much time. In addition to its scalability, not being able to overfit means we do not need to separate the data into test and training sets for the model (although we chose to do so anyway for the model accuracy, but not for each individual label).
To evaluate how good our models are, we will be using the R^2 score and MSE for the same reasons as with linear regression. However, we will be evaluating the random forest regressor with MSE rather than RMSE, because random forest seems to capture general trends better than linear regression and will sometimes exaggerate its trends. This may create more error and make the model seem less accurate, since a trend is more important than an exact number in predicting a movie's success. MSE is used to compare to this normalized MSE value.
from src.randomForest import randomForest
from sklearn import metrics
# Random forests are trained on the three original X columns.
X, y = xFeatures, yFeatures

# Shapes and column names, for reference.
for shown in (y.shape, X.shape, xLabels, yLabels):
    print(shown)

# Train one forest per target column; the helper is expected to report the
# scores itself.
for col in range(y.shape[1]):
    print(f"Training model to predict {yLabels[col]}")
    reg = randomForest.randomForest(X, y[:, col])
    print('\n')
(1347, 6) (1347, 3) ['producers', 'budget', 'highest_star_revenue'] ['imdb_rating', 'metascore', 'imdb_votes', 'awards', 'revenue', 'popularity_score'] Training model to predict imdb_rating Training Score : 0.8502414953524309 Test score: -0.042033993840185246 Training model to predict metascore Training Score : 0.858252230563187 Test score: 0.16993996835752878 Training model to predict imdb_votes Training Score : 0.8578800992679925 Test score: 0.057844445677811995 Training model to predict awards Training Score : 0.8530424851208551 Test score: -0.08107812499999967 Training model to predict revenue Training Score : 0.9247816748852608 Test score: 0.6086262053343992 Training model to predict popularity_score Training Score : 0.8542878747255873 Test score: 0.11783838849212125
#revenue usng forest regressor and plotting first 50
from sklearn.model_selection import train_test_split
# Train a forest for revenue (column 4 of the Y features).
print(f"Training model to predict {yLabels[4]}")
reg = randomForest.randomForest(X, y[:, 4])
print('\n')

# NOTE(review): reg is fitted inside randomForest.randomForest. Evaluating it
# on x_test is only a fair test if that helper holds out the same rows as this
# split (random_state=1, test_size=0.1); the recorded output, where the
# helper's test score equals the R^2 printed below, suggests it does.
x_train, x_test, y_train, y_test = train_test_split(
    xFeatures, yFeatures, random_state=1, test_size=0.1
)
y_rev = y_test[:, 4]
pred_rev = reg.predict(x_test)

# Actual (blue) against predicted (red) revenue for the first 50 test rows.
plt.figure()
plt.plot(y_rev[0:50], color='blue', label="actual")
plt.plot(pred_rev[0:50], color='red', label="predicted")
plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
plt.legend()
plt.show()

# The first value is the plain RMSE and the last one the RMSE divided by the
# spread of the predictions; the labels used to call both "Normalized MSE".
# sqrt(MSE) avoids the `squared` keyword that recent scikit-learn deprecates.
rmse_rev = np.sqrt(metrics.mean_squared_error(y_rev, pred_rev))
print("Revenue RMSE:", rmse_rev)
print("Revenue R^2 score:", metrics.r2_score(y_rev, pred_rev))
print("Revenue Normalized RMSE:", rmse_rev / (np.max(pred_rev) - np.min(pred_rev)))
Training model to predict revenue Training Score : 0.9214863716632893 Test score: 0.6171737156621906
Revenue Normalized MSE: 128728640.45357291 Revenue R^2 Error: 0.6171737156621906 Revenue Normalized MSE: 0.11627703683729285
#metascore rating usng forest regressor and plotting next 50
from sklearn.model_selection import train_test_split
# Train a forest for the metascore (column 1 of the Y features).
print(f"Training model to predict {yLabels[1]}")
reg = randomForest.randomForest(X, y[:, 1])
print('\n')

# NOTE(review): evaluating reg on x_test is only a fair test if
# randomForest.randomForest holds out the same rows as this split
# (random_state=1, test_size=0.1) -- confirm in the helper.
x_train, x_test, y_train, y_test = train_test_split(
    xFeatures, yFeatures, random_state=1, test_size=0.1
)
y_meta = y_test[:, 1]
pred_meta = reg.predict(x_test)

# Actual (blue) against predicted (red) metascore for the first 50 test rows.
plt.figure()
plt.plot(y_meta[0:50], color='blue', label="actual")
plt.plot(pred_meta[0:50], color='red', label="predicted")
plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
plt.legend()
plt.show()

# The first value is the plain RMSE and the last one the RMSE divided by the
# spread of the predictions; the labels used to call both "Normalized MSE".
# sqrt(MSE) avoids the `squared` keyword that recent scikit-learn deprecates.
rmse_meta = np.sqrt(metrics.mean_squared_error(y_meta, pred_meta))
print("Metascore RMSE:", rmse_meta)
print("Metascore R^2 score:", metrics.r2_score(y_meta, pred_meta))
print("Metascore Normalized RMSE:", rmse_meta / (np.max(pred_meta) - np.min(pred_meta)))
Training model to predict metascore Training Score : 0.8603002819441713 Test score: 0.12442883561610174
Metascore Normalized MSE: 21.154185066359457 Metascore R^2 Error: 0.12442883561610174 Metascore Normalized MSE: 0.2392736688876762
#imdb usng forest regressor and plotting 50
from sklearn.model_selection import train_test_split
# Train a forest for the IMDB rating (column 0 of the Y features).
print(f"Training model to predict {yLabels[0]}")
reg = randomForest.randomForest(X, y[:, 0])
print('\n')

# NOTE(review): evaluating reg on x_test is only a fair test if
# randomForest.randomForest holds out the same rows as this split
# (random_state=1, test_size=0.1) -- confirm in the helper.
x_train, x_test, y_train, y_test = train_test_split(
    xFeatures, yFeatures, random_state=1, test_size=0.1
)
y_imdb = y_test[:, 0]
pred_imdb = reg.predict(x_test)

# Actual (blue) against predicted (red) IMDB rating for the first 50 test
# rows. This cell used to plot y_meta / pred_meta, i.e. the metascore series
# left over from the previous cell.
plt.figure()
plt.plot(y_imdb[0:50], color='blue', label="actual")
plt.plot(pred_imdb[0:50], color='red', label="predicted")
plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
plt.legend()
plt.show()

# The first value is the plain RMSE and the last one the RMSE divided by the
# spread of the predictions; the labels used to call both "Normalized MSE".
# sqrt(MSE) avoids the `squared` keyword that recent scikit-learn deprecates.
rmse_imdb = np.sqrt(metrics.mean_squared_error(y_imdb, pred_imdb))
print("IMDB RMSE:", rmse_imdb)
print("IMDB R^2 score:", metrics.r2_score(y_imdb, pred_imdb))
print("IMDB Normalized RMSE:", rmse_imdb / (np.max(pred_imdb) - np.min(pred_imdb)))
Training model to predict imdb_rating Training Score : 0.8474086168833846 Test score: -0.05514071807786558
IMDB Normalized MSE: 1.3042981569811891 IMDB R^2 Error: -0.05514071807786558 IMDB Normalized MSE: 0.1952542151169445
Analysis:
The random forest regression is really good at accurately depicting trends between our labels and our data. Our X features: number of producers and highest star revenue can accurately predict Imdb ratings, metascore ratings, and revenue. They have normalized MSE values of 0.178, 0.239, and 0.119. These are very low MSE. This means there is relatively low error between predicted and training labels, meaning the random forest can accurately predict what a movie would make given X features that a person may want to use for a movie, at least for the three values tested. In addition to a low normalized MSE score for all features, the revenue also has a high R^2 score of almost 62%. This indicates that a lot of the variance in revenues can be predicted from our x features. Revenue is especially high, which supports it as being the best indicator of success.
While we can reasonably predict a label when given the X features, we cannot seem to depict a general trend. It seems either our model or the nature of the data itself cannot find an equation to approximate and calculate the revenue that would be generated, meaning we have to run the model multiple times with multiple x feature values until we get a desired label value. While this is not much of a downside because random forest is fast, it is tedious to try multiple values and potentially time-consuming to check each x feature input to see whether it would reach a desired label.
Now we have enough evidence to conclude that revenue is a fairly good indicator for success. Its linear regression model has the lowest RMSE and the highest R^2 score. Random forest also determined that revenue had the lowest MSE and the highest R^2 score with really good accuracy. This suggests that revenue is the best label for success. Here, we use NLP to test a model on revenue to see whether a script is a reasonable predictor of movie revenue and whether we can accurately predict a movie's revenue using its script.
NLP:
We wanted to give this idea a final shot by using an NLP-based approach based on the assumption that movies with similar plots will have similar revenue. First, we parsed movie overviews from our existing dataset(short descriptions of movies) and split the data into testing and training data. Then, we converted those overviews into vectors using the Python library sentence-transformers which provides an easy-to-use API for making embeddings. The goal of the model is to cluster similar movies together in hopes of improving prediction accuracy for the different success metrics for the films.
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
embedder = SentenceTransformer('all-MiniLM-L6-v2')
print(len(data))

# Keep only the movies whose revenue exceeds 10,000,000.
temp_x = [i for i in data if data[i]['revenue'] > 10000000]
print(len(temp_x))

# First 80% of the titles for training, the remainder for testing; the
# targets are the matching revenues.
split_at = int(len(temp_x) * 0.8)
x_train = temp_x[:split_at]
x_test = temp_x[split_at:]
y_train = [data[key]['revenue'] for key in x_train]
y_test = [data[key]['revenue'] for key in x_test]
print(f"training data : {len(x_train), len(y_train)}, testing data : {len(x_test), len(y_test)}")

# Title -> overview embedding, cached on disk between runs.
# NOTE(review): a cache written for a different x_train is reused as is;
# delete sentence_encoding.pickle whenever the training titles change.
e_map = {}
try:
    with open('sentence_encoding.pickle', 'rb') as handle:
        e_map = pickle.load(handle)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing or unreadable: embed every training overview and store
    # the result. (A bare `except:` used to hide every other error too,
    # including KeyboardInterrupt.)
    for i in x_train:
        e_map[i] = embedder.encode([data[i]['overview']])[0]
    with open('sentence_encoding.pickle', 'wb') as handle:
        pickle.dump(e_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Perform k-means clustering on the embeddings.
num_clusters = 30
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(list(e_map.values()))
cluster_assignment = clustering_model.labels_

# Group the titles by assigned cluster. Keys and values of e_map iterate in
# the same order, so the n-th label belongs to the n-th title; zip avoids
# rebuilding the key list on every iteration.
clustered_sentences = [[] for _ in range(num_clusters)]
for title, cluster_id in zip(e_map, cluster_assignment):
    clustered_sentences[cluster_id].append(title)

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")
1347 1107 training data : (885, 885), testing data : (222, 222) Cluster 1 ['The World Is Not Enough', 'The Abyss', 'Avatar', 'Deep Rising', 'The Hunt for Red October', 'Master and Commander: The Far Side of the World', "Ocean's Twelve", 'Paul', 'Pirates of the Caribbean: The Curse of the Black Pearl', 'The Rock', 'Serenity', 'Sphere', 'Three Kings', 'Sanctum', 'Ghost Ship'] Cluster 2 ['The Deep End of the Ocean', 'The Exorcist', 'Practical Magic', 'A Nightmare on Elm Street 5: The Dream Child', 'Bad Teacher', 'Annabelle', 'The Box', 'Carrie', 'Case 39', 'Changeling', 'Colombiana', 'Coraline', 'Cruel Intentions', 'Date Night', 'Drag Me to Hell', 'Drive Angry', 'Eastern Promises', 'Easy A', 'Erin Brockovich', 'Eternal Sunshine of the Spotless Mind', 'The Fault in Our Stars', 'Final Destination 2', 'Gothika', 'The Grudge', 'Hanna', "Jennifer's Body", 'Juno', 'The Long Kiss Goodnight', 'The Next Three Days', "One Flew Over the Cuckoo's Nest", 'Peggy Sue Got Married', 'Precious', 'Prom Night', 'The Roommate', 'Stir of Echoes', 'The Rage: Carrie 2', 'What Lies Beneath', 'When a Stranger Calls', 'End of Days', 'The Boss', 'The Conspirator', 'The Debt', 'The Howling', 'The Phantom of the Opera', 'Labor Day', 'Duplex', 'Flightplan', 'Good Luck Chuck', 'A Nightmare on Elm Street', 'Philomena', 'The Scarlet Letter', "Winter's Bone", 'Atonement', "Pan's Labyrinth"] Cluster 3 ['The Avengers', 'Malcolm X', 'Hollow Man', 'Ali', 'Apocalypse Now', 'Austin Powers: The Spy Who Shagged Me', 'The Black Dahlia', 'Broken Arrow', 'Confessions of a Dangerous Mind', 'Dances with Wolves', 'Despicable Me 2', 'G.I. Joe: The Rise of Cobra', 'Hard Rain', 'Hellboy', 'The Ides of March', 'Inglourious Basterds', 'Kung Fu Panda', 'The Last Samurai', 'Lord of War', 'The Losers', 'Mad Max 2: The Road Warrior', 'The Manchurian Candidate', 'The Mask', 'Public Enemies', 'Straight Outta Compton', 'Valkyrie', 'Wild Wild West', 'xXx', 'Boyz n the Hood', 'J. 
Edgar', 'The Naked Gun 2½: The Smell of Fear', 'Patton', 'Rambo: First Blood Part II', 'RED', 'Iron Man', 'American Outlaws', 'Butch Cassidy and the Sundance Kid', 'Courage Under Fire', 'The Four Feathers', 'Glory Road', 'The Thin Red Line', 'The Incredible Hulk', '47 Ronin', 'Despicable Me'] Cluster 4 ['A Night at the Roxbury', 'The Change-Up', 'Gremlins', 'Hall Pass', 'The Hangover', 'Happy Feet', 'High Fidelity', 'Horrible Bosses', 'Legend', 'Scott Pilgrim vs. the World', 'Speed Racer', 'This Is 40', 'Disturbing Behavior', "A Hard Day's Night", 'Talladega Nights: The Ballad of Ricky Bobby', 'The Flintstones', 'Poltergeist', 'Stuart Little 2'] Cluster 5 ['The Blair Witch Project', 'Alien³', 'The Boxtrolls', 'Chronicle', 'Ex Machina', 'Fantastic Four', 'The Fifth Element', 'Final Destination', 'I Am Number Four', 'Monte Carlo', 'Moonrise Kingdom', 'Mud', 'The Pacifier', 'Pandorum', 'Panic Room', 'White Squall', 'Wild Hogs', 'As Above, So Below', 'Scooby-Doo', 'Sorority Row', 'The Crazies', 'The Fog', 'The Mist', 'Timeline', 'The Faculty', 'Jeepers Creepers', 'E.T. 
the Extra-Terrestrial'] Cluster 6 ['Fantasia 2000', 'The Princess Bride', '12 Years a Slave', 'Aladdin', 'Anastasia', 'Antz', 'The Lord of the Rings: The Fellowship of the Ring', 'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe', 'Clash of the Titans', 'Conan the Barbarian', 'Dragonslayer', 'Frozen', 'Hellboy II: The Golden Army', 'How to Train Your Dragon 2', 'How to Train Your Dragon', 'Into the Woods', 'The Lord of the Rings: The Return of the King', 'The Lord of the Rings: The Two Towers', 'Mulan', 'The Prestige', 'Queen of the Damned', 'Rise of the Guardians', 'Shrek the Third', 'Shrek', 'Snow White and the Huntsman', 'The Hobbit: An Unexpected Journey', 'Thor', 'The Three Musketeers', 'The Wizard of Oz', 'Alice in Wonderland', 'Big Trouble in Little China', 'Percy Jackson & the Olympians: The Lightning Thief', 'Prince of Persia: The Sands of Time', 'The NeverEnding Story', 'The Smurfs', "The Sorcerer's Apprentice", 'Time Bandits', 'Your Highness', 'Beowulf', 'Highlander: The Final Dimension', 'Highlander: Endgame', 'Kingdom of Heaven', 'The Lion King', 'Chocolat'] Cluster 7 ['The Addams Family', 'Resident Evil: Afterlife', 'Batman Returns', 'Beasts of the Southern Wild', 'Fantastic Mr. 
Fox', 'Finding Nemo', 'Frankenweenie', 'Gremlins 2: The New Batch', 'The Haunting', 'Hellraiser', 'Indiana Jones and the Last Crusade', 'Land of the Dead', 'ParaNorman', 'Pet Sematary', 'Priest', 'Psycho', 'Red Riding Hood', 'TMNT', 'Witness', '10 Cloverfield Lane', 'Indiana Jones and the Kingdom of the Crystal Skull', 'Night at the Museum: Battle of the Smithsonian', 'Paranormal Activity', 'We Bought a Zoo', 'Beetlejuice', 'Batman Forever'] Cluster 8 ['A Nightmare on Elm Street 3: Dream Warriors', 'A Nightmare on Elm Street 4: The Dream Master', "Freddy's Dead: The Final Nightmare", 'Halloween', 'Halloween: Resurrection', 'Blade: Trinity', 'Blade', 'Dawn of the Dead', 'The Evil Dead', 'Fright Night', 'Insidious', 'Super 8', '1408', '30 Days of Night', 'Black Christmas', 'Frailty', 'Blade II', 'Halloween 4: The Return of Michael Myers', 'Twilight Zone: The Movie', 'House on Haunted Hill', 'Scary Movie 2', 'The Sixth Sense', 'Let Me In'] Cluster 9 ['Amour', 'Blue Valentine', 'Cars 2', 'Chasing Amy', 'The Deer Hunter', 'Dumb and Dumber', 'Love & Basketball', "My Best Friend's Wedding", 'Orphan', 'Runaway Bride', "Something's Gotta Give", 'Water for Elephants', 'Unfaithful', 'Lars and the Real Girl', 'Alfie', 'Catwoman', 'Pearl Harbor', 'The Wedding Date', 'When Harry Met Sally...', 'Enough Said'] Cluster 10 ['Alien', 'Jason X', 'King Kong', '2012', 'District 9', 'Alien: Resurrection', 'Aliens', 'Angels & Demons', 'Armageddon', 'The Book of Eli', 'The Croods', 'The Day the Earth Stood Still', 'Dune', 'Edward Scissorhands', 'Flash Gordon', 'Gattaca', 'Gravity', 'The Hills Have Eyes', "The Hitchhiker's Guide to the Galaxy", 'Raiders of the Lost Ark', 'Indiana Jones and the Temple of Doom', 'Interstellar', 'The Island', 'Legion', 'Life of Pi', 'Lost in Space', 'Mission to Mars', 'The Postman', 'Prometheus', 'The Road', "Schindler's List", 'Signs', 'Starship Troopers', 'The Theory of Everything', 'The Thing', 'Year One', 'Escape from the Planet of the Apes', 'In Bruges', 
'Piranha 3D', 'Resident Evil: Extinction', 'Skyline', 'The Mothman Prophecies', 'The Time Machine', 'Planet of the Apes', 'Beneath the Planet of the Apes', 'Children of Men', 'Close Encounters of the Third Kind', 'Cloud Atlas', "Ender's Game", 'The English Patient', 'The Island of Dr. Moreau', 'Out of Africa', 'Seven Years in Tibet', 'Space Cowboys', 'Battlefield Earth', 'Elysium'] Cluster 11 ['The Running Man', 'The American', 'Assassins', 'Blade Runner', 'Crank', 'The Crow', 'The Dark Knight Rises', 'Deadpool', 'Django Unchained', 'Flight', 'Gangs of New York', 'Get Carter', 'The Ghost and the Darkness', 'Gladiator', 'Good Will Hunting', 'JFK', 'The Life of David Gale', 'Machete', 'Meet Joe Black', 'Ninja Assassin', 'No Country for Old Men', 'Phone Booth', "Pirates of the Caribbean: Dead Man's Chest", 'The Revenant', 'Robin Hood: Prince of Thieves', 'Saw', 'Scream 2', 'Scream 3', 'Se7en', 'Suspect Zero', 'Wanted', 'Watchmen', 'The Punisher', 'Urban Legend', 'Zodiac', '3:10 to Yuma', 'Hero', 'Kill Bill: Vol. 2', 'Kill Bill: Vol. 
1', "Monster's Ball", 'Vantage Point', 'Batman & Robin', 'Batman Begins', 'Edge of Tomorrow'] Cluster 12 ['The Crying Game', 'The Constant Gardener', 'A Few Good Men', 'Barry Lyndon', 'The Bodyguard', 'Cradle 2 the Grave', 'Fair Game', 'The Fugitive', 'Liar Liar', 'Lone Star', 'Man on Fire', 'The Men Who Stare at Goats', 'Mission: Impossible', 'Out of Sight', 'Sherlock Holmes', 'Sicario', 'The Silence of the Lambs', 'Skyfall', 'Source Code', 'Tombstone', 'Whiteout', 'The Bourne Supremacy', 'From Hell', 'Payback', 'Salt', 'Thank You for Smoking', 'The Guard', 'The Jackal', 'Unforgiven', 'White House Down', '16 Blocks', 'The Adventures of Ford Fairlane', 'Enemy of the State', 'Goldfinger', 'I Think I Love My Wife', 'In the Valley of Elah', 'John Q', 'Minority Report', 'Mission: Impossible II', 'Street Kings', 'Prisoners', 'Training Day', 'Fast Five'] Cluster 13 ['127 Hours', 'The Adjustment Bureau', 'Amadeus', 'American Beauty', 'Big Fish', 'Big', 'Boyhood', 'The Butterfly Effect', 'The Curious Case of Benjamin Button', 'Dark City', 'Forrest Gump', 'The Game', 'Ghost Rider', 'Her', 'Hot Tub Time Machine', 'How to Lose Friends & Alienate People', 'The Invention of Lying', 'The Jacket', 'Larry Crowne', 'The Perks of Being a Wallflower', 'Rise of the Planet of the Apes', 'Room', 'The Secret Life of Walter Mitty', 'Unbreakable', 'Up', 'A Christmas Carol', 'About Time', 'Back to the Future Part II', 'Click', 'The Village', 'Barbershop', 'Elf', 'Frequency', 'Hook', 'Hustle & Flow', "Mr. 
Holland's Opus", 'Rent', 'Stranger Than Fiction', 'A Christmas Story', 'Back to the Future'] Cluster 14 ['The Big Lebowski', 'The Life Aquatic with Steve Zissou', '42', 'A Serious Man', 'American Hustle', 'As Good as It Gets', 'Being John Malkovich', 'Burn After Reading', 'The Devil Wears Prada', 'The Doors', 'The Elephant Man', 'Fast Times at Ridgemont High', 'Funny People', 'Get on Up', 'Groundhog Day', 'Harold & Kumar Go to White Castle', 'The Insider', 'Jay and Silent Bob Strike Back', 'Jerry Maguire', 'Man on the Moon', 'Moneyball', 'My Week with Marilyn', 'The Pianist', 'The Producers', 'Syriana', 'The Ugly Truth', 'Up in the Air', 'Cop Land', 'Diary of a Wimpy Kid', 'Sausage Party', 'Scrooged', 'Last Action Hero', 'Looney Tunes: Back in Action', 'Quartet', 'All That Jazz'] Cluster 15 ['15 Minutes', '30 Minutes or Less', 'Absolute Power', 'Bad Santa', 'Black Rain', 'Cellular', 'The Departed', 'Die Hard', 'Donnie Brasco', 'Drive', 'Eagle Eye', 'Entrapment', 'From Dusk Till Dawn', 'Hostage', 'Insomnia', 'The Italian Job', 'Jackie Brown', 'L.A. 
Confidential', 'Law Abiding Citizen', 'Looper', 'Max Payne', 'Mirrors', "Ocean's Eleven", 'Office Space', 'Pineapple Express', 'Reindeer Games', 'The Relic', 'Ronin', 'Snatch', 'Identity', 'Ishtar', 'Taken', 'The Town', 'Tower Heist', 'Collateral', 'Fun with Dick and Jane', 'Inside Man', 'The Place Beyond the Pines', 'Seven Psychopaths', 'Speed', 'True Romance'] Cluster 16 ['Chill Factor', 'Airplane!', 'Argo', 'The Boondock Saints II: All Saints Day', 'Collateral Damage', 'Die Hard 2', 'Do the Right Thing', 'Escape from L.A.', 'Escape from New York', 'Face/Off', 'Jaws 2', 'Jaws', 'The Kingdom', 'Men in Black', 'Oblivion', 'Predator', 'Rush Hour 2', 'The Siege', 'Thirteen Days', 'Thunderbirds', 'Tomorrow Never Dies', 'Unknown', 'Con Air', 'Predator 2', 'Rendition', 'The Expendables', 'The Happening', 'Black Hawk Down', 'Air Force One', 'Conspiracy Theory', 'Executive Decision', 'Good Night, and Good Luck.', 'Munich', 'Olympus Has Fallen', 'Outbreak', 'Resident Evil', 'The X Files', 'Bridge of Spies', 'Chain Reaction'] Cluster 17 ['17 Again', 'American History X', 'The Blind Side', 'The Bounty Hunter', 'Breakdown', 'The Cable Guy', 'Cast Away', "Cirque du Freak: The Vampire's Assistant", 'Disturbia', 'Extract', 'I Love You Phillip Morris', 'Milk', 'Next Friday', 'Observe and Report', 'Semi-Pro', "She's Out of My League", 'War of the Worlds', 'Wild Things', 'The Wrestler', 'Days of Thunder', '8 Mile', 'Ace Ventura: Pet Detective', 'Anchorman: The Legend of Ron Burgundy', 'The Girl Next Door', 'Elizabethtown', 'The Karate Kid', 'Little Black Book', 'Mean Girls', 'What Women Want', 'Any Given Sunday', 'Crazy Heart'] Cluster 18 ['Halloween II', 'The American President', 'Anna Karenina', 'Cold Mountain', 'Crazy, Stupid, Love.', 'The Descendants', 'Garden State', "It's Complicated", 'Lost in Translation', 'Major League', 'The Proposal', 'Silver Linings Playbook', 'Walking Tall', 'Yes Man', 'Scream 4', 'The Bridges of Madison County', 'Young Adult', 'About Schmidt', 
'Cheaper by the Dozen', 'The Best Exotic Marigold Hotel'] Cluster 19 ['American Psycho', 'Arbitrage', "The Devil's Advocate", 'Glengarry Glen Ross', 'The Good Girl', 'Inception', 'The Informant!', 'The Lincoln Lawyer', 'The Mechanic', 'Philadelphia', 'The Social Network', 'The Verdict', 'Wall Street: Money Never Sleeps', 'Wall Street', 'The Wolf of Wall Street', 'Repo Men', 'The Apartment', 'Boiler Room', 'Casino Royale', 'Network'] Cluster 20 ['8MM', 'Antitrust', 'Commando', 'Fight Club', 'Gamer', 'The Girl with the Dragon Tattoo', 'Margin Call', 'The Matrix Reloaded', 'The Matrix', 'Point Break', 'Surrogates', 'Swordfish', 'Total Recall', 'Live Free or Die Hard', 'The Matrix Revolutions', 'WarGames', 'The Thirteenth Floor', 'From Russia with Love', 'Hitman', 'The Ninth Gate', 'You Only Live Twice', 'The Saint', 'Untraceable'] Cluster 21 ['An Education', 'Annie Hall', 'Big Eyes', 'Black Swan', 'Bridesmaids', 'Burlesque', "Charlie's Angels", 'Drop Dead Gorgeous', 'Enough', 'The Help', 'The Imaginarium of Doctor Parnassus', 'The Kids Are All Right', 'Legally Blonde', 'Notting Hill', 'Pretty Woman', 'Rush', 'Saving Mr. 
Banks', 'Shakespeare in Love', 'Titanic', 'The Tourist', 'Secretariat', 'The House Bunny', 'The Sound of Music', "The Time Traveler's Wife", 'The Young Victoria', 'Amélie', 'Gosford Park', 'Almost Famous', 'Girl with a Pearl Earring', 'Jaws: The Revenge', 'Little Miss Sunshine', 'A Mighty Heart', 'Some Like It Hot', "Coal Miner's Daughter", 'For Colored Girls'] Cluster 22 ['Jason Goes to Hell: The Final Friday', 'Friday the 13th Part 2', 'Friday the 13th Part VI: Jason Lives', 'Friday the 13th: The Final Chapter', 'The Bourne Identity', 'The Bourne Ultimatum', 'Buried', 'Cliffhanger', "National Lampoon's Vacation", 'Friday the 13th Part VIII: Jason Takes Manhattan', 'Ghost', 'The Green Mile', 'I Still Know What You Did Last Summer', 'The Martian', 'The Master', 'The Reader', 'Saving Private Ryan', 'Sling Blade', 'Robin Hood', "The General's Daughter", 'Halloween: The Curse of Michael Myers', 'Dear John', 'Secret Window', 'Sweet November'] Cluster 23 ['The Godfather', 'The Godfather: Part II', 'Stigmata', 'American Gangster', 'Analyze That', 'The Fighter', 'The French Connection', 'From Here to Eternity', 'Only God Forgives', 'Pulp Fiction', 'Raging Bull', 'Rocky', 'Scarface', 'The Shawshank Redemption', 'The Talented Mr. 
Ripley', 'War Horse', 'Warrior', 'We Own the Night', 'Born on the Fourth of July', "The Devil's Own", 'Million Dollar Baby', 'Rocky Balboa'] Cluster 24 ['Napoleon Dynamite', '10 Things I Hate About You', 'All About Steve', 'American Graffiti', "He's Just Not That Into You", 'No Strings Attached', 'Superbad', 'Tin Cup', 'Sex Drive', 'The Switch', "Valentine's Day", 'Notes on a Scandal', 'Before Sunset', "There's Something About Mary", 'Before Midnight'] Cluster 25 ['Spider-Man', 'Austin Powers: International Man of Mystery', 'Hancock', 'I, Robot', 'The Lego Movie', "Logan's Run", 'Megamind', 'Men in Black 3', 'Small Soldiers', 'Star Wars: Episode II - Attack of the Clones', 'Star Wars: Episode I - The Phantom Menace', 'Terminator 2: Judgment Day', 'Terminator Salvation', 'The Terminator', 'After Earth', 'Green Lantern', 'Terminator 3: Rise of the Machines', 'X-Men Origins: Wolverine', 'K-PAX', 'RoboCop', 'Star Wars: Episode III - Revenge of the Sith', 'Superman II', 'Superman III', 'Superman IV: The Quest for Peace', 'Superman'] Cluster 26 ['The Lost World: Jurassic Park', 'Jurassic Park', 'Lake Placid', 'Jurassic World', 'Night at the Museum', 'Super Mario Bros.', 'Monsters, Inc.', 'Doctor Zhivago', 'Jurassic Park III'] Cluster 27 ['Angel Eyes', 'Babel', 'Crash', 'House of 1000 Corpses', 'Intolerable Cruelty', 'Magnolia', 'Marley & Me', 'Midnight in Paris', 'Pride & Prejudice', 'Revolutionary Road', 'Romeo + Juliet', 'Sense and Sensibility', 'Spanglish', '21 Grams', 'Another Year', 'The Horse Whisperer', 'Casablanca', 'The Fisher King', 'Little Children', 'Memoirs of a Geisha', 'A Perfect Getaway', 'Vicky Cristina Barcelona', "Rosemary's Baby", 'Far from Heaven'] Cluster 28 ['A Most Violent Year', 'Alone in the Dark', 'Analyze This', 'Boogie Nights', 'Bruce Almighty', 'Casino', 'Catch Me If You Can', 'Dallas Buyers Club', 'Fear and Loathing in Las Vegas', 'Fruitvale Station', 'Go', 'The Great Gatsby', 'Grosse Pointe Blank', 'Into the Wild', 'Knocked Up', 'Les 
Misérables', 'Midnight Cowboy', 'Nine', 'Taxi Driver', 'Trainspotting', 'Vanilla Sky', 'The Terminal', 'This Is the End', '25th Hour', 'Blast from the Past', 'The Diving Bell and the Butterfly', "Get Rich or Die Tryin'", 'A Good Year', 'The Lost Weekend', 'Leaving Las Vegas'] Cluster 29 ['Spaceballs', 'Event Horizon', 'Pitch Black', 'Star Trek: First Contact', 'Star Trek: Generations', 'Star Trek II: The Wrath of Khan', 'Star Trek: Nemesis', 'Star Trek', 'Planet 51', 'Star Trek VI: The Undiscovered Country', 'Independence Day', 'Star Trek III: The Search for Spock', 'Star Trek IV: The Voyage Home', 'Star Trek V: The Final Frontier', 'Star Trek: Insurrection', 'Star Trek: The Motion Picture'] Cluster 30 ['Braveheart', 'The Last of the Mohicans', 'The Messenger: The Story of Joan of Arc', 'Anonymous', 'Dead Poets Society', 'Dogma', 'Elizabeth: The Golden Age', 'Gandhi', 'G.I. Jane', 'Invictus', "The King's Speech", 'Lincoln', 'The Patriot', 'The Queen', 'Traffic', '28 Weeks Later', 'Exorcist: The Beginning', 'Lawrence of Arabia', 'Letters from Iwo Jima', 'Troy', 'V for Vendetta', 'The Birth of a Nation']
from collections import defaultdict

# mappings[label][cluster_index] -> mean value of `label` over the movies
# listed in clustered_sentences[cluster_index].
mappings = defaultdict(dict)
for i, members in enumerate(clustered_sentences):
    if not members:
        # An empty cluster has no mean. The running-sum version raised
        # KeyError here because nothing had been stored for this index;
        # leaving the entry absent keeps any later lookup failure explicit.
        continue
    for label in yLabels:
        # sum() adds left to right starting from 0, i.e. the same order of
        # additions as the original running total.
        mappings[label][i] = sum(data[movie][label] for movie in members) / len(members)

# For every label, show the clusters ordered from highest to lowest mean.
for label in yLabels:
    print(f"{label}")
    print(dict(sorted(mappings[label].items(), key=lambda item: item[1], reverse=True)))
imdb_rating {22: 7.2272727272727275, 29: 6.954545454545454, 18: 6.95, 26: 6.75, 20: 6.6571428571428575, 14: 6.585365853658536, 10: 6.568181818181818, 12: 6.55, 0: 6.533333333333333, 5: 6.5227272727272725, 19: 6.521739130434782, 11: 6.511627906976744, 13: 6.485714285714286, 27: 6.466666666666667, 17: 6.45, 28: 6.4375, 2: 6.363636363636363, 6: 6.346153846153846, 23: 6.333333333333333, 25: 6.333333333333333, 21: 6.291666666666667, 24: 6.24, 15: 6.153846153846154, 9: 6.017857142857143, 16: 6.0, 1: 5.962962962962963, 8: 5.9, 4: 5.888888888888889, 7: 5.608695652173913, 3: 5.555555555555555} metascore {22: 71.77272727272727, 29: 70.31818181818181, 18: 69.1, 26: 68.0, 13: 65.4, 20: 65.4, 17: 63.55, 23: 62.8, 27: 61.03333333333333, 6: 60.46153846153846, 5: 60.45454545454545, 12: 59.825, 2: 59.22727272727273, 11: 58.93023255813954, 8: 57.3, 14: 56.53658536585366, 28: 56.3125, 0: 55.8, 24: 55.76, 9: 55.375, 1: 55.2037037037037, 15: 54.17948717948718, 10: 53.45454545454545, 16: 53.45161290322581, 19: 52.30434782608695, 25: 50.77777777777778, 21: 49.625, 3: 49.22222222222222, 7: 48.73913043478261, 4: 47.851851851851855} imdb_votes {22: 515949.7272727273, 10: 424739.5227272727, 25: 376706.77777777775, 24: 364022.88, 5: 362801.0681818182, 12: 340546.85, 18: 338513.45, 19: 321424.9130434783, 0: 297774.13333333336, 9: 296324.28571428574, 29: 272464.7727272727, 14: 268751.46341463417, 27: 260319.76666666666, 2: 255879.06818181818, 21: 252417.625, 11: 232421.97674418605, 6: 222940.65384615384, 20: 217075.25714285715, 13: 196429.34285714285, 15: 194200.10256410256, 17: 187720.85, 16: 183627.7741935484, 23: 171595.66666666666, 1: 161614.37037037036, 26: 160571.625, 3: 160286.88888888888, 28: 156789.125, 4: 156419.44444444444, 7: 149241.60869565216, 8: 125400.95} awards {0: 1.8, 5: 1.6136363636363635, 24: 1.48, 29: 1.4090909090909092, 15: 1.3846153846153846, 9: 1.375, 21: 1.375, 26: 1.375, 28: 1.375, 16: 1.3548387096774193, 25: 1.3333333333333333, 20: 1.3142857142857143, 18: 1.3, 27: 
1.3, 4: 1.2962962962962963, 10: 1.2727272727272727, 7: 1.2608695652173914, 11: 1.2093023255813953, 2: 1.2045454545454546, 12: 1.175, 8: 1.15, 1: 1.1481481481481481, 14: 1.146341463414634, 19: 1.1304347826086956, 6: 1.1153846153846154, 13: 1.0285714285714285, 3: 1.0, 17: 1.0, 22: 1.0, 23: 1.0} revenue {25: 484269498.0, 0: 363300703.6666667, 5: 359467640.5, 24: 355977710.12, 2: 225819281.5, 10: 219592406.0, 9: 202689694.0357143, 6: 193954937.3846154, 11: 191145326.4651163, 20: 174853658.68571427, 12: 169447673.375, 19: 166906361.6956522, 21: 164709107.875, 15: 161173959.3846154, 28: 156486793.0625, 18: 149940884.95, 29: 141779915.04545453, 8: 140247419.55, 3: 136053405.1111111, 17: 133302507.7, 4: 126560091.8888889, 16: 119018465.5483871, 14: 111575817.92682926, 27: 110594605.1, 22: 106671979.9090909, 7: 100859043.6521739, 23: 100777022.4, 13: 96225646.51428571, 1: 95796662.4074074, 26: 85588361.41666667} popularity_score {25: 73.13332877777779, 10: 60.13670738636363, 0: 59.66313553333334, 5: 59.543805613636344, 9: 54.64372623214286, 22: 51.05131840909092, 24: 50.51996491999999, 12: 44.20950965, 19: 42.762771304347815, 2: 42.76051827272727, 6: 42.55347638461538, 18: 40.282002799999994, 21: 37.529143375, 4: 37.10236970370371, 14: 36.50008470731707, 15: 34.89498387179487, 27: 34.42673523333334, 11: 34.29388260465117, 29: 33.43361127272727, 3: 31.32293883333333, 7: 30.41337647826087, 1: 29.85630872222223, 20: 29.72496308571429, 13: 28.964541828571424, 16: 28.11072774193549, 28: 28.00309025, 17: 27.899624450000005, 23: 26.56532166666667, 26: 23.837679458333337, 8: 21.318576399999998}
rmse = 0
import math
import numpy as np
from collections import defaultdict

# pred[label]   -> predicted value for each test movie, in x_test order.
# fields[label] -> RMSE of those predictions against the values in `data`.
pred = defaultdict(list)
fields = defaultdict(float)
# Per-label running sum of squared errors; the root is taken once at the end.
squared_errors = defaultdict(float)

for i in x_test:
    # Encode the movie's overview and ask the clustering model for its cluster;
    # the stored per-cluster mean of each label is used as the prediction.
    movie_encoding = embedder.encode([data[i]['overview']])[0]
    cluster = clustering_model.predict([movie_encoding])[0]
    for label in yLabels:
        estimate = mappings[label][cluster]
        squared_errors[label] += (estimate - data[i][label]) ** 2
        pred[label].append(estimate)

# RMSE = sqrt(sum(err**2) / n).  The previous form added sqrt(err**2 / n) per
# sample, which is sum(|err|) / sqrt(n) -- a different quantity that grows with
# the size of the test set and can exceed the value range once normalised.
for label, total in squared_errors.items():
    fields[label] = math.sqrt(total / len(x_test))

# Normalise each RMSE by the label's observed range over every entry in `data`.
for label in fields:
    observed = [data[title][label] for title in data]
    value_range = max(observed) - min(observed)
    print(f"{label} NRMSE")
    # A constant label has zero range; report NaN instead of dividing by zero.
    print(fields[label] / value_range if value_range else float("nan"))
    print("\n")
imdb_rating NRMSE 1.2036487916723635 metascore NRMSE 2.572173657582724 imdb_votes NRMSE 1.1581800552443429 awards NRMSE 1.2196819745150511 revenue NRMSE 0.8028939854929531 popularity_score NRMSE 0.4950142758816381
import matplotlib.pyplot as plt

# One figure per metric: the actual values of the test movies (default colour)
# against the predictions stored in `pred` (red), both plotted by position in
# x_test.  The four metrics are drawn in the same order as before.
positions = list(range(len(x_test)))
for metric in ('revenue', 'imdb_rating', 'popularity_score', 'metascore'):
    line = plt.plot(positions, [data[i][metric] for i in x_test], label="actual")
    plt.plot(positions, pred[metric], color="red", label="predicted")
    # Title and legend make each figure self-describing; without them the four
    # plots and their two series cannot be told apart.
    plt.title(metric)
    plt.xlabel("test movie index")
    plt.legend()
    plt.show()