import os

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from surprise import Dataset, Reader, KNNBasic, SVD, accuracy
from surprise.model_selection import train_test_split, GridSearchCV
import joblib
class RecommenderModels:
    def __init__(self, merged_df_with_tfidf, unique_movies_reduced, ratings_df):
        self.merged_df_with_tfidf = merged_df_with_tfidf
        self.unique_movies_reduced = unique_movies_reduced
        self.ratings_df = ratings_df
        self.popular_movies_unique = None
        self.user_profiles = None
        self.knn_user_based = None
        self.svd_mf = None
        self.svd_mf_tuned = None
        self.best_alpha = None
        self.model_dir = "models"
        os.makedirs(self.model_dir, exist_ok=True)
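    # Expected inputs (column names inferred from how they are used below;
    # adjust if your frames differ):
    #   merged_df_with_tfidf  - one row per (userId, movieId) interaction,
    #                           including a 'rating' column
    #   unique_movies_reduced - one row per movie with 'movieId', 'title',
    #                           'vote_average', 'vote_count' and latent feature
    #                           columns named 'svd_0', 'svd_1', ...
    #   ratings_df            - raw ratings with 'userId', 'movieId', 'rating'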
    # ---------- Popularity Baseline ----------
    def fit_popularity(self):
        # IMDb-style weighted rating: shrink each movie's average towards the
        # global mean C, with strength controlled by the vote-count prior m.
        C = self.unique_movies_reduced['vote_average'].mean()
        m = self.unique_movies_reduced['vote_count'].quantile(0.90)
        qualified = self.unique_movies_reduced[self.unique_movies_reduced['vote_count'] >= m].copy()

        def weighted_rating(x):
            v, R = x['vote_count'], x['vote_average']
            return (v / (v + m) * R) + (m / (v + m) * C)

        qualified['weighted_rating'] = qualified.apply(weighted_rating, axis=1)
        popular = qualified.sort_values('weighted_rating', ascending=False)
        self.popular_movies_unique = popular.groupby('movieId').first().reset_index()
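    # Worked example with illustrative numbers (not from the data): if C = 3.5,
    # m = 100, and a movie has v = 400 votes averaging R = 4.2, then
    # WR = 400/500 * 4.2 + 100/500 * 3.5 = 3.36 + 0.70 = 4.06.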
    # ---------- Content-Based ----------
    def fit_content_based(self, interactions=None):
        # Build one profile per user: the rating-weighted average of the latent
        # ('svd_') feature vectors of the movies that user rated. Ratings come
        # from the interactions frame (one row per user/movie pair), not from
        # unique_movies_reduced, which holds one row per movie.
        if interactions is None:
            interactions = self.merged_df_with_tfidf
        movie_id_to_index = pd.Series(self.unique_movies_reduced.index, index=self.unique_movies_reduced['movieId']).to_dict()
        svd_features = self.unique_movies_reduced.filter(like='svd_')
        self.user_profiles = {}
        for user_id in interactions['userId'].unique():
            user_ratings = interactions[interactions['userId'] == user_id][['movieId', 'rating']]
            profile = np.zeros(svd_features.shape[1])
            total_weight = 0
            for _, row in user_ratings.iterrows():
                idx = movie_id_to_index.get(int(row['movieId']))
                if idx is not None:
                    profile += svd_features.loc[idx].values * row['rating']
                    total_weight += row['rating']
            if total_weight > 0:
                profile /= total_weight
            self.user_profiles[user_id] = profile
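    # Each profile is a convex combination of item vectors, so it lives in the
    # same latent space as the movies and can be compared to them with cosine
    # similarity. Users whose rated movies never matched an index end up with a
    # zero vector and fall through to the popularity baseline below.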
    def get_content_based_recommendations(self, user_id, top_n=10):
        if self.user_profiles is None:
            raise ValueError("Call fit_content_based() first.")
        # Cold-start fallback: unknown users (or all-zero profiles) get the
        # popularity baseline instead of personalised results.
        if user_id not in self.user_profiles or np.all(self.user_profiles[user_id] == 0):
            if self.popular_movies_unique is not None:
                return self.popular_movies_unique[['title', 'vote_count', 'vote_average', 'weighted_rating']].head(top_n)
            return pd.DataFrame()
        user_profile = self.user_profiles[user_id]
        svd_features = self.unique_movies_reduced.filter(like='svd_')
        sim_scores = cosine_similarity(user_profile.reshape(1, -1), svd_features)[0]
        # Exclude already-rated movies, using positional indices throughout so
        # that sim_scores (a plain array) and iloc stay aligned even when the
        # dataframe index is not a 0..n-1 range.
        rated_ids = set(self.merged_df_with_tfidf[self.merged_df_with_tfidf['userId'] == user_id]['movieId'])
        unrated_pos = np.where(~self.unique_movies_reduced['movieId'].isin(rated_ids).values)[0]
        top_pos = unrated_pos[np.argsort(sim_scores[unrated_pos])[::-1][:top_n]]
        recs = self.unique_movies_reduced.iloc[top_pos][['title', 'vote_average', 'vote_count']]
        return recs.reset_index(drop=True)
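    # Example (hypothetical ids):
    #   models.get_content_based_recommendations(user_id=1, top_n=5)
    # returns a 5-row frame of unseen titles ranked by cosine similarity.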
    def get_content_based_score(self, user_id, movie_id):
        if self.user_profiles is None:
            raise ValueError("Call fit_content_based() first.")
        if user_id not in self.user_profiles or np.all(self.user_profiles[user_id] == 0):
            return 0.0
        user_profile = self.user_profiles[user_id]
        idx = self.unique_movies_reduced[self.unique_movies_reduced['movieId'] == movie_id].index
        if idx.empty:
            return 0.0
        # Take the first match so the feature matrix is always a single row.
        movie_features = self.unique_movies_reduced.loc[[idx[0]]].filter(like='svd_').values
        return cosine_similarity(user_profile.reshape(1, -1), movie_features)[0][0]
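    # Note the scale: this returns a cosine similarity in [-1, 1], whereas the
    # collaborative models below predict on the 1-5 rating scale. Keep that in
    # mind when blending the two in hybrid_prediction().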
    # ---------- Collaborative Filtering ----------
    def fit_cf(self):
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(self.ratings_df[['userId', 'movieId', 'rating']], reader)
        self.data = data
        self.trainset, self.testset = train_test_split(data, test_size=0.2, random_state=42)
        # Surprise expects the similarity measure under the 'name' key; a
        # 'similarity' key would be silently ignored and the default (MSD) used.
        self.knn_user_based = KNNBasic(sim_options={'name': 'cosine', 'user_based': True}, k=40)
        self.knn_user_based.fit(self.trainset)
        self.svd_mf = SVD(random_state=42)
        self.svd_mf.fit(self.trainset)
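    # KNNBasic materialises the full user-user similarity matrix, which is
    # O(n_users^2) in memory; on large datasets the SVD model is far cheaper.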
    def evaluate_cf(self):
        preds_knn = self.knn_user_based.test(self.testset)
        preds_svd = self.svd_mf.test(self.testset)
        rmse_knn = accuracy.rmse(preds_knn)
        rmse_svd = accuracy.rmse(preds_svd)
        return rmse_knn, rmse_svd
    # ---------- Hybrid Model ----------
    def hybrid_prediction(self, user_id, movie_id, alpha):
        cb_score = self.get_content_based_score(user_id, movie_id)
        # Pass raw ids exactly as they appear in ratings_df; casting them to
        # str would make every lookup miss and fall back to the global mean.
        try:
            cf1_pred = self.knn_user_based.predict(user_id, movie_id).est
        except Exception:
            cf1_pred = 0
        try:
            cf2_pred = self.svd_mf.predict(user_id, movie_id).est
        except Exception:
            cf2_pred = 0
        cf_score = (cf1_pred + cf2_pred) / 2.0
        return alpha * cf_score + (1 - alpha) * cb_score
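    # Illustrative numbers: with alpha = 0.7, cf_score = 4.0 and cb_score = 0.6,
    # the blend is 0.7 * 4.0 + 0.3 * 0.6 = 2.98. Because cb_score is a cosine
    # similarity in [-1, 1] while cf_score sits on the 1-5 rating scale, tuning
    # against rating RMSE tends to push the best alpha towards 1; rescaling
    # cb_score to the rating range first would give a better-balanced blend.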
    def tune_hybrid_alpha(self, alphas=None):
        if alphas is None:
            alphas = np.round(np.arange(0, 1.01, 0.1), 2)  # 11-point grid over [0, 1]
        testset_df = pd.DataFrame(self.testset, columns=['userId', 'movieId', 'rating'])
        # Rebuild user profiles from the trainset only, so that the held-out
        # test ratings do not leak into the content-based side of the hybrid.
        train_ratings_df = pd.DataFrame(self.trainset.all_ratings(), columns=['uid', 'iid', 'rating'])
        train_ratings_df['userId'] = train_ratings_df['uid'].apply(lambda x: self.trainset.to_raw_uid(x))
        train_ratings_df['movieId'] = train_ratings_df['iid'].apply(lambda x: self.trainset.to_raw_iid(x))
        train_ratings_df = train_ratings_df[['userId', 'movieId', 'rating']]
        self.fit_content_based(train_ratings_df)
        rmse_scores = {}
        for alpha in alphas:
            preds, actuals = [], []
            for _, row in testset_df.iterrows():
                pred = self.hybrid_prediction(int(row['userId']), int(row['movieId']), alpha)
                preds.append(pred)
                actuals.append(row['rating'])
            rmse = np.sqrt(mean_squared_error(actuals, preds))
            rmse_scores[alpha] = rmse
        self.best_alpha = min(rmse_scores, key=rmse_scores.get)
        return rmse_scores, self.best_alpha
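    # After tuning, self.user_profiles reflects the trainset only; call
    # fit_content_based() again (full data) before serving live recommendations.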
    def fit_svd_gridsearch(self, param_grid=None):
        if param_grid is None:
            param_grid = {
                'n_factors': [50, 100, 150],
                'lr_all': [0.002, 0.005, 0.01],
                'reg_all': [0.02, 0.05, 0.1]
            }
        gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
        gs.fit(self.data)
        # Refit the best configuration on all available ratings, with a fixed
        # seed for reproducibility (consistent with the rest of the class).
        self.svd_mf_tuned = SVD(random_state=42, **gs.best_params['rmse'])
        self.svd_mf_tuned.fit(self.data.build_full_trainset())
        return gs.best_score['rmse'], gs.best_params['rmse']
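    # The default grid is 3 x 3 x 3 = 27 candidates, each fit 3 times (cv=3),
    # i.e. 81 SVD fits in total; trim the grid on large datasets if that is
    # too slow.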
    def evaluate_hybrid(self):
        if self.best_alpha is None:
            raise ValueError("Call tune_hybrid_alpha() first.")
        testset_df = pd.DataFrame(self.testset, columns=['userId', 'movieId', 'rating'])
        preds, actuals = [], []
        for _, row in testset_df.iterrows():
            pred = self.hybrid_prediction(int(row['userId']), int(row['movieId']), self.best_alpha)
            preds.append(pred)
            actuals.append(row['rating'])
        rmse = np.sqrt(mean_squared_error(actuals, preds))
        return rmse
    def save_models(self, prefix="recommender"):
        # Save collaborative models
        joblib.dump(self.knn_user_based, f"{self.model_dir}/{prefix}_knn_user_based.pkl")
        joblib.dump(self.svd_mf, f"{self.model_dir}/{prefix}_svd_mf.pkl")
        if self.svd_mf_tuned is not None:
            joblib.dump(self.svd_mf_tuned, f"{self.model_dir}/{prefix}_svd_mf_tuned.pkl")
        # Save user profiles and other numpy/pandas objects
        joblib.dump(self.user_profiles, f"{self.model_dir}/{prefix}_user_profiles.pkl")
        joblib.dump(self.popular_movies_unique, f"{self.model_dir}/{prefix}_popular_movies_unique.pkl")
        joblib.dump(self.unique_movies_reduced, f"{self.model_dir}/{prefix}_unique_movies_reduced.pkl")
        joblib.dump(self.merged_df_with_tfidf, f"{self.model_dir}/{prefix}_merged_df_with_tfidf.pkl")
        print(f"Models and data saved to {self.model_dir}/")
    def load_models(self, prefix="recommender"):
        # Load collaborative models
        self.knn_user_based = joblib.load(f"{self.model_dir}/{prefix}_knn_user_based.pkl")
        self.svd_mf = joblib.load(f"{self.model_dir}/{prefix}_svd_mf.pkl")
        try:
            self.svd_mf_tuned = joblib.load(f"{self.model_dir}/{prefix}_svd_mf_tuned.pkl")
        except FileNotFoundError:
            # The tuned model is optional; it only exists after fit_svd_gridsearch().
            self.svd_mf_tuned = None
        self.user_profiles = joblib.load(f"{self.model_dir}/{prefix}_user_profiles.pkl")
        self.popular_movies_unique = joblib.load(f"{self.model_dir}/{prefix}_popular_movies_unique.pkl")
        self.unique_movies_reduced = joblib.load(f"{self.model_dir}/{prefix}_unique_movies_reduced.pkl")
        self.merged_df_with_tfidf = joblib.load(f"{self.model_dir}/{prefix}_merged_df_with_tfidf.pkl")
        print(f"Models and data loaded from {self.model_dir}/")
# Example usage:
# models = RecommenderModels(merged_df_with_tfidf, unique_movies_reduced, ratings_df)
# models.fit_popularity()
# models.fit_content_based()
# models.fit_cf()
# print(models.evaluate_cf())
# rmse_scores, best_alpha = models.tune_hybrid_alpha()
# print("Best alpha:", best_alpha)
# print("Hybrid RMSE:", models.evaluate_hybrid()) |