Spaces:
Sleeping
Sleeping
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, KNNBasic, SVD, accuracy
from surprise.model_selection import train_test_split, GridSearchCV
class RecommenderModels:
    """Popularity, content-based, collaborative-filtering and hybrid movie recommenders.

    Parameters
    ----------
    merged_df_with_tfidf : pd.DataFrame
        User-movie interaction rows joined with content features; must contain
        'userId' and 'movieId' columns (used to exclude already-rated movies).
    unique_movies_reduced : pd.DataFrame
        Movie table with 'movieId', 'title', 'vote_average', 'vote_count' and
        latent content columns named 'svd_*'.
        NOTE(review): fit_content_based also reads 'userId'/'rating' from this
        frame, which implies it actually holds interaction rows rather than one
        row per movie — confirm against the data-prep pipeline.
    ratings_df : pd.DataFrame
        Raw ratings with 'userId', 'movieId', 'rating' columns on a 1-5 scale.
    """

    def __init__(self, merged_df_with_tfidf, unique_movies_reduced, ratings_df):
        self.merged_df_with_tfidf = merged_df_with_tfidf
        self.unique_movies_reduced = unique_movies_reduced
        self.ratings_df = ratings_df
        self.popular_movies_unique = None  # set by fit_popularity()
        self.user_profiles = None          # set by fit_content_based()
        self.knn_user_based = None         # set by fit_cf()
        self.svd_mf = None                 # set by fit_cf()
        self.svd_mf_tuned = None           # set by fit_svd_gridsearch()
        self.best_alpha = None             # set by tune_hybrid_alpha()
        self.model_dir = "models"
        os.makedirs(self.model_dir, exist_ok=True)

    # ---------- Popularity Baseline ----------
    def fit_popularity(self):
        """Rank movies by the IMDB weighted rating WR = v/(v+m)*R + m/(v+m)*C.

        Only movies whose vote_count is at or above the 90th percentile (m)
        qualify; C is the mean vote_average over all movies. Stores one row
        per movieId in ``self.popular_movies_unique``, best first.
        """
        C = self.unique_movies_reduced['vote_average'].mean()
        m = self.unique_movies_reduced['vote_count'].quantile(0.90)
        qualified = self.unique_movies_reduced[
            self.unique_movies_reduced['vote_count'] >= m
        ].copy()
        # Vectorized weighted rating (replaces a row-wise .apply()).
        v = qualified['vote_count']
        R = qualified['vote_average']
        qualified['weighted_rating'] = (v / (v + m)) * R + (m / (v + m)) * C
        popular = qualified.sort_values('weighted_rating', ascending=False)
        # Collapse duplicate movieIds, keeping the top-ranked row of each.
        self.popular_movies_unique = popular.groupby('movieId').first().reset_index()

    # ---------- Content-Based ----------
    def fit_content_based(self):
        """Build one latent-content profile per user.

        A user's profile is the rating-weighted average of the 'svd_*' feature
        vectors of the movies they rated. Users whose ratings reference no
        known movie keep an all-zero profile (treated as cold-start later).
        """
        movie_id_to_index = pd.Series(
            self.unique_movies_reduced.index,
            index=self.unique_movies_reduced['movieId'],
        ).to_dict()
        svd_features = self.unique_movies_reduced.filter(like='svd_')
        self.user_profiles = {}
        for user_id in self.unique_movies_reduced['userId'].unique():
            user_ratings = self.unique_movies_reduced[
                self.unique_movies_reduced['userId'] == user_id
            ][['movieId', 'rating']]
            profile = np.zeros(svd_features.shape[1])
            total_weight = 0
            for _, row in user_ratings.iterrows():
                idx = movie_id_to_index.get(int(row['movieId']))
                if idx is not None:
                    profile += svd_features.loc[idx].values * row['rating']
                    total_weight += row['rating']
            if total_weight > 0:
                profile /= total_weight
            self.user_profiles[user_id] = profile

    def get_content_based_recommendations(self, user_id, top_n=10):
        """Return the top_n unseen movies most similar to the user's profile.

        Falls back to the popularity list for unknown users or all-zero
        profiles; returns an empty DataFrame if no fallback is fitted.

        Raises
        ------
        ValueError
            If fit_content_based() has not been called.
        """
        if self.user_profiles is None:
            raise ValueError("Call fit_content_based() first.")
        # Cold-start fallback: unknown user or empty profile -> popularity list.
        if user_id not in self.user_profiles or np.all(self.user_profiles[user_id] == 0):
            if self.popular_movies_unique is not None:
                return self.popular_movies_unique[
                    ['title', 'vote_count', 'vote_average', 'weighted_rating']
                ].head(top_n)
            return pd.DataFrame()
        user_profile = self.user_profiles[user_id]
        svd_features = self.unique_movies_reduced.filter(like='svd_')
        sim_scores = cosine_similarity(user_profile.reshape(1, -1), svd_features)[0]
        rated_ids = set(
            self.merged_df_with_tfidf[
                self.merged_df_with_tfidf['userId'] == user_id
            ]['movieId']
        )
        # Positional indices of unseen movies. The previous version indexed
        # sim_scores (positional) with DataFrame index *labels*, which silently
        # misaligns unless the frame has a clean RangeIndex.
        unseen_pos = np.flatnonzero(
            ~self.unique_movies_reduced['movieId'].isin(rated_ids).to_numpy()
        )
        top_pos = unseen_pos[np.argsort(sim_scores[unseen_pos])[::-1][:top_n]]
        recs = self.unique_movies_reduced.iloc[top_pos][
            ['title', 'vote_average', 'vote_count']
        ]
        return recs.reset_index(drop=True)

    def get_content_based_score(self, user_id, movie_id):
        """Cosine similarity between the user's profile and one movie's features.

        Returns 0.0 for unknown users, all-zero profiles, or unknown movies.
        """
        if self.user_profiles is None:
            raise ValueError("Call fit_content_based() first.")
        if user_id not in self.user_profiles or np.all(self.user_profiles[user_id] == 0):
            return 0.0
        user_profile = self.user_profiles[user_id]
        idx = self.unique_movies_reduced[
            self.unique_movies_reduced['movieId'] == movie_id
        ].index
        if idx.empty:
            return 0.0
        # Take the first match so duplicate movieId rows can't break reshape.
        movie_features = self.unique_movies_reduced.loc[idx[0]].filter(like='svd_').values
        return cosine_similarity(
            user_profile.reshape(1, -1), movie_features.reshape(1, -1)
        )[0][0]

    # ---------- Collaborative Filtering ----------
    def fit_cf(self):
        """Fit user-based KNN and SVD matrix factorization on an 80/20 split."""
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(self.ratings_df[['userId', 'movieId', 'rating']], reader)
        self.data = data
        self.trainset, self.testset = train_test_split(data, test_size=0.2, random_state=42)
        # Surprise reads the similarity measure from sim_options['name'];
        # the previous 'similarity' key was ignored and MSD was used instead.
        self.knn_user_based = KNNBasic(
            k=40,
            sim_options={'name': 'cosine', 'user_based': True},
        )
        self.knn_user_based.fit(self.trainset)
        self.svd_mf = SVD(random_state=42)
        self.svd_mf.fit(self.trainset)

    def evaluate_cf(self):
        """Return (rmse_knn, rmse_svd) on the held-out test set."""
        preds_knn = self.knn_user_based.test(self.testset)
        preds_svd = self.svd_mf.test(self.testset)
        rmse_knn = accuracy.rmse(preds_knn)
        rmse_svd = accuracy.rmse(preds_svd)
        return rmse_knn, rmse_svd

    # ---------- Hybrid Model ----------
    def hybrid_prediction(self, user_id, movie_id, alpha):
        """Blend CF and content scores: alpha * CF + (1 - alpha) * content.

        The CF score is the mean of the KNN and SVD estimates. Ids are passed
        to predict() unchanged: the trainset was built from integer ids, so the
        previous str() casts made every lookup miss and silently fall back to
        the global mean (was_impossible).
        """
        cb_score = self.get_content_based_score(user_id, movie_id)
        # predict() normally returns a fallback estimate rather than raising;
        # the guards only protect against an unfitted/broken model.
        try:
            cf1_pred = self.knn_user_based.predict(user_id, movie_id).est
        except Exception:
            cf1_pred = 0
        try:
            cf2_pred = self.svd_mf.predict(user_id, movie_id).est
        except Exception:
            cf2_pred = 0
        cf_score = (cf1_pred + cf2_pred) / 2.0
        return alpha * cf_score + (1 - alpha) * cb_score

    def tune_hybrid_alpha(self, alphas=None):
        """Grid-search the blend weight alpha against test-set RMSE.

        Returns (rmse_scores dict, best_alpha) and stores best_alpha.
        NOTE(review): profiles are refit on the full data, not just the
        trainset, so the reported RMSE has some train/test leakage — the
        original built (and never used) a trainset-only ratings frame,
        presumably intending to fix this.
        """
        if alphas is None:
            alphas = np.arange(0, 1.01, 0.5)
        testset_df = pd.DataFrame(self.testset, columns=['userId', 'movieId', 'rating'])
        self.fit_content_based()  # Ensure user_profiles is up to date
        rmse_scores = {}
        for alpha in alphas:
            preds, actuals = [], []
            for _, row in testset_df.iterrows():
                pred = self.hybrid_prediction(int(row['userId']), int(row['movieId']), alpha)
                preds.append(pred)
                actuals.append(row['rating'])
            rmse_scores[alpha] = np.sqrt(mean_squared_error(actuals, preds))
        self.best_alpha = min(rmse_scores, key=rmse_scores.get)
        return rmse_scores, self.best_alpha

    def fit_svd_gridsearch(self, param_grid=None):
        """3-fold grid search over SVD hyperparameters; refit best on all data.

        Returns (best_rmse, best_params).
        """
        if param_grid is None:
            param_grid = {
                'n_factors': [50, 100, 150],
                'lr_all': [0.002, 0.005, 0.01],
                'reg_all': [0.02, 0.05, 0.1],
            }
        gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
        gs.fit(self.data)
        self.svd_mf_tuned = SVD(**gs.best_params['rmse'])
        self.svd_mf_tuned.fit(self.data.build_full_trainset())
        return gs.best_score['rmse'], gs.best_params['rmse']

    def evaluate_hybrid(self):
        """Return hybrid RMSE on the test set using the tuned best_alpha."""
        testset_df = pd.DataFrame(self.testset, columns=['userId', 'movieId', 'rating'])
        preds, actuals = [], []
        for _, row in testset_df.iterrows():
            pred = self.hybrid_prediction(
                int(row['userId']), int(row['movieId']), self.best_alpha
            )
            preds.append(pred)
            actuals.append(row['rating'])
        return np.sqrt(mean_squared_error(actuals, preds))

    def save_models(self, prefix="recommender"):
        """Persist fitted models and supporting data under self.model_dir."""
        # Save collaborative models
        joblib.dump(self.knn_user_based, f"{self.model_dir}/{prefix}_knn_user_based.pkl")
        joblib.dump(self.svd_mf, f"{self.model_dir}/{prefix}_svd_mf.pkl")
        if self.svd_mf_tuned is not None:
            joblib.dump(self.svd_mf_tuned, f"{self.model_dir}/{prefix}_svd_mf_tuned.pkl")
        # Save user profiles and other numpy/pandas objects
        joblib.dump(self.user_profiles, f"{self.model_dir}/{prefix}_user_profiles.pkl")
        joblib.dump(self.popular_movies_unique, f"{self.model_dir}/{prefix}_popular_movies_unique.pkl")
        joblib.dump(self.unique_movies_reduced, f"{self.model_dir}/{prefix}_unique_movies_reduced.pkl")
        joblib.dump(self.merged_df_with_tfidf, f"{self.model_dir}/{prefix}_merged_df_with_tfidf.pkl")
        print(f"Models and data saved to {self.model_dir}/")

    def load_models(self, prefix="recommender"):
        """Load models and data saved by save_models(); tuned SVD is optional."""
        # Load collaborative models
        self.knn_user_based = joblib.load(f"{self.model_dir}/{prefix}_knn_user_based.pkl")
        self.svd_mf = joblib.load(f"{self.model_dir}/{prefix}_svd_mf.pkl")
        try:
            self.svd_mf_tuned = joblib.load(f"{self.model_dir}/{prefix}_svd_mf_tuned.pkl")
        except FileNotFoundError:
            # The tuned model is only saved when fit_svd_gridsearch() was run.
            self.svd_mf_tuned = None
        self.user_profiles = joblib.load(f"{self.model_dir}/{prefix}_user_profiles.pkl")
        self.popular_movies_unique = joblib.load(f"{self.model_dir}/{prefix}_popular_movies_unique.pkl")
        self.unique_movies_reduced = joblib.load(f"{self.model_dir}/{prefix}_unique_movies_reduced.pkl")
        self.merged_df_with_tfidf = joblib.load(f"{self.model_dir}/{prefix}_merged_df_with_tfidf.pkl")
        print(f"Models and data loaded from {self.model_dir}/")
| # Example usage: | |
| # models = RecommenderModels(merged_df_with_tfidf, unique_movies_reduced, ratings_df) | |
| # models.fit_popularity() | |
| # models.fit_content_based() | |
| # models.fit_cf() | |
| # print(models.evaluate_cf()) | |
| # rmse_scores, best_alpha = models.tune_hybrid_alpha() | |
| # print("Best alpha:", best_alpha) | |
| # print("Hybrid RMSE:", models.evaluate_hybrid()) |