# Final_ML_Project / src / modeling.py
# (Hugging Face hub residue: uploaded by Bardi-ya, "Upload 51 files", commit c296592, verified)
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, KNNBasic, SVD, accuracy
from surprise.model_selection import train_test_split, GridSearchCV
class RecommenderModels:
def __init__(self, merged_df_with_tfidf, unique_movies_reduced, ratings_df):
self.merged_df_with_tfidf = merged_df_with_tfidf
self.unique_movies_reduced = unique_movies_reduced
self.ratings_df = ratings_df
self.popular_movies_unique = None
self.user_profiles = None
self.knn_user_based = None
self.svd_mf = None
self.svd_mf_tuned = None
self.best_alpha = None
self.model_dir = "models"
import os
os.makedirs(self.model_dir, exist_ok=True)
# ---------- Popularity Baseline ----------
def fit_popularity(self):
C = self.unique_movies_reduced['vote_average'].mean()
m = self.unique_movies_reduced['vote_count'].quantile(0.90)
qualified = self.unique_movies_reduced[self.unique_movies_reduced['vote_count'] >= m].copy()
def weighted_rating(x):
v, R = x['vote_count'], x['vote_average']
return (v / (v + m) * R) + (m / (v + m) * C)
qualified['weighted_rating'] = qualified.apply(weighted_rating, axis=1)
popular = qualified.sort_values('weighted_rating', ascending=False)
self.popular_movies_unique = popular.groupby('movieId').first().reset_index()
# ---------- Content-Based ----------
def fit_content_based(self):
movie_id_to_index = pd.Series(self.unique_movies_reduced.index, index=self.unique_movies_reduced['movieId']).to_dict()
svd_features = self.unique_movies_reduced.filter(like='svd_')
self.user_profiles = {}
for user_id in self.unique_movies_reduced['userId'].unique():
user_ratings = self.unique_movies_reduced[self.unique_movies_reduced['userId'] == user_id][['movieId', 'rating']]
profile = np.zeros(svd_features.shape[1])
total_weight = 0
for _, row in user_ratings.iterrows():
idx = movie_id_to_index.get(int(row['movieId']))
if idx is not None:
profile += svd_features.loc[idx].values * row['rating']
total_weight += row['rating']
if total_weight > 0:
profile /= total_weight
self.user_profiles[user_id] = profile
def get_content_based_recommendations(self, user_id, top_n=10):
if self.user_profiles is None:
raise ValueError("Call fit_content_based() first.")
if user_id not in self.user_profiles or np.all(self.user_profiles[user_id] == 0):
if self.popular_movies_unique is not None:
return self.popular_movies_unique[['title', 'vote_count', 'vote_average', 'weighted_rating']].head(top_n)
return pd.DataFrame()
user_profile = self.user_profiles[user_id]
svd_features = self.unique_movies_reduced.filter(like='svd_')
sim_scores = cosine_similarity(user_profile.reshape(1, -1), svd_features)[0]
rated_ids = self.merged_df_with_tfidf[self.merged_df_with_tfidf['userId'] == user_id]['movieId'].tolist()
indices = [i for i, row in self.unique_movies_reduced.iterrows() if row['movieId'] not in rated_ids]
top_indices = np.argsort(sim_scores[indices])[::-1][:top_n]
recs = self.unique_movies_reduced.iloc[[indices[i] for i in top_indices]][['title', 'vote_average', 'vote_count']]
return recs.reset_index(drop=True)
def get_content_based_score(self, user_id, movie_id):
if self.user_profiles is None:
raise ValueError("Call fit_content_based() first.")
if user_id not in self.user_profiles or np.all(self.user_profiles[user_id] == 0):
return 0.0
user_profile = self.user_profiles[user_id]
idx = self.unique_movies_reduced[self.unique_movies_reduced['movieId'] == movie_id].index
if idx.empty:
return 0.0
movie_features = self.unique_movies_reduced.loc[idx].filter(like='svd_').values
return cosine_similarity(user_profile.reshape(1, -1), movie_features.reshape(1, -1))[0][0]
# ---------- Collaborative Filtering ----------
def fit_cf(self):
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(self.ratings_df[['userId', 'movieId', 'rating']], reader)
self.data = data
self.trainset, self.testset = train_test_split(data, test_size=0.2, random_state=42)
self.knn_user_based = KNNBasic(sim_options={'user_based': True, 'similarity': 'cosine'}, k=40)
self.knn_user_based.fit(self.trainset)
self.svd_mf = SVD(random_state=42)
self.svd_mf.fit(self.trainset)
def evaluate_cf(self):
preds_knn = self.knn_user_based.test(self.testset)
preds_svd = self.svd_mf.test(self.testset)
rmse_knn = accuracy.rmse(preds_knn)
rmse_svd = accuracy.rmse(preds_svd)
return rmse_knn, rmse_svd
# ---------- Hybrid Model ----------
def hybrid_prediction(self, user_id, movie_id, alpha):
cb_score = self.get_content_based_score(user_id, movie_id)
try:
cf1_pred = self.knn_user_based.predict(str(user_id), str(movie_id)).est
except Exception:
cf1_pred = 0
try:
cf2_pred = self.svd_mf.predict(str(user_id), str(movie_id)).est
except Exception:
cf2_pred = 0
cf_score = (cf1_pred + cf2_pred) / 2.0
return alpha * cf_score + (1 - alpha) * cb_score
def tune_hybrid_alpha(self, alphas=None):
if alphas is None:
alphas = np.arange(0, 1.01, 0.5)
testset_df = pd.DataFrame(self.testset, columns=['userId', 'movieId', 'rating'])
# Recreate user profiles from trainset
train_ratings_df = pd.DataFrame(self.trainset.all_ratings(), columns=['uid', 'iid', 'rating'])
train_ratings_df['userId'] = train_ratings_df['uid'].apply(lambda x: self.trainset.to_raw_uid(x))
train_ratings_df['movieId'] = train_ratings_df['iid'].apply(lambda x: self.trainset.to_raw_iid(x))
train_ratings_df = train_ratings_df[['userId', 'movieId', 'rating']]
self.fit_content_based() # Ensure user_profiles is up to date
rmse_scores = {}
for alpha in alphas:
preds, actuals = [], []
for _, row in testset_df.iterrows():
pred = self.hybrid_prediction(int(row['userId']), int(row['movieId']), alpha)
preds.append(pred)
actuals.append(row['rating'])
rmse = np.sqrt(mean_squared_error(actuals, preds))
rmse_scores[alpha] = rmse
self.best_alpha = min(rmse_scores, key=rmse_scores.get)
return rmse_scores, self.best_alpha
def fit_svd_gridsearch(self, param_grid=None):
if param_grid is None:
param_grid = {
'n_factors': [50, 100, 150],
'lr_all': [0.002, 0.005, 0.01],
'reg_all': [0.02, 0.05, 0.1]
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(self.data)
self.svd_mf_tuned = SVD(**gs.best_params['rmse'])
self.svd_mf_tuned.fit(self.data.build_full_trainset())
return gs.best_score['rmse'], gs.best_params['rmse']
def evaluate_hybrid(self):
testset_df = pd.DataFrame(self.testset, columns=['userId', 'movieId', 'rating'])
preds, actuals = [], []
for _, row in testset_df.iterrows():
pred = self.hybrid_prediction(int(row['userId']), int(row['movieId']), self.best_alpha)
preds.append(pred)
actuals.append(row['rating'])
rmse = np.sqrt(mean_squared_error(actuals, preds))
return rmse
def save_models(self, prefix="recommender"):
# Save collaborative models
joblib.dump(self.knn_user_based, f"{self.model_dir}/{prefix}_knn_user_based.pkl")
joblib.dump(self.svd_mf, f"{self.model_dir}/{prefix}_svd_mf.pkl")
if self.svd_mf_tuned is not None:
joblib.dump(self.svd_mf_tuned, f"{self.model_dir}/{prefix}_svd_mf_tuned.pkl")
# Save user profiles and other numpy/pandas objects
joblib.dump(self.user_profiles, f"{self.model_dir}/{prefix}_user_profiles.pkl")
joblib.dump(self.popular_movies_unique, f"{self.model_dir}/{prefix}_popular_movies_unique.pkl")
joblib.dump(self.unique_movies_reduced, f"{self.model_dir}/{prefix}_unique_movies_reduced.pkl")
joblib.dump(self.merged_df_with_tfidf, f"{self.model_dir}/{prefix}_merged_df_with_tfidf.pkl")
print(f"Models and data saved to {self.model_dir}/")
def load_models(self, prefix="recommender"):
# Load collaborative models
self.knn_user_based = joblib.load(f"{self.model_dir}/{prefix}_knn_user_based.pkl")
self.svd_mf = joblib.load(f"{self.model_dir}/{prefix}_svd_mf.pkl")
try:
self.svd_mf_tuned = joblib.load(f"{self.model_dir}/{prefix}_svd_mf_tuned.pkl")
except Exception:
self.svd_mf_tuned = None
self.user_profiles = joblib.load(f"{self.model_dir}/{prefix}_user_profiles.pkl")
self.popular_movies_unique = joblib.load(f"{self.model_dir}/{prefix}_popular_movies_unique.pkl")
self.unique_movies_reduced = joblib.load(f"{self.model_dir}/{prefix}_unique_movies_reduced.pkl")
self.merged_df_with_tfidf = joblib.load(f"{self.model_dir}/{prefix}_merged_df_with_tfidf.pkl")
print(f"Models and data loaded from {self.model_dir}/")
# Example usage:
# models = RecommenderModels(merged_df_with_tfidf, unique_movies_reduced, ratings_df)
# models.fit_popularity()
# models.fit_content_based()
# models.fit_cf()
# print(models.evaluate_cf())
# rmse_scores, best_alpha = models.tune_hybrid_alpha()
# print("Best alpha:", best_alpha)
# print("Hybrid RMSE:", models.evaluate_hybrid())