File size: 10,261 Bytes
c296592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, KNNBasic, SVD, accuracy
from surprise.model_selection import train_test_split, GridSearchCV


class RecommenderModels:
    """Popularity, content-based, collaborative-filtering and hybrid movie
    recommenders, with persistence helpers.

    Parameters
    ----------
    merged_df_with_tfidf : pd.DataFrame
        Per-(user, movie) rows; must carry at least 'userId' and 'movieId'.
    unique_movies_reduced : pd.DataFrame
        Movie rows with 'movieId', 'title', 'vote_average', 'vote_count'
        and SVD-reduced feature columns prefixed 'svd_'.
        NOTE(review): fit_content_based also reads 'userId'/'rating' from
        this frame, so it appears to hold per-user rows too — confirm.
    ratings_df : pd.DataFrame
        Explicit ratings with 'userId', 'movieId', 'rating' on a 1-5 scale.
    """

    def __init__(self, merged_df_with_tfidf, unique_movies_reduced, ratings_df):
        self.merged_df_with_tfidf = merged_df_with_tfidf
        self.unique_movies_reduced = unique_movies_reduced
        self.ratings_df = ratings_df
        self.popular_movies_unique = None  # set by fit_popularity()
        self.user_profiles = None          # set by fit_content_based()
        self.knn_user_based = None         # set by fit_cf()
        self.svd_mf = None                 # set by fit_cf()
        self.svd_mf_tuned = None           # set by fit_svd_gridsearch()
        self.best_alpha = None             # set by tune_hybrid_alpha()
        self.model_dir = "models"
        os.makedirs(self.model_dir, exist_ok=True)

    # ---------- Popularity Baseline ----------
    def fit_popularity(self):
        """Rank movies with the IMDB weighted-rating formula and store one
        row per movie, sorted most-popular first, in popular_movies_unique."""
        movies = self.unique_movies_reduced
        C = movies['vote_average'].mean()        # catalogue-wide mean vote
        m = movies['vote_count'].quantile(0.90)  # minimum votes to qualify
        qualified = movies[movies['vote_count'] >= m].copy()
        v = qualified['vote_count']
        R = qualified['vote_average']
        # Bayesian shrinkage of each movie's average toward the global mean C,
        # vectorized instead of a per-row apply().
        qualified['weighted_rating'] = (v / (v + m)) * R + (m / (v + m)) * C
        ranked = qualified.sort_values('weighted_rating', ascending=False)
        # BUG FIX: groupby('movieId').first() re-sorted the frame by movieId,
        # so head(top_n) on the result returned the lowest ids, not the most
        # popular titles. drop_duplicates keeps the best row per movie (the
        # frame is already sorted) and preserves the popularity ordering.
        self.popular_movies_unique = (
            ranked.drop_duplicates('movieId').reset_index(drop=True)
        )

    # ---------- Content-Based ----------
    def fit_content_based(self):
        """Build one profile vector per user: the rating-weighted mean of the
        SVD feature vectors of the movies that user rated."""
        movies = self.unique_movies_reduced
        movie_id_to_index = pd.Series(movies.index, index=movies['movieId']).to_dict()
        svd_features = movies.filter(like='svd_')
        n_features = svd_features.shape[1]
        self.user_profiles = {}
        for user_id in movies['userId'].unique():
            user_ratings = movies.loc[movies['userId'] == user_id, ['movieId', 'rating']]
            profile = np.zeros(n_features)
            total_weight = 0.0
            for _, row in user_ratings.iterrows():
                idx = movie_id_to_index.get(int(row['movieId']))
                if idx is not None:
                    profile += svd_features.loc[idx].values * row['rating']
                    total_weight += row['rating']
            if total_weight > 0:
                profile /= total_weight  # rating-weighted average
            self.user_profiles[user_id] = profile

    def get_content_based_recommendations(self, user_id, top_n=10):
        """Return up to top_n unseen movies most similar to the user profile.

        Unknown or all-zero-profile users fall back to the popularity
        baseline (requires fit_popularity()); otherwise an empty frame.

        Raises
        ------
        ValueError
            If fit_content_based() has not been called.
        """
        if self.user_profiles is None:
            raise ValueError("Call fit_content_based() first.")
        profile = self.user_profiles.get(user_id)
        if profile is None or np.all(profile == 0):
            # Cold-start fallback: most popular movies, if available.
            if self.popular_movies_unique is not None:
                cols = ['title', 'vote_count', 'vote_average', 'weighted_rating']
                return self.popular_movies_unique[cols].head(top_n)
            return pd.DataFrame()
        movies = self.unique_movies_reduced
        svd_features = movies.filter(like='svd_')
        sim_scores = cosine_similarity(profile.reshape(1, -1), svd_features)[0]
        # Set for O(1) membership instead of a list scanned per row.
        rated_ids = set(
            self.merged_df_with_tfidf.loc[
                self.merged_df_with_tfidf['userId'] == user_id, 'movieId'
            ]
        )
        # BUG FIX: the old code collected DataFrame index *labels* via
        # iterrows() and used them as *positions* into sim_scores and .iloc,
        # which is wrong (or raises) unless the index is a 0-based RangeIndex.
        # Positional indices keep sim_scores and .iloc aligned regardless.
        candidate_pos = np.flatnonzero(~movies['movieId'].isin(rated_ids).to_numpy())
        order = np.argsort(sim_scores[candidate_pos])[::-1][:top_n]
        recs = movies.iloc[candidate_pos[order]][['title', 'vote_average', 'vote_count']]
        return recs.reset_index(drop=True)

    def get_content_based_score(self, user_id, movie_id):
        """Cosine similarity between a user's profile and one movie's SVD
        features; 0.0 for unknown users, empty profiles or unknown movies."""
        if self.user_profiles is None:
            raise ValueError("Call fit_content_based() first.")
        profile = self.user_profiles.get(user_id)
        if profile is None or np.all(profile == 0):
            return 0.0
        matches = self.unique_movies_reduced.index[
            self.unique_movies_reduced['movieId'] == movie_id
        ]
        if matches.empty:
            return 0.0
        # Use only the first matching row: the old blanket reshape(1, -1)
        # raised if several rows shared a movieId.
        movie_features = (
            self.unique_movies_reduced.loc[[matches[0]]].filter(like='svd_').values
        )
        return cosine_similarity(
            profile.reshape(1, -1), movie_features.reshape(1, -1)
        )[0][0]

    # ---------- Collaborative Filtering ----------
    def fit_cf(self):
        """Fit user-based KNN and SVD matrix factorization on an 80/20 split
        of ratings_df; stores data, trainset and testset on the instance."""
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(self.ratings_df[['userId', 'movieId', 'rating']], reader)
        self.data = data
        self.trainset, self.testset = train_test_split(data, test_size=0.2, random_state=42)
        # BUG FIX: surprise takes the similarity metric under the 'name' key;
        # the old 'similarity' key was silently ignored, so the model was
        # actually using the default MSD metric instead of cosine.
        self.knn_user_based = KNNBasic(k=40, sim_options={'name': 'cosine', 'user_based': True})
        self.knn_user_based.fit(self.trainset)
        self.svd_mf = SVD(random_state=42)
        self.svd_mf.fit(self.trainset)

    def evaluate_cf(self):
        """Return (knn_rmse, svd_rmse) on the held-out testset."""
        preds_knn = self.knn_user_based.test(self.testset)
        preds_svd = self.svd_mf.test(self.testset)
        return accuracy.rmse(preds_knn), accuracy.rmse(preds_svd)

    # ---------- Hybrid Model ----------
    def hybrid_prediction(self, user_id, movie_id, alpha):
        """Blend collaborative and content-based scores:
        alpha * mean(KNN, SVD) + (1 - alpha) * content similarity."""
        cb_score = self.get_content_based_score(user_id, movie_id)
        # BUG FIX: pass the raw ids unchanged. Dataset.load_from_df keeps the
        # DataFrame's (integer) ids as raw ids, so the old str(...) casts made
        # every lookup an unknown user/item and predict() silently fell back
        # to the global mean for every pair.
        try:
            knn_est = self.knn_user_based.predict(user_id, movie_id).est
        except Exception:
            knn_est = 0  # best-effort: treat a failed prediction as neutral
        try:
            svd_est = self.svd_mf.predict(user_id, movie_id).est
        except Exception:
            svd_est = 0
        cf_score = (knn_est + svd_est) / 2.0
        return alpha * cf_score + (1 - alpha) * cb_score

    def tune_hybrid_alpha(self, alphas=None):
        """Grid-search the blend weight alpha on the held-out testset.

        Returns
        -------
        (dict, float)
            Mapping alpha -> RMSE, and the alpha with the lowest RMSE
            (also stored as self.best_alpha).
        """
        if alphas is None:
            alphas = np.arange(0, 1.01, 0.5)  # coarse default grid: 0, 0.5, 1
        testset_df = pd.DataFrame(self.testset, columns=['userId', 'movieId', 'rating'])
        self.fit_content_based()  # ensure user profiles exist before scoring
        # NOTE(review): profiles are rebuilt from the full data rather than
        # only the trainset, so test ratings leak into the content-based side
        # of the hybrid score — confirm this is intended. (A dead block that
        # reconstructed train-only ratings but never used them was removed.)
        rmse_scores = {}
        for alpha in alphas:
            preds = [
                self.hybrid_prediction(int(row['userId']), int(row['movieId']), alpha)
                for _, row in testset_df.iterrows()
            ]
            rmse_scores[alpha] = np.sqrt(
                mean_squared_error(testset_df['rating'], preds)
            )
        self.best_alpha = min(rmse_scores, key=rmse_scores.get)
        return rmse_scores, self.best_alpha

    def fit_svd_gridsearch(self, param_grid=None):
        """3-fold grid search over SVD hyper-parameters; refit the best model
        on the full trainset. Returns (best_rmse, best_params)."""
        if param_grid is None:
            param_grid = {
                'n_factors': [50, 100, 150],
                'lr_all': [0.002, 0.005, 0.01],
                'reg_all': [0.02, 0.05, 0.1],
            }
        gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
        gs.fit(self.data)
        self.svd_mf_tuned = SVD(**gs.best_params['rmse'])
        self.svd_mf_tuned.fit(self.data.build_full_trainset())
        return gs.best_score['rmse'], gs.best_params['rmse']

    def evaluate_hybrid(self):
        """RMSE of the hybrid predictor on the testset at the tuned alpha.

        Raises
        ------
        ValueError
            If tune_hybrid_alpha() has not been called (best_alpha unset) —
            previously this silently computed garbage with alpha=None.
        """
        if self.best_alpha is None:
            raise ValueError("Call tune_hybrid_alpha() first.")
        testset_df = pd.DataFrame(self.testset, columns=['userId', 'movieId', 'rating'])
        preds = [
            self.hybrid_prediction(int(row['userId']), int(row['movieId']), self.best_alpha)
            for _, row in testset_df.iterrows()
        ]
        return np.sqrt(mean_squared_error(testset_df['rating'], preds))

    def save_models(self, prefix="recommender"):
        """Persist fitted models and the supporting DataFrames under
        model_dir, each file named '<prefix>_<object>.pkl'."""
        joblib.dump(self.knn_user_based, f"{self.model_dir}/{prefix}_knn_user_based.pkl")
        joblib.dump(self.svd_mf, f"{self.model_dir}/{prefix}_svd_mf.pkl")
        if self.svd_mf_tuned is not None:  # tuned model is optional
            joblib.dump(self.svd_mf_tuned, f"{self.model_dir}/{prefix}_svd_mf_tuned.pkl")
        joblib.dump(self.user_profiles, f"{self.model_dir}/{prefix}_user_profiles.pkl")
        joblib.dump(self.popular_movies_unique, f"{self.model_dir}/{prefix}_popular_movies_unique.pkl")
        joblib.dump(self.unique_movies_reduced, f"{self.model_dir}/{prefix}_unique_movies_reduced.pkl")
        joblib.dump(self.merged_df_with_tfidf, f"{self.model_dir}/{prefix}_merged_df_with_tfidf.pkl")
        print(f"Models and data saved to {self.model_dir}/")

    def load_models(self, prefix="recommender"):
        """Restore everything written by save_models(). The tuned SVD is
        optional: a missing/unreadable file leaves svd_mf_tuned as None."""
        self.knn_user_based = joblib.load(f"{self.model_dir}/{prefix}_knn_user_based.pkl")
        self.svd_mf = joblib.load(f"{self.model_dir}/{prefix}_svd_mf.pkl")
        try:
            self.svd_mf_tuned = joblib.load(f"{self.model_dir}/{prefix}_svd_mf_tuned.pkl")
        except Exception:
            self.svd_mf_tuned = None  # deliberate best-effort: tuned model optional
        self.user_profiles = joblib.load(f"{self.model_dir}/{prefix}_user_profiles.pkl")
        self.popular_movies_unique = joblib.load(f"{self.model_dir}/{prefix}_popular_movies_unique.pkl")
        self.unique_movies_reduced = joblib.load(f"{self.model_dir}/{prefix}_unique_movies_reduced.pkl")
        self.merged_df_with_tfidf = joblib.load(f"{self.model_dir}/{prefix}_merged_df_with_tfidf.pkl")
        print(f"Models and data loaded from {self.model_dir}/")

# Example usage:
# models = RecommenderModels(merged_df_with_tfidf, unique_movies_reduced, ratings_df)
# models.fit_popularity()
# models.fit_content_based()
# models.fit_cf()
# print(models.evaluate_cf())
# rmse_scores, best_alpha = models.tune_hybrid_alpha()
# print("Best alpha:", best_alpha)
# print("Hybrid RMSE:", models.evaluate_hybrid())