LvMAC committed on
Commit
354236a
·
verified ·
1 Parent(s): 759cdc4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +339 -109
app.py CHANGED
@@ -1,137 +1,367 @@
1
- import pandas as pd
2
  import numpy as np
3
- from surprise import SVD, Dataset, Reader, accuracy
4
- from surprise.model_selection import train_test_split, cross_validate
5
- from collections import defaultdict
6
-
7
- class MovieRecommender:
8
- def __init__(self, ratings_path, movies_path):
9
- # Load data
10
- self.ratings = pd.read_csv(ratings_path)
11
- self.movies = pd.read_csv(movies_path)
12
-
13
- # Build Surprise dataset
14
- reader = Reader(rating_scale=(0.5, 5.0))
15
- self.data = Dataset.load_from_df(
16
- self.ratings[['userId', 'movieId', 'rating']],
17
- reader
18
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- # Train model
21
- self.trainset = self.data.build_full_trainset()
22
- self.algo = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)
23
- self.algo.fit(self.trainset)
24
-
25
- def recommend_movies(self, user_id, N):
26
- # Get all movie IDs
27
- all_movie_ids = self.movies['movieId'].unique()
28
-
29
- # Get movies user has already rated
30
- rated_movies = self.ratings[self.ratings['userId'] == user_id]['movieId'].values
31
-
32
- # Get unrated movies
33
- unrated_movies = [m for m in all_movie_ids if m not in rated_movies]
34
-
35
- # Predict ratings
36
- predictions = []
37
- for movie_id in unrated_movies:
38
- pred = self.algo.predict(user_id, movie_id)
39
- predictions.append((movie_id, pred.est))
40
-
41
- # Sort by predicted rating
42
- predictions.sort(key=lambda x: x[1], reverse=True)
43
-
44
- # Get top N
45
- top_n = predictions[:N]
46
-
47
- # Merge with movie titles
48
- results = []
49
- for movie_id, score in top_n:
50
- title = self.movies[self.movies['movieId'] == movie_id]['title'].values[0]
51
- results.append({
52
- 'movieId': movie_id,
53
- 'title': title,
54
- 'predicted_rating': round(score, 2)
55
- })
56
-
57
- return results
58
-
59
- def evaluate(self):
60
- # Cross-validation
61
- results = cross_validate(
62
- self.algo,
63
- self.data,
64
- measures=['RMSE', 'MAE'],
65
- cv=5,
66
- verbose=False
67
- )
68
 
69
- # Custom metrics: Precision@K, Recall@K, NDCG@K
70
- trainset, testset = train_test_split(self.data, test_size=0.2)
71
- self.algo.fit(trainset)
72
- predictions = self.algo.test(testset)
73
 
74
- # Calculate Precision@K and Recall@K
75
- k = 10
76
- threshold = 4.0
77
 
78
- user_est_true = defaultdict(list)
79
- for uid, _, true_r, est, _ in predictions:
80
- user_est_true[uid].append((est, true_r))
81
 
82
- precisions = []
83
- recalls = []
84
 
85
- for uid, user_ratings in user_est_true.items():
86
- user_ratings.sort(key=lambda x: x[0], reverse=True)
87
- top_k = user_ratings[:k]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- n_rel = sum(1 for (_, true_r) in user_ratings if true_r >= threshold)
90
- n_rec_k = sum(1 for (est, _) in top_k if est >= threshold)
91
- n_rel_and_rec_k = sum(1 for (est, true_r) in top_k
92
- if true_r >= threshold and est >= threshold)
 
 
 
 
 
93
 
94
- precisions.append(n_rel_and_rec_k / n_rec_k if n_rec_k > 0 else 0)
95
- recalls.append(n_rel_and_rec_k / n_rel if n_rel > 0 else 0)
96
 
97
- return {
98
- 'rmse': np.mean(results['test_rmse']),
99
- 'mae': np.mean(results['test_mae']),
100
- f'precision@{k}': np.mean(precisions),
101
- f'recall@{k}': np.mean(recalls)
102
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  import gradio as gr
 
 
106
 
107
- # Initialize recommender
108
- recommender = MovieRecommender('ratings.csv', 'movies.csv')
 
109
 
110
- def recommend_interface(user_id, n_recommendations):
 
 
 
 
 
 
111
  try:
112
  user_id = int(user_id)
113
- n_recommendations = int(n_recommendations)
114
 
115
- recommendations = recommender.recommend_movies(user_id, n_recommendations)
 
116
 
117
- output = []
118
- for i, rec in enumerate(recommendations, 1):
119
- output.append(f"{i}. {rec['title']} (Predicted: {rec['predicted_rating']})")
120
 
121
- return "\n".join(output)
 
 
 
 
 
 
 
122
  except Exception as e:
123
  return f"Error: {str(e)}"
124
 
125
- # Create interface
126
- demo = gr.Interface(
127
- fn=recommend_interface,
128
  inputs=[
129
- gr.Textbox(label="User ID", placeholder="Enter user ID"),
130
- gr.Slider(minimum=1, maximum=20, value=10, step=1, label="Number of Recommendations")
131
  ],
132
- outputs=gr.Textbox(label="Recommendations", lines=15),
133
  title="MovieLens Recommendation System",
134
- description="Enter a user ID to get personalized movie recommendations"
135
  )
136
 
137
- demo.launch()
 
 
1
  import numpy as np
2
+ import pandas as pd
3
+ from scipy.sparse.linalg import svds
4
+ from scipy.sparse import csr_matrix
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from sklearn.model_selection import train_test_split
7
+ import warnings
8
+ warnings.filterwarnings('ignore')
9
+
10
+ # ============================================================================
11
+ # DATA LOADING & PREPROCESSING
12
+ # ============================================================================
13
+
14
def load_movielens_data(ratings_path='ratings.csv', movies_path='movies.csv'):
    """Read the MovieLens ratings and movies CSV files.

    Returns a (ratings, movies) tuple of DataFrames, in that order.
    """
    return pd.read_csv(ratings_path), pd.read_csv(movies_path)
19
+
20
def create_user_item_matrix(ratings):
    """Pivot long-format ratings into a dense user x item matrix.

    Rows are userId, columns are movieId. Duplicate (user, item) pairs are
    averaged by pivot_table's default aggregation; missing ratings become 0.
    """
    pivot = ratings.pivot_table(index='userId', columns='movieId', values='rating')
    return pivot.fillna(0)
28
+
29
+ # ============================================================================
30
+ # COLLABORATIVE FILTERING - USER BASED
31
+ # ============================================================================
32
+
33
class UserBasedCF:
    """User-based collaborative filtering over a dense user-item matrix.

    Similarity between users is cosine similarity of their rating rows;
    a user's predicted rating for an item is the similarity-weighted
    average of the k most similar users' ratings for that item.
    """

    def __init__(self, user_item_matrix):
        # user_item_matrix: DataFrame indexed by userId, columns movieId,
        # with 0 meaning "not rated".
        self.matrix = user_item_matrix
        self.user_similarity = None

    def fit(self):
        """Compute the user-user cosine similarity matrix."""
        self.user_similarity = cosine_similarity(self.matrix)
        # A user must never count as their own neighbour.
        np.fill_diagonal(self.user_similarity, 0)

    def predict(self, user_id, k=50):
        """Predict ratings for every item `user_id` has not rated.

        Returns a Series indexed by movieId; already-rated items are set
        to 0 so they are never recommended. Unknown users get an empty
        Series. `k` is the number of neighbours used.
        """
        if user_id not in self.matrix.index:
            # Explicit dtype avoids pandas' deprecated object-dtype default
            # for empty Series.
            return pd.Series(dtype=float)

        user_idx = self.matrix.index.get_loc(user_id)
        # Indices of the k most similar users, most similar first.
        similar_users_idx = np.argsort(self.user_similarity[user_idx])[::-1][:k]

        similar_users_ratings = self.matrix.iloc[similar_users_idx]
        weights = self.user_similarity[user_idx][similar_users_idx]

        # Similarity-weighted average of the neighbours' ratings per item.
        weighted_sum = (similar_users_ratings.T * weights).sum(axis=1)
        weight_sum = np.abs(weights).sum()

        # Small epsilon guards against all-zero similarity (division by 0).
        predictions = weighted_sum / (weight_sum + 1e-10)
        user_ratings = self.matrix.loc[user_id]
        # Suppress items the user already rated.
        predictions[user_ratings > 0] = 0

        return predictions
62
+
63
+ # ============================================================================
64
+ # COLLABORATIVE FILTERING - ITEM BASED
65
+ # ============================================================================
66
+
67
class ItemBasedCF:
    """Item-based collaborative filtering over a dense user-item matrix.

    Similarity between items is cosine similarity of their rating columns;
    scores for unseen items accumulate similarity * the user's own rating
    of each rated neighbour item.
    """

    def __init__(self, user_item_matrix):
        # user_item_matrix: DataFrame indexed by userId, columns movieId,
        # with 0 meaning "not rated".
        self.matrix = user_item_matrix
        self.item_similarity = None

    def fit(self):
        """Compute the item-item cosine similarity matrix."""
        self.item_similarity = cosine_similarity(self.matrix.T)
        # An item must never count as its own neighbour.
        np.fill_diagonal(self.item_similarity, 0)

    def predict(self, user_id, k=50):
        """Score every unseen item for `user_id`.

        Returns a Series indexed by movieId; already-rated items are set
        to 0. Unknown users get an empty Series. `k` is the number of
        neighbour items considered per rated item.
        """
        if user_id not in self.matrix.index:
            # Explicit dtype avoids pandas' deprecated object-dtype default.
            return pd.Series(dtype=float)

        user_ratings = self.matrix.loc[user_id]
        rated_items = user_ratings[user_ratings > 0]

        # Float accumulator up front: starting from int 0 and adding floats
        # forces an upcast (and a pandas warning) on first assignment.
        predictions = pd.Series(0.0, index=self.matrix.columns)

        for item_id in rated_items.index:
            item_idx = self.matrix.columns.get_loc(item_id)
            # k most similar items to this rated item, most similar first.
            similar_items_idx = np.argsort(self.item_similarity[item_idx])[::-1][:k]

            for sim_idx in similar_items_idx:
                sim_item_id = self.matrix.columns[sim_idx]
                if user_ratings[sim_item_id] == 0:
                    predictions[sim_item_id] += (
                        self.item_similarity[item_idx][sim_idx] * rated_items[item_id]
                    )

        # Suppress items the user already rated.
        predictions[user_ratings > 0] = 0
        return predictions
100
+
101
+ # ============================================================================
102
+ # MATRIX FACTORIZATION - SVD
103
+ # ============================================================================
104
+
105
class SVDRecommender:
    """Matrix-factorization recommender using truncated SVD (scipy `svds`).

    The globally mean-centred rating matrix is factorized once in fit();
    predict() then just reads rows of the reconstructed dense matrix.
    """

    def __init__(self, user_item_matrix, n_factors=50):
        self.matrix = user_item_matrix
        self.n_factors = n_factors
        self.user_factors = None
        self.item_factors = None
        self.mean_rating = None

    def fit(self):
        """Perform the SVD decomposition and cache dense predictions."""
        # svds requires 1 <= k < min(matrix dims); clamp so small matrices
        # (e.g. tests or tiny datasets) do not crash.
        k = max(1, min(self.n_factors, min(self.matrix.shape) - 1))
        matrix_centered = self.matrix.values - self.matrix.values.mean()
        U, sigma, Vt = svds(matrix_centered, k=k)

        self.user_factors = U
        self.item_factors = Vt.T
        self.sigma = np.diag(sigma)
        self.mean_rating = self.matrix.values.mean()

        # Reconstruct the full rating matrix once, adding the mean back.
        predicted = np.dot(np.dot(U, self.sigma), Vt) + self.mean_rating
        self.predictions = pd.DataFrame(
            predicted,
            index=self.matrix.index,
            columns=self.matrix.columns
        )

    def predict(self, user_id):
        """Return predicted ratings for `user_id`, with rated items zeroed.

        Unknown users get an empty Series.
        """
        if user_id not in self.predictions.index:
            # Explicit dtype avoids pandas' deprecated object-dtype default.
            return pd.Series(dtype=float)

        # Copy the row so masking rated items never writes back into the
        # cached predictions DataFrame (chained-assignment hazard).
        user_predictions = self.predictions.loc[user_id].copy()
        user_ratings = self.matrix.loc[user_id]
        user_predictions[user_ratings > 0] = 0

        return user_predictions
140
+
141
+ # ============================================================================
142
+ # EVALUATION METRICS
143
+ # ============================================================================
144
+
145
def precision_at_k(recommended, relevant, k):
    """Fraction of the top-k recommended items that are relevant (0 when k <= 0)."""
    if k <= 0:
        return 0
    hits = set(recommended[:k]) & set(relevant)
    return len(hits) / k
150
+
151
def recall_at_k(recommended, relevant, k):
    """Fraction of the relevant items found in the top-k recommendations (0 when none are relevant)."""
    relevant_set = set(relevant)
    if not relevant_set:
        return 0
    hits = relevant_set & set(recommended[:k])
    return len(hits) / len(relevant_set)
156
+
157
def ndcg_at_k(recommended, relevant, k):
    """Binary-relevance NDCG@K: DCG of the ranking divided by the ideal DCG."""
    relevant_set = set(relevant)
    # Gain 1/log2(rank+2) for every relevant item in the top-k ranking.
    dcg = sum(1 / np.log2(rank + 2)
              for rank, item in enumerate(recommended[:k])
              if item in relevant_set)

    # Ideal DCG: all relevant items packed at the top of the list.
    ideal_hits = min(len(relevant), k)
    idcg = sum(1 / np.log2(rank + 2) for rank in range(ideal_hits))
    return dcg / idcg if idcg > 0 else 0
166
+
167
def evaluate_model(model, test_data, user_item_matrix, k=10, threshold=4.0, max_users=100):
    """Evaluate a recommender on held-out ratings.

    Parameters:
    - model: object exposing .predict(user_id) -> Series of predicted ratings
    - test_data: DataFrame with userId/movieId/rating columns
    - user_item_matrix: training matrix; users absent from it are skipped
    - k: ranking cutoff for the metrics
    - threshold: minimum rating for an item to count as relevant
    - max_users: cap on number of evaluated users (sampling keeps this fast)

    Returns a dict of mean Precision@K, Recall@K and NDCG@K
    (all 0.0 when no user could be evaluated).
    """
    precisions, recalls, ndcgs = [], [], []

    # Sample a bounded number of users for speed.
    test_users = test_data['userId'].unique()[:max_users]

    for user_id in test_users:
        # Cold-start users (unseen during training) cannot be scored.
        if user_id not in user_item_matrix.index:
            continue

        user_test = test_data[test_data['userId'] == user_id]
        relevant_items = user_test[user_test['rating'] >= threshold]['movieId'].tolist()

        if len(relevant_items) == 0:
            continue

        predictions = model.predict(user_id)
        if len(predictions) == 0:
            continue

        recommended = predictions.sort_values(ascending=False).index[:k].tolist()

        precisions.append(precision_at_k(recommended, relevant_items, k))
        recalls.append(recall_at_k(recommended, relevant_items, k))
        ndcgs.append(ndcg_at_k(recommended, relevant_items, k))

    # np.mean([]) would be nan (with a RuntimeWarning); report 0.0 instead.
    return {
        'Precision@K': float(np.mean(precisions)) if precisions else 0.0,
        'Recall@K': float(np.mean(recalls)) if recalls else 0.0,
        'NDCG@K': float(np.mean(ndcgs)) if ndcgs else 0.0
    }
198
+
199
+ # ============================================================================
200
+ # RECOMMENDATION FUNCTION
201
+ # ============================================================================
202
+
203
def recommend_movies(user_id, N, model, movies_df):
    """Return the top-N movie recommendations for a user.

    Parameters:
    - user_id: target user ID
    - N: number of recommendations
    - model: trained recommendation model exposing .predict(user_id)
    - movies_df: movies dataframe with movieId and title columns

    Returns:
    - DataFrame with movieId, title and predicted_rating columns,
      empty when the model has no predictions for the user.
    """
    scores = model.predict(user_id)

    if len(scores) == 0:
        return pd.DataFrame(columns=['movieId', 'title', 'predicted_rating'])

    best = scores.sort_values(ascending=False).head(N)
    ranked = pd.DataFrame({
        'movieId': best.index,
        'predicted_rating': best.values
    })

    ranked = ranked.merge(movies_df[['movieId', 'title']], on='movieId')
    return ranked[['movieId', 'title', 'predicted_rating']]
229
 
230
+ # ============================================================================
231
+ # MAIN EXECUTION PIPELINE
232
+ # ============================================================================
233
+
234
def main():
    """End-to-end pipeline: load data, train the three recommenders,
    compare them on ranking metrics, and demo the best one.

    Returns (best_model, user_item_matrix, movies) so the caller can
    persist the winning model.
    """
    print("Loading data...")
    ratings, movies = load_movielens_data()

    # Train-test split
    train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

    print("Creating user-item matrix...")
    # Matrix is built from the training split only, so held-out ratings
    # stay unseen during fitting.
    user_item_matrix = create_user_item_matrix(train_data)

    # Train models
    print("\n1. Training User-Based CF...")
    user_cf = UserBasedCF(user_item_matrix)
    user_cf.fit()
    metrics_user_cf = evaluate_model(user_cf, test_data, user_item_matrix)
    print(f"User-Based CF Metrics: {metrics_user_cf}")

    print("\n2. Training Item-Based CF...")
    item_cf = ItemBasedCF(user_item_matrix)
    item_cf.fit()
    metrics_item_cf = evaluate_model(item_cf, test_data, user_item_matrix)
    print(f"Item-Based CF Metrics: {metrics_item_cf}")

    print("\n3. Training SVD...")
    svd = SVDRecommender(user_item_matrix, n_factors=50)
    svd.fit()
    metrics_svd = evaluate_model(svd, test_data, user_item_matrix)
    print(f"SVD Metrics: {metrics_svd}")

    # Compare models
    print("\n" + "="*60)
    print("MODEL COMPARISON")
    print("="*60)
    # One column per model, one row per metric (Precision@K / Recall@K / NDCG@K).
    comparison = pd.DataFrame({
        'User-Based CF': metrics_user_cf,
        'Item-Based CF': metrics_item_cf,
        'SVD': metrics_svd
    })
    print(comparison)

    # Select best model (based on NDCG)
    best_model_name = comparison.loc['NDCG@K'].idxmax()
    print(f"\nBest Model: {best_model_name}")

    if best_model_name == 'User-Based CF':
        best_model = user_cf
    elif best_model_name == 'Item-Based CF':
        best_model = item_cf
    else:
        best_model = svd

    # Example recommendation
    print("\n" + "="*60)
    print("EXAMPLE RECOMMENDATIONS")
    print("="*60)
    # Arbitrary demo user: the first one in the training matrix.
    sample_user = user_item_matrix.index[0]
    recommendations = recommend_movies(sample_user, 10, best_model, movies)
    print(f"\nTop 10 recommendations for User {sample_user}:")
    print(recommendations.to_string(index=False))

    return best_model, user_item_matrix, movies
295
+
296
# Script entry point: the results are kept at module level so the save
# step below can reuse them.
if __name__ == "__main__":
    best_model, user_item_matrix, movies = main()
298
+
299
+ # save_model.py
300
+ import pickle
301
+ import os
302
+
303
def save_recommendation_system(model, user_item_matrix, movies, output_dir='recommendation_model'):
    """Persist the trained model, the training matrix and the movie table.

    Writes model.pkl, user_item_matrix.pkl and movies.csv into output_dir,
    creating the directory if needed.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Pickle the two Python objects side by side.
    for filename, obj in (('model.pkl', model), ('user_item_matrix.pkl', user_item_matrix)):
        with open(f'{output_dir}/{filename}', 'wb') as handle:
            pickle.dump(obj, handle)

    movies.to_csv(f'{output_dir}/movies.csv', index=False)

    print(f"Model saved to {output_dir}/")
316
+
317
# Save after training. Guarded so merely importing this module does not
# crash: best_model/user_item_matrix/movies only exist when the file is
# run as a script (they are bound under the __main__ guard above).
if __name__ == "__main__":
    save_recommendation_system(best_model, user_item_matrix, movies)
319
 
320
  import gradio as gr
321
+ import pickle
322
+ import pandas as pd
323
 
324
# Load the trained artifacts for the Gradio app.
# NOTE(review): save_recommendation_system writes these files into
# 'recommendation_model/', but they are read here from the working
# directory — confirm the artifacts are moved/copied before deployment.
# pickle.load is only safe because these are our own trusted files.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

with open('user_item_matrix.pkl', 'rb') as f:
    user_item_matrix = pickle.load(f)

movies = pd.read_csv('movies.csv')
332
+
333
def recommend_movies(user_id, N):
    """Gradio handler: top-N movie table for a user, or an error string.

    Any failure (bad input, missing data) is reported as a string so the
    UI never sees a traceback.
    """
    try:
        uid = int(user_id)
        count = int(N)

        if uid not in user_item_matrix.index:
            return "User ID not found"

        ranked = model.predict(uid).sort_values(ascending=False).head(count)
        table = pd.DataFrame({
            'movieId': ranked.index,
            'predicted_rating': ranked.values
        })

        table = table.merge(movies[['movieId', 'title']], on='movieId')
        return table[['title', 'predicted_rating']]

    except Exception as e:
        return f"Error: {str(e)}"
355
 
356
# Build the Gradio UI around the recommender handler: two numeric inputs,
# one DataFrame output.
interface = gr.Interface(
    fn=recommend_movies,
    inputs=[
        gr.Number(label="User ID"),
        gr.Number(label="Number of Recommendations", value=10)
    ],
    outputs=gr.Dataframe(label="Recommended Movies"),
    title="MovieLens Recommendation System",
    description="Enter User ID and number of recommendations"
)

# Start the web server (blocks until the app is stopped).
interface.launch()