LvMAC committed on
Commit
6caf537
·
verified ·
1 Parent(s): da7700d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -476
app.py CHANGED
@@ -1,476 +1,108 @@
1
- import pandas as pd
2
- import numpy as np
3
- from surprise import SVD, SVDpp, NMF, KNNBasic, Dataset, Reader
4
- from surprise.model_selection import train_test_split, GridSearchCV
5
- from collections import defaultdict
6
- import gradio as gr
7
- import pickle
8
- import os
9
-
10
class MovieRecommenderEnsemble:
    """Train, evaluate and query several Surprise recommenders on one ratings set.

    Constructing an instance loads the ratings and movies CSV files, splits
    the ratings 80/20 into a train and a test set, and fits five models on
    the train set.  The fitted models are stored in ``self.models`` under the
    keys ``user_based_cf``, ``item_based_cf``, ``svd``, ``svdpp`` and ``nmf``.
    """

    # Per-model weights used by ``recommend_ensemble``.  Every key of
    # ``self.models`` must appear here, otherwise the ensemble raises KeyError.
    ENSEMBLE_WEIGHTS = {
        'user_based_cf': 0.20,
        'item_based_cf': 0.20,
        'svd': 0.25,
        'svdpp': 0.25,
        'nmf': 0.10
    }

    def __init__(self, ratings_path, movies_path):
        """Load both CSV files, build the Surprise dataset and train all models.

        Args:
            ratings_path: CSV with at least ``userId``, ``movieId`` and
                ``rating`` columns.
            movies_path: CSV with at least ``movieId`` and ``title`` columns
                (``genres`` is used when present).
        """
        print("Loading data...")
        self.ratings = pd.read_csv(ratings_path)
        self.movies = pd.read_csv(movies_path)

        # Prepare Surprise dataset
        reader = Reader(rating_scale=(0.5, 5.0))
        self.data = Dataset.load_from_df(
            self.ratings[['userId', 'movieId', 'rating']],
            reader
        )

        # Train-test split for evaluation
        self.trainset, self.testset = train_test_split(self.data, test_size=0.2)

        # Initialize models
        self.models = {}
        self.train_all_models()

    @staticmethod
    def _print_banner(text):
        """Print ``text`` framed by two 50-character ``=`` rules."""
        print("\n" + "=" * 50)
        print(text)
        print("=" * 50)

    def train_all_models(self):
        """Fit every model on ``self.trainset`` and register it in ``self.models``.

        The hyperparameters are the ones the original author selected for the
        MovieLens 1M dataset.
        """
        # (key, banner heading, completion message, lazy constructor).
        # The constructors are lambdas so each model is only built right
        # before it is fitted, as in the original sequential code.
        model_specs = (
            (
                'user_based_cf',
                "Training User-Based Collaborative Filtering...",
                "✓ User-Based CF trained",
                lambda: KNNBasic(
                    k=50,
                    sim_options={'name': 'cosine', 'user_based': True, 'min_support': 5}
                ),
            ),
            (
                'item_based_cf',
                "Training Item-Based Collaborative Filtering...",
                "✓ Item-Based CF trained",
                lambda: KNNBasic(
                    k=40,
                    sim_options={'name': 'cosine', 'user_based': False, 'min_support': 5}
                ),
            ),
            (
                'svd',
                "Training SVD (Matrix Factorization)...",
                "✓ SVD trained",
                lambda: SVD(
                    n_factors=150,
                    n_epochs=30,
                    lr_all=0.007,
                    reg_all=0.05,
                    random_state=42,
                    verbose=True
                ),
            ),
            (
                'svdpp',
                "Training SVD++ (Enhanced Matrix Factorization)...",
                "✓ SVD++ trained",
                lambda: SVDpp(
                    n_factors=100,
                    n_epochs=20,
                    lr_all=0.007,
                    reg_all=0.05,
                    random_state=42,
                    verbose=True
                ),
            ),
            (
                'nmf',
                "Training NMF (Non-negative Matrix Factorization)...",
                "✓ NMF trained",
                lambda: NMF(
                    n_factors=50,
                    n_epochs=50,
                    random_state=42,
                    verbose=True
                ),
            ),
        )

        for key, heading, done_message, build in model_specs:
            self._print_banner(heading)
            self.models[key] = build()
            self.models[key].fit(self.trainset)
            print(done_message)

        self._print_banner("All models trained successfully!")

    def evaluate_models(self):
        """Score every trained model on ``self.testset``.

        Returns:
            A ``(results, best_name)`` tuple.  ``results`` maps each model key
            to a dict with ``RMSE``, ``MAE``, ``Precision@10``, ``Recall@10``
            and ``NDCG@10``; ``best_name`` is the key with the highest
            ``Precision@10``.
        """
        self._print_banner("EVALUATING ALL MODELS")

        results = {}

        for name, model in self.models.items():
            print(f"\nEvaluating {name.upper()}...")

            predictions = model.test(self.testset)

            rmse = self.calculate_rmse(predictions)
            mae = self.calculate_mae(predictions)
            precision, recall, ndcg = self.calculate_ranking_metrics(predictions, k=10)

            results[name] = {
                'RMSE': rmse,
                'MAE': mae,
                'Precision@10': precision,
                'Recall@10': recall,
                'NDCG@10': ndcg
            }

            print(f" RMSE: {rmse:.4f}")
            print(f" MAE: {mae:.4f}")
            print(f" Precision@10: {precision:.4f}")
            print(f" Recall@10: {recall:.4f}")
            print(f" NDCG@10: {ndcg:.4f}")

        # The model with the highest Precision@10 is reported as the best one.
        best_model = max(results.items(), key=lambda x: x[1]['Precision@10'])
        print(f"\n{'='*50}")
        print(f"BEST MODEL: {best_model[0].upper()}")
        print(f"Precision@10: {best_model[1]['Precision@10']:.4f}")
        print(f"{'='*50}\n")

        return results, best_model[0]

    def calculate_rmse(self, predictions):
        """Return the root mean squared error between ``est`` and ``r_ui``."""
        mse = np.mean([(pred.est - pred.r_ui) ** 2 for pred in predictions])
        return np.sqrt(mse)

    def calculate_mae(self, predictions):
        """Return the mean absolute error between ``est`` and ``r_ui``."""
        return np.mean([abs(pred.est - pred.r_ui) for pred in predictions])

    def calculate_ranking_metrics(self, predictions, k=10, threshold=4.0):
        """Return mean Precision@K, Recall@K and NDCG@K over all users.

        Args:
            predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
            k: size of the per-user top list (ranked by estimated rating).
            threshold: minimum rating for an item to count as relevant (true
                rating) or as recommended (estimated rating).

        Returns:
            ``(precision, recall, ndcg)``, each averaged over the users that
            appear in ``predictions``.
        """
        # Organize predictions by user
        user_est_true = defaultdict(list)
        for uid, _, true_r, est, _ in predictions:
            user_est_true[uid].append((est, true_r))

        precisions = []
        recalls = []
        ndcgs = []

        for user_ratings in user_est_true.values():
            # Rank the user's test items by estimated rating, best first.
            user_ratings.sort(key=lambda x: x[0], reverse=True)
            top_k = user_ratings[:k]

            n_rel = sum(1 for (_, true_r) in user_ratings if true_r >= threshold)
            n_rel_and_rec_k = sum(1 for (est, true_r) in top_k
                                  if true_r >= threshold and est >= threshold)

            # Precision@K: the denominator is always k, even when the user
            # has fewer than k test items.
            precisions.append(n_rel_and_rec_k / k if k > 0 else 0)

            # Recall@K
            recalls.append(n_rel_and_rec_k / n_rel if n_rel > 0 else 0)

            # NDCG@K: only items whose true rating reaches the threshold
            # contribute gain, both in the actual and in the ideal ordering.
            dcg = sum((2 ** true_r - 1) / np.log2(i + 2)
                      for i, (_, true_r) in enumerate(top_k) if true_r >= threshold)
            ideal_ratings = sorted((true_r for _, true_r in user_ratings), reverse=True)[:k]
            idcg = sum((2 ** true_r - 1) / np.log2(i + 2)
                       for i, true_r in enumerate(ideal_ratings) if true_r >= threshold)
            ndcgs.append(dcg / idcg if idcg > 0 else 0)

        return np.mean(precisions), np.mean(recalls), np.mean(ndcgs)

    def _unrated_movie_ids(self, user_id):
        """Return the catalogue movie ids that ``user_id`` has not rated."""
        all_movies = self.movies['movieId'].unique()
        # A set makes each membership test O(1); testing against the raw
        # NumPy array rescans the user's ratings for every catalogue movie.
        rated_movies = set(
            self.ratings.loc[self.ratings['userId'] == user_id, 'movieId'].values
        )
        return [m for m in all_movies if m not in rated_movies]

    def _format_recommendations(self, scored_movies, N):
        """Turn ``(movie_id, score)`` pairs into the top-``N`` result dicts.

        The pairs are sorted by score (descending, stable).  Movies missing
        from ``self.movies`` are skipped but still consume their rank number.
        """
        top_n = sorted(scored_movies, key=lambda x: x[1], reverse=True)[:N]

        results = []
        for i, (movie_id, score) in enumerate(top_n, 1):
            movie_info = self.movies[self.movies['movieId'] == movie_id]
            if len(movie_info) > 0:
                title = movie_info['title'].iloc[0]
                genres = movie_info['genres'].iloc[0] if 'genres' in movie_info else 'N/A'
                results.append({
                    'rank': i,
                    'movieId': int(movie_id),
                    'title': title,
                    'genres': genres,
                    'predicted_rating': round(score, 2)
                })
        return results

    def recommend_movies(self, user_id, N, model_name='svd'):
        """Recommend the top ``N`` unrated movies for a user.

        Args:
            user_id: User ID
            N: Number of recommendations
            model_name: 'user_based_cf', 'item_based_cf', 'svd', 'svdpp',
                'nmf', or 'ensemble'

        Returns:
            A list of dicts with ``rank``, ``movieId``, ``title``, ``genres``
            and ``predicted_rating``; or an error message string when
            ``model_name`` is unknown.
        """
        if model_name == 'ensemble':
            return self.recommend_ensemble(user_id, N)

        if model_name not in self.models:
            return f"Model '{model_name}' not found. Available: {list(self.models.keys())}"

        model = self.models[model_name]
        scored = [
            (movie_id, model.predict(user_id, movie_id).est)
            for movie_id in self._unrated_movie_ids(user_id)
        ]
        return self._format_recommendations(scored, N)

    def recommend_ensemble(self, user_id, N):
        """Recommend the top ``N`` unrated movies by a weighted sum of all models.

        Each model's estimate is multiplied by its ``ENSEMBLE_WEIGHTS`` entry
        and the products are summed per movie.
        """
        weights = self.ENSEMBLE_WEIGHTS
        scored = []
        for movie_id in self._unrated_movie_ids(user_id):
            weighted_sum = 0
            for model_name, model in self.models.items():
                weighted_sum += model.predict(user_id, movie_id).est * weights[model_name]
            scored.append((movie_id, weighted_sum))
        return self._format_recommendations(scored, N)
316
-
317
- # Initialize recommender system
318
- print("Initializing MovieLens Recommendation System...")
319
- recommender = MovieRecommenderEnsemble('ratings.csv', 'movies.csv')
320
-
321
- # Evaluate all models
322
- evaluation_results, best_model_name = recommender.evaluate_models()
323
-
324
- # Create Gradio interface
325
def recommend_interface(user_id, n_recommendations, model_choice):
    """Gradio callback: return formatted recommendations for one user.

    Args:
        user_id: user id as entered in the textbox; must convert to ``int``.
        n_recommendations: requested list length; must convert to ``int``.
        model_choice: display name selected in the dropdown.  Unknown names
            fall back to the ``svd`` model.

    Returns:
        The formatted recommendation text, or a message starting with
        ``"Error:"`` when the input is invalid or recommending fails.
    """
    # Only the numeric conversion is guarded here, so that a ValueError raised
    # later inside the recommender is not misreported as a bad user id.
    try:
        user_id = int(user_id)
        n_recommendations = int(n_recommendations)
    except (TypeError, ValueError):
        return "Error: Please enter a valid user ID"

    # Map display names to internal names
    model_map = {
        'User-Based CF': 'user_based_cf',
        'Item-Based CF': 'item_based_cf',
        'SVD': 'svd',
        'SVD++': 'svdpp',
        'NMF': 'nmf',
        'Ensemble (All Models)': 'ensemble'
    }
    model_name = model_map.get(model_choice, 'svd')

    # UI boundary: any failure is turned into a message instead of a traceback.
    try:
        recommendations = recommender.recommend_movies(user_id, n_recommendations, model_name)

        # The recommender signals an unknown model with a plain string.
        if isinstance(recommendations, str):
            return recommendations

        parts = [f"Top {n_recommendations} recommendations for User {user_id} using {model_choice}:\n\n"]
        for rec in recommendations:
            parts.append(f"{rec['rank']}. {rec['title']}\n")
            parts.append(f" Genres: {rec['genres']}\n")
            parts.append(f" Predicted Rating: {rec['predicted_rating']}/5.0\n\n")
        return "".join(parts)
    except Exception as e:
        return f"Error: {str(e)}"
360
-
361
def show_evaluation():
    """Render the module-level evaluation results as a plain-text report.

    Reads ``evaluation_results`` (model key -> metric name -> value) and
    ``best_model_name`` and returns one string: a title, one section per
    model with its metrics to four decimals, and a footer naming the best
    model.
    """
    rule = "=" * 60
    chunks = ["MODEL EVALUATION RESULTS\n", rule + "\n\n"]

    for key, scores in evaluation_results.items():
        chunks.append(f"{key.upper().replace('_', ' ')}\n")
        chunks.append("-" * 40 + "\n")
        chunks.extend(f" {label}: {number:.4f}\n" for label, number in scores.items())
        chunks.append("\n")

    chunks.append(rule + "\n")
    chunks.append(f"BEST MODEL: {best_model_name.upper().replace('_', ' ')}\n")
    chunks.append(rule)

    return "".join(chunks)
378
-
379
- # Create Gradio interface
380
- with gr.Blocks(title="MovieLens Recommendation System") as demo:
381
- gr.Markdown("# 🎬 MovieLens Recommendation System")
382
- gr.Markdown("### Trained on MovieLens 1M Dataset (6,040 users, 3,706 movies)")
383
-
384
- with gr.Tab("Get Recommendations"):
385
- with gr.Row():
386
- with gr.Column():
387
- user_input = gr.Textbox(
388
- label="User ID",
389
- placeholder="Enter user ID (1-6040)",
390
- value="1"
391
- )
392
- n_input = gr.Slider(
393
- minimum=1,
394
- maximum=20,
395
- value=10,
396
- step=1,
397
- label="Number of Recommendations"
398
- )
399
- model_input = gr.Dropdown(
400
- choices=[
401
- 'User-Based CF',
402
- 'Item-Based CF',
403
- 'SVD',
404
- 'SVD++',
405
- 'NMF',
406
- 'Ensemble (All Models)'
407
- ],
408
- value='SVD',
409
- label="Select Model"
410
- )
411
- recommend_btn = gr.Button("Get Recommendations", variant="primary")
412
-
413
- with gr.Column():
414
- output = gr.Textbox(
415
- label="Recommendations",
416
- lines=20,
417
- max_lines=30
418
- )
419
-
420
- recommend_btn.click(
421
- fn=recommend_interface,
422
- inputs=[user_input, n_input, model_input],
423
- outputs=output
424
- )
425
-
426
- with gr.Tab("Model Evaluation"):
427
- gr.Markdown("## Performance Comparison of All Models")
428
- eval_output = gr.Textbox(
429
- label="Evaluation Metrics",
430
- lines=25,
431
- value=show_evaluation()
432
- )
433
-
434
- with gr.Tab("About"):
435
- gr.Markdown("""
436
- ## About This System
437
-
438
- This recommendation system implements multiple collaborative filtering approaches:
439
-
440
- ### Models Implemented:
441
-
442
- 1. **User-Based Collaborative Filtering**
443
- - Finds similar users based on rating patterns
444
- - k=50 neighbors, cosine similarity
445
-
446
- 2. **Item-Based Collaborative Filtering**
447
- - Recommends items similar to those you liked
448
- - k=40 neighbors, cosine similarity
449
-
450
- 3. **SVD (Singular Value Decomposition)**
451
- - Matrix factorization with 150 latent factors
452
- - 30 epochs, optimized for MovieLens 1M
453
-
454
- 4. **SVD++ (Enhanced SVD)**
455
- - Includes implicit feedback signals
456
- - 100 factors, 20 epochs
457
-
458
- 5. **NMF (Non-negative Matrix Factorization)**
459
- - Alternative factorization method
460
- - 50 factors, 50 epochs
461
-
462
- 6. **Ensemble**
463
- - Weighted combination of all models
464
- - Leverages strengths of each approach
465
-
466
- ### Evaluation Metrics:
467
- - **RMSE/MAE**: Prediction accuracy
468
- - **Precision@10**: Relevance of top 10 recommendations
469
- - **Recall@10**: Coverage of relevant items
470
- - **NDCG@10**: Ranking quality
471
-
472
- ### Dataset:
473
- MovieLens 1M - 1 million ratings from 6,040 users on 3,706 movies
474
- """)
475
-
476
- demo.launch()
 
1
+
2
+ import gradio as gr
3
+ import pandas as pd
4
+ import pickle
5
+ import numpy as np
6
+ from collections import defaultdict
7
+
8
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-trained models and the metadata bundle are read once at import time.
best_svd = _load_pickle('best_svd.pkl')
best_nmf = _load_pickle('best_nmf.pkl')
metadata = _load_pickle('model_metadata.pkl')

# Tables taken from the metadata bundle and used by the recommendation callback.
movies = metadata['movies_df']
ratings_filtered = metadata['ratings_filtered_df']
popular_movies = metadata['popular_movies']
18
+
19
def _score_movies(models, user_id, movie_ids):
    """Return one ``{'movieId', 'score'}`` record per movie id.

    With a single model the score is that model's estimate; with several
    models it is the mean of their estimates.
    """
    records = []
    for movie_id in movie_ids:
        estimates = [model.predict(user_id, movie_id).est for model in models]
        score = estimates[0] if len(estimates) == 1 else np.mean(estimates)
        records.append({'movieId': movie_id, 'score': score})
    return records


def recommend_movies_gradio(user_id, model_choice, n_recommendations):
    """Gradio callback: return a text table of movie recommendations.

    Args:
        user_id: user id as typed in the textbox; must convert to ``int``.
        model_choice: ``"Ensemble (SVD + NMF)"``, ``"SVD"``, or anything else
            (treated as NMF).
        n_recommendations: number of rows to return; must convert to ``int``.

    Returns:
        A string.  For a user absent from ``ratings_filtered`` it lists the
        first ``n_recommendations`` rows of ``popular_movies``; otherwise it
        lists the highest-scored movies the user has not rated.  Invalid
        numeric input yields an error message.
    """
    try:
        user_id = int(user_id)
        n_recommendations = int(n_recommendations)
    except (TypeError, ValueError):
        # Only conversion failures are expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return "Error: Please enter valid numbers for User ID and N"

    movie_columns = ['movieId', 'title_clean', 'year', 'genres']

    # Unknown user: fall back to the precomputed popularity ranking.
    if user_id not in ratings_filtered['userId'].values:
        popular_recs = popular_movies.head(n_recommendations).merge(
            movies[movie_columns],
            on='movieId'
        )
        result = popular_recs[['title_clean', 'year', 'genres', 'weighted_rating']].rename(
            columns={'title_clean': 'Title', 'year': 'Year', 'genres': 'Genres', 'weighted_rating': 'Score'}
        )
        return f"User {user_id} not found. Showing popular movies:\n\n" + result.to_string(index=False)

    # A set gives O(1) membership tests; the raw NumPy array would be
    # rescanned for every movie in the catalogue.
    seen_movies = set(ratings_filtered.loc[ratings_filtered['userId'] == user_id, 'movieId'])
    unseen_movies = [m for m in ratings_filtered['movieId'].unique() if m not in seen_movies]

    # Without candidates the predictions frame would have no 'score' column
    # and ``nlargest`` would raise KeyError.
    if not unseen_movies:
        return f"User {user_id} has already rated every available movie."

    if model_choice == "Ensemble (SVD + NMF)":
        models = [best_svd, best_nmf]
    elif model_choice == "SVD":
        models = [best_svd]
    else:
        models = [best_nmf]

    predictions_df = pd.DataFrame(_score_movies(models, user_id, unseen_movies))
    top_n = predictions_df.nlargest(n_recommendations, 'score')

    top_n = top_n.merge(movies[movie_columns], on='movieId')
    result = top_n[['title_clean', 'year', 'genres', 'score']].rename(
        columns={'title_clean': 'Title', 'year': 'Year', 'genres': 'Genres', 'score': 'Predicted Rating'}
    )

    return result.to_string(index=False)
78
+
79
# Gradio UI: a user-id textbox, a model dropdown and a result-count slider
# feed ``recommend_movies_gradio``; its text result is shown in one textbox.
iface = gr.Interface(
    fn=recommend_movies_gradio,
    inputs=[
        gr.Textbox(label="User ID", placeholder="Enter user ID (e.g., 1, 100, 500)"),
        gr.Dropdown(
            choices=["Ensemble (SVD + NMF)", "SVD", "NMF"],
            label="Model Selection",
            value="Ensemble (SVD + NMF)"
        ),
        gr.Slider(minimum=5, maximum=50, value=10, step=5, label="Number of Recommendations")
    ],
    outputs=gr.Textbox(label="Recommendations", lines=20),
    # The leading emoji had been corrupted by an encoding error ("<" followed
    # by a replacement character); restored to the clapper-board emoji.
    title="🎬 Movie Recommendation System - MovieLens",
    description=(
        "\n"
        "Get personalized movie recommendations based on user preferences.\n"
        "\n"
        "**Models:**\n"
        "- **Ensemble**: Combines SVD and NMF for robust predictions\n"
        "- **SVD**: Matrix factorization with latent factors\n"
        "- **NMF**: Non-negative matrix factorization\n"
    ),
    # Each example row follows the order of ``inputs``: user id, model, count.
    examples=[
        ["1", "Ensemble (SVD + NMF)", 10],
        ["100", "SVD", 15],
        ["500", "NMF", 20]
    ]
)
106
+
107
+ if __name__ == "__main__":
108
+ iface.launch()