LvMAC committed on
Commit
6a8179a
·
verified ·
1 Parent(s): 863f720

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +418 -667
app.py CHANGED
@@ -1,725 +1,476 @@
1
- # ============================================================================
2
- # MOVIELENS RECOMMENDATION SYSTEM - PURE IMPLEMENTATION
3
- # ============================================================================
4
-
5
- import numpy as np
6
  import pandas as pd
7
- from scipy.sparse.linalg import svds
8
- from sklearn.metrics.pairwise import cosine_similarity
9
- from sklearn.model_selection import train_test_split
 
 
10
  import pickle
11
  import os
12
- import warnings
13
- warnings.filterwarnings('ignore')
14
-
15
- # ============================================================================
16
- # DATA LOADING & PREPROCESSING
17
- # ============================================================================
18
-
19
def load_movielens_data(ratings_path='ratings.csv', movies_path='movies.csv'):
    """Read the MovieLens ratings and movies CSV files.

    Prints a short summary of the loaded data before returning.

    Args:
        ratings_path: CSV with columns userId, movieId, rating.
        movies_path: CSV with columns movieId, title (optionally genres).

    Returns:
        Tuple of (ratings DataFrame, movies DataFrame).
    """
    ratings_df = pd.read_csv(ratings_path)
    movies_df = pd.read_csv(movies_path)

    rating_col = ratings_df['rating']
    print(f"Loaded {len(ratings_df)} ratings")
    print(f"Loaded {len(movies_df)} movies")
    print(f"Users: {ratings_df['userId'].nunique()}")
    print(f"Rating distribution:\n{rating_col.value_counts().sort_index()}")
    print(f"Mean rating: {rating_col.mean():.3f}")
    print(f"Median rating: {rating_col.median():.3f}")

    return ratings_df, movies_df
32
-
33
def create_user_item_matrix(ratings):
    """Pivot long-format ratings into a dense users x movies matrix.

    Missing ratings become 0, which downstream code interprets as
    "not rated". Prints the matrix shape and its sparsity percentage.

    Args:
        ratings: DataFrame with userId, movieId and rating columns.

    Returns:
        DataFrame indexed by userId with movieId columns.
    """
    matrix = ratings.pivot_table(
        index='userId',
        columns='movieId',
        values='rating'
    ).fillna(0)

    n_rows, n_cols = matrix.shape
    filled = (matrix > 0).sum().sum()
    sparsity = 100 * (1 - filled / (n_rows * n_cols))
    print(f"Matrix shape: {matrix.shape}")
    print(f"Sparsity: {sparsity:.2f}%")

    return matrix
46
 
47
- # ============================================================================
48
- # USER-BASED COLLABORATIVE FILTERING
49
- # ============================================================================
50
-
51
class UserBasedCF:
    """User-based collaborative filtering on a dense user-item matrix.

    Users are compared by cosine similarity of their rating rows;
    predictions are similarity-weighted averages of neighbours' ratings.
    """

    def __init__(self, user_item_matrix):
        self.matrix = user_item_matrix
        self.user_similarity = None  # filled in by fit()

    def fit(self):
        """Precompute the user-user cosine similarity matrix."""
        print("Computing user similarity matrix...")
        sims = cosine_similarity(self.matrix)
        np.fill_diagonal(sims, 0)  # a user must not be their own neighbour
        self.user_similarity = sims
        print("User similarity matrix computed")

    def predict(self, user_id, k=50):
        """Score every movie for user_id from their k nearest users.

        Returns a Series indexed by movieId; movies the user already
        rated are forced to 0. Unknown users get an empty Series.
        """
        if user_id not in self.matrix.index:
            return pd.Series(dtype=float)

        row = self.matrix.index.get_loc(user_id)
        sims = self.user_similarity[row]

        # k most similar users, keeping only positive similarities.
        neighbours = np.argsort(sims)[::-1][:k]
        weights = sims[neighbours]
        keep = weights > 0
        neighbours, weights = neighbours[keep], weights[keep]

        if len(neighbours) == 0:
            return pd.Series(0, index=self.matrix.columns, dtype=float)

        # Weighted average of the neighbours' ratings per movie.
        neighbour_ratings = self.matrix.iloc[neighbours]
        scores = neighbour_ratings.T.dot(weights) / (np.sum(weights) + 1e-10)

        # Never re-recommend what the user already rated.
        scores[self.matrix.loc[user_id] > 0] = 0
        return scores
100
-
101
- # ============================================================================
102
- # ITEM-BASED COLLABORATIVE FILTERING
103
- # ============================================================================
104
-
105
class ItemBasedCF:
    """Item-based collaborative filtering on a dense user-item matrix.

    Scores unseen movies by accumulating similarity-weighted
    contributions from the movies the user has already rated.
    """

    def __init__(self, user_item_matrix):
        self.matrix = user_item_matrix
        self.item_similarity = None  # filled in by fit()

    def fit(self):
        """Precompute the item-item cosine similarity matrix."""
        print("Computing item similarity matrix...")
        sims = cosine_similarity(self.matrix.T)
        np.fill_diagonal(sims, 0)  # an item must not be its own neighbour
        self.item_similarity = sims
        print("Item similarity matrix computed")

    def predict(self, user_id, k=50):
        """Score every movie for user_id from their rated items.

        Returns a Series indexed by movieId (0 for already-rated
        movies); unknown users get an empty Series.
        """
        if user_id not in self.matrix.index:
            return pd.Series(dtype=float)

        user_ratings = self.matrix.loc[user_id]
        rated = user_ratings[user_ratings > 0]

        if len(rated) == 0:
            return pd.Series(0, index=self.matrix.columns, dtype=float)

        scores = pd.Series(0.0, index=self.matrix.columns)

        for movie_id, rating in rated.items():
            col = self.matrix.columns.get_loc(movie_id)
            sims = self.item_similarity[col]

            # Spread this rating onto the k most similar, still-unseen movies.
            for neighbour in np.argsort(sims)[::-1][:k]:
                neighbour_id = self.matrix.columns[neighbour]
                sim = sims[neighbour]
                if sim > 0 and user_ratings[neighbour_id] == 0:
                    scores[neighbour_id] += sim * rating

        # Never re-recommend what the user already rated.
        scores[user_ratings > 0] = 0
        return scores
150
-
151
- # ============================================================================
152
- # SINGULAR VALUE DECOMPOSITION (SVD)
153
- # ============================================================================
154
-
155
class SVDRecommender:
    """Rating prediction via truncated SVD of the mean-centred matrix."""

    def __init__(self, user_item_matrix, n_factors=50):
        self.matrix = user_item_matrix
        self.n_factors = n_factors  # number of latent factors kept
        self.predictions = None     # filled in by fit()

    def fit(self):
        """Factorise the matrix and cache the reconstructed ratings."""
        print(f"Performing SVD with {self.n_factors} factors...")

        values = self.matrix.values
        # Centre only the observed (non-zero) ratings around the global mean.
        global_mean = np.mean(values[np.where(values != 0)])
        centred = values.copy()
        centred[centred != 0] -= global_mean

        # Truncated SVD keeps the n_factors largest singular values.
        U, sigma, Vt = svds(centred, k=self.n_factors)
        reconstructed = np.dot(U, np.diag(sigma)).dot(Vt) + global_mean

        self.predictions = pd.DataFrame(
            reconstructed,
            index=self.matrix.index,
            columns=self.matrix.columns
        )

        print("SVD decomposition complete")

    def predict(self, user_id):
        """Return predicted ratings for user_id (0 for rated movies)."""
        if user_id not in self.predictions.index:
            return pd.Series(dtype=float)

        scores = self.predictions.loc[user_id].copy()
        # Never re-recommend what the user already rated.
        scores[self.matrix.loc[user_id] > 0] = 0
        return scores
199
-
200
- # ============================================================================
201
- # EVALUATION METRICS
202
- # ============================================================================
203
-
204
def precision_at_k(recommended, relevant, k):
    """Precision@K: share of the top-k recommendations that are relevant.

    Returns 0.0 when k is 0.
    """
    if k == 0:
        return 0.0
    hits = set(recommended[:k]) & set(relevant)
    return len(hits) / k
213
-
214
def recall_at_k(recommended, relevant, k):
    """Recall@K: share of the relevant items found in the top-k.

    Returns 0.0 when there are no relevant items.
    """
    relevant_set = set(relevant)
    if not relevant_set:
        return 0.0
    hits = set(recommended[:k]) & relevant_set
    return len(hits) / len(relevant_set)
223
-
224
def ndcg_at_k(recommended, relevant, k):
    """NDCG@K with binary relevance gains.

    DCG awards gain 1 at rank i discounted by log2(i + 2); the ideal
    DCG assumes all relevant items occupy the first ranks. Returns 0.0
    when the ideal DCG is zero (no relevant items).
    """
    dcg = sum(1.0 / np.log2(pos + 2)
              for pos, item in enumerate(recommended[:k]) if item in relevant)
    ideal_hits = min(len(relevant), k)
    idcg = sum(1.0 / np.log2(pos + 2) for pos in range(ideal_hits))
    return dcg / idcg if idcg != 0 else 0.0
237
-
238
def evaluate_model(model, test_data, user_item_matrix, k=10, threshold=4.0, max_users=100):
    """Evaluate a recommender with ranking metrics on held-out ratings.

    For each test user, items rated at or above ``threshold`` form the
    relevant set and the model's top-k predictions form the
    recommendations.

    Args:
        model: object with predict(user_id) -> Series of scores by movieId.
        test_data: DataFrame with userId, movieId and rating columns.
        user_item_matrix: training matrix; users missing from its index
            are skipped.
        k: cut-off for Precision/Recall/NDCG.
        threshold: minimum held-out rating for an item to count as relevant.
        max_users: stop after this many evaluated users (None = no limit).
            Previously hard-coded to 100; kept as the default so existing
            behaviour is unchanged while large evaluations stay cheap.

    Returns:
        dict with 'Precision@K', 'Recall@K' and 'NDCG@K' means (all 0.0
        when no user could be evaluated).
    """
    precisions = []
    recalls = []
    ndcgs = []

    test_users = test_data['userId'].unique()

    print(f"Evaluating on {len(test_users)} test users...")

    evaluated_count = 0
    for user_id in test_users:
        if user_id not in user_item_matrix.index:
            continue

        # Relevant = items this user rated at or above the threshold.
        user_test_data = test_data[test_data['userId'] == user_id]
        relevant_items = user_test_data[user_test_data['rating'] >= threshold]['movieId'].tolist()

        if len(relevant_items) == 0:
            continue

        predictions = model.predict(user_id)

        # Skip users the model produced nothing (or only zeros) for.
        if len(predictions) == 0 or predictions.sum() == 0:
            continue

        top_k_items = predictions.nlargest(k).index.tolist()

        precisions.append(precision_at_k(top_k_items, relevant_items, k))
        recalls.append(recall_at_k(top_k_items, relevant_items, k))
        ndcgs.append(ndcg_at_k(top_k_items, relevant_items, k))

        evaluated_count += 1

        if max_users is not None and evaluated_count >= max_users:
            break  # cap for computational efficiency

    print(f"Evaluated {evaluated_count} users")

    if len(precisions) == 0:
        return {
            'Precision@K': 0.0,
            'Recall@K': 0.0,
            'NDCG@K': 0.0
        }

    return {
        'Precision@K': np.mean(precisions),
        'Recall@K': np.mean(recalls),
        'NDCG@K': np.mean(ndcgs)
    }
293
-
294
- # ============================================================================
295
- # RECOMMENDATION FUNCTION
296
- # ============================================================================
297
-
298
def recommend_movies(user_id, N, model, movies_df):
    """Return the top-N movie recommendations for one user.

    Args:
        user_id: user to recommend for.
        N: number of movies to return.
        model: object with predict(user_id) -> Series of scores by movieId.
        movies_df: DataFrame with at least movieId and title columns.

    Returns:
        DataFrame with movieId, title and predicted_rating columns
        (empty when the model has no predictions for this user).
    """
    scores = model.predict(user_id)

    if len(scores) == 0:
        return pd.DataFrame(columns=['movieId', 'title', 'predicted_rating'])

    best = scores.nlargest(N)
    ranked = pd.DataFrame({
        'movieId': best.index,
        'predicted_rating': best.values
    })

    # Attach human-readable titles; left join keeps unknown ids as NaN.
    ranked = ranked.merge(
        movies_df[['movieId', 'title']],
        on='movieId',
        how='left'
    )

    return ranked[['movieId', 'title', 'predicted_rating']]
332
-
333
- # ============================================================================
334
- # MAIN EXECUTION
335
- # ============================================================================
336
-
337
def _print_metric_block(title, metric_values):
    """Print a '<title> Results:' header and one line per metric."""
    print(f"{title} Results:")
    for metric, value in metric_values.items():
        print(f" {metric}: {value:.4f}")

def main():
    """Run the full pipeline: load, split, train, evaluate, demo, save.

    Returns:
        (best_model, user_item_matrix, movies) for interactive use.
    """
    print("="*70)
    print("MOVIELENS RECOMMENDATION SYSTEM")
    print("="*70)

    print("\n[1/6] Loading data...")
    ratings, movies = load_movielens_data()

    print("\n[2/6] Splitting data (80% train, 20% test)...")
    train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)
    print(f"Training set: {len(train_data)} ratings")
    print(f"Test set: {len(test_data)} ratings")

    print("\n[3/6] Creating user-item matrix...")
    user_item_matrix = create_user_item_matrix(train_data)

    print("\n[4/6] Training User-Based Collaborative Filtering...")
    user_cf = UserBasedCF(user_item_matrix)
    user_cf.fit()
    print("Evaluating User-Based CF...")
    metrics_user_cf = evaluate_model(user_cf, test_data, user_item_matrix)
    _print_metric_block("User-Based CF", metrics_user_cf)

    print("\n[5/6] Training Item-Based Collaborative Filtering...")
    item_cf = ItemBasedCF(user_item_matrix)
    item_cf.fit()
    print("Evaluating Item-Based CF...")
    metrics_item_cf = evaluate_model(item_cf, test_data, user_item_matrix)
    _print_metric_block("Item-Based CF", metrics_item_cf)

    print("\n[6/6] Training SVD (Matrix Factorization)...")
    svd = SVDRecommender(user_item_matrix, n_factors=50)
    svd.fit()
    print("Evaluating SVD...")
    metrics_svd = evaluate_model(svd, test_data, user_item_matrix)
    _print_metric_block("SVD", metrics_svd)

    print("\n" + "="*70)
    print("MODEL COMPARISON")
    print("="*70)

    comparison_df = pd.DataFrame({
        'User-Based CF': metrics_user_cf,
        'Item-Based CF': metrics_item_cf,
        'SVD': metrics_svd
    })
    print(comparison_df.to_string())

    # Winner is whichever column maximises the NDCG@K row.
    best_model_name = comparison_df.loc['NDCG@K'].idxmax()
    print(f"\n*** Best Model (by NDCG@K): {best_model_name} ***")

    trained = {
        'User-Based CF': user_cf,
        'Item-Based CF': item_cf,
        'SVD': svd
    }
    best_model = trained[best_model_name]

    print("\n" + "="*70)
    print("EXAMPLE RECOMMENDATIONS")
    print("="*70)

    sample_user_id = user_item_matrix.index[0]
    print(f"\nTop 10 recommendations for User {sample_user_id} using {best_model_name}:")

    recommendations = recommend_movies(sample_user_id, 10, best_model, movies)
    print(recommendations.to_string(index=False))

    print("\n" + "="*70)
    print("SAVING MODELS FOR DEPLOYMENT")
    print("="*70)

    save_models_for_deployment(
        user_cf, item_cf, svd,
        user_item_matrix, movies,
        metrics_user_cf, metrics_item_cf, metrics_svd
    )

    return best_model, user_item_matrix, movies
432
-
433
def save_models_for_deployment(user_cf, item_cf, svd, user_item_matrix, movies,
                               metrics_user_cf, metrics_item_cf, metrics_svd,
                               output_dir='deployment_files'):
    """Pickle the trained models, matrix and metrics for deployment.

    Args:
        user_cf, item_cf, svd: trained recommender objects (picklable).
        user_item_matrix: training user-item matrix.
        movies: movies DataFrame (written as CSV, not pickled).
        metrics_user_cf, metrics_item_cf, metrics_svd: metric dicts.
        output_dir: target directory, created if missing. Defaults to
            the historical 'deployment_files' location so existing
            callers are unaffected.
    """
    os.makedirs(output_dir, exist_ok=True)

    print(f"Saving models to {output_dir}/...")

    with open(f'{output_dir}/user_cf_model.pkl', 'wb') as f:
        pickle.dump(user_cf, f)
    print(" ✓ User-Based CF model saved")

    with open(f'{output_dir}/item_cf_model.pkl', 'wb') as f:
        pickle.dump(item_cf, f)
    print(" ✓ Item-Based CF model saved")

    with open(f'{output_dir}/svd_model.pkl', 'wb') as f:
        pickle.dump(svd, f)
    print(" ✓ SVD model saved")

    with open(f'{output_dir}/user_item_matrix.pkl', 'wb') as f:
        pickle.dump(user_item_matrix, f)
    print(" ✓ User-item matrix saved")

    # Bundle all metric dicts under the UI-facing model names.
    metrics = {
        'User-Based CF': metrics_user_cf,
        'Item-Based CF': metrics_item_cf,
        'SVD': metrics_svd
    }

    with open(f'{output_dir}/metrics.pkl', 'wb') as f:
        pickle.dump(metrics, f)
    print(" ✓ Metrics saved")

    movies.to_csv(f'{output_dir}/movies.csv', index=False)
    print(" ✓ Movies data saved")

    print("\nAll files ready for Hugging Face deployment!")
472
-
473
# Run the full training pipeline when executed as a script; the gradio
# app below then serves the artifacts it just wrote.
if __name__ == "__main__":
    best_model, user_item_matrix, movies = main()

import gradio as gr
import pickle
import pandas as pd
import numpy as np
import os

# Prefer the packaged deployment artifacts; fall back to the working dir.
BASE_DIR = 'deployment_files' if os.path.exists('deployment_files') else '.'

print("Loading models...")

def _load_pickle(filename):
    """Unpickle a single artifact from BASE_DIR."""
    with open(f'{BASE_DIR}/{filename}', 'rb') as f:
        return pickle.load(f)

user_cf = _load_pickle('user_cf_model.pkl')
item_cf = _load_pickle('item_cf_model.pkl')
svd = _load_pickle('svd_model.pkl')
user_item_matrix = _load_pickle('user_item_matrix.pkl')

movies = pd.read_csv(f'{BASE_DIR}/movies.csv')

metrics = _load_pickle('metrics.pkl')

# UI-facing model registry keyed by display name.
MODELS = {
    'User-Based CF': user_cf,
    'Item-Based CF': item_cf,
    'SVD': svd
}

print("Models loaded successfully!")
511
-
512
def recommend_movies(user_id, N, model_name='SVD'):
    """Gradio handler: top-N recommendations plus a metrics summary.

    NOTE(review): this intentionally shadows the earlier standalone
    recommend_movies helper once the app section is loaded.

    Returns:
        (DataFrame, markdown str); failures come back as a one-column
        'Error' DataFrame instead of raising.
    """
    try:
        user_id = int(user_id)
        N = int(N)

        if user_id not in user_item_matrix.index:
            return pd.DataFrame({'Error': ['User ID not found in system']}), ""

        scores = MODELS[model_name].predict(user_id)

        if len(scores) == 0 or scores.sum() == 0:
            return pd.DataFrame({'Error': ['No predictions available for this user']}), ""

        best = scores.nlargest(N)
        table = pd.DataFrame({
            'movieId': best.index,
            'predicted_rating': best.values
        })

        # Attach titles for display.
        table = table.merge(
            movies[['movieId', 'title']],
            on='movieId',
            how='left'
        )
        table = table[['movieId', 'title', 'predicted_rating']]

        model_metrics = metrics[model_name]
        metrics_text = f"""
### {model_name} Performance Metrics

- **Precision@10**: {model_metrics['Precision@K']:.4f}
- **Recall@10**: {model_metrics['Recall@K']:.4f}
- **NDCG@10**: {model_metrics['NDCG@K']:.4f}

*Metrics evaluated on test set with relevance threshold = 4.0*
"""

        return table, metrics_text

    except Exception as e:
        return pd.DataFrame({'Error': [f'Error: {str(e)}']}), ""
559
 
560
def show_model_comparison():
    """Build the markdown comparison report for the comparison tab."""
    # Winner by NDCG@10 across the loaded metrics dict.
    best_model = max(metrics, key=lambda name: metrics[name]['NDCG@K'])

    report = f"""
# Model Comparison Report

## Performance Metrics

| Model | Precision@10 | Recall@10 | NDCG@10 |
|-------|--------------|-----------|---------|
| User-Based CF | {metrics['User-Based CF']['Precision@K']:.4f} | {metrics['User-Based CF']['Recall@K']:.4f} | {metrics['User-Based CF']['NDCG@K']:.4f} |
| Item-Based CF | {metrics['Item-Based CF']['Precision@K']:.4f} | {metrics['Item-Based CF']['Recall@K']:.4f} | {metrics['Item-Based CF']['NDCG@K']:.4f} |
| SVD | {metrics['SVD']['Precision@K']:.4f} | {metrics['SVD']['Recall@K']:.4f} | {metrics['SVD']['NDCG@K']:.4f} |

## Best Model: {best_model}

### Why {best_model} Performs Best

**Matrix Factorization (SVD) Advantages:**
- Captures latent factors in user-movie interactions
- Handles sparse data through dimensionality reduction
- Generalizes better than similarity-based methods
- Computationally efficient for prediction

**Collaborative Filtering Trade-offs:**
- **User-Based**: Intuitive but computationally expensive, struggles with sparsity
- **Item-Based**: More stable than user-based, but limited to similar items
- **SVD**: Best balance of accuracy and efficiency

### Implementation Details

- **SVD**: 50 latent factors via Singular Value Decomposition
- **CF**: Cosine similarity with k=50 neighbors
- **Evaluation**: 80/20 train-test split, threshold=4.0 for relevance
- **Metrics**: Precision, Recall, and NDCG at K=10

### Conclusion

SVD achieves the best performance by learning compressed representations of user preferences
and movie characteristics, making it the recommended approach for production deployment.
"""

    return report
607
 
608
def get_dataset_info():
    """Markdown summary of the dataset backing the demo UI."""
    user_ids = user_item_matrix.index
    num_users = len(user_ids)
    num_movies = len(movies)
    lo, hi = int(user_ids.min()), int(user_ids.max())

    return f"""
### Dataset Information

- **Total Users**: {num_users:,}
- **Total Movies**: {num_movies:,}
- **User ID Range**: {lo} to {hi}
- **Rating Scale**: 0.5 to 5.0 stars
- **Source**: MovieLens Dataset
"""
625
-
626
# Build Gradio Interface
# Three tabs: interactive recommendations, a static comparison report,
# and static documentation. The comparison/documentation markdown is
# rendered once at build time, not per request.
with gr.Blocks(title="MovieLens Recommendation System", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
# 🎬 MovieLens Recommendation System
## DataSynthis_ML_JobTask

Compare three recommendation algorithms: User-Based CF, Item-Based CF, and SVD Matrix Factorization
""")

    with gr.Tab("🎯 Get Recommendations"):
        # Dataset stats are computed once from the loaded matrix/movies.
        gr.Markdown(get_dataset_info())

        with gr.Row():
            with gr.Column():
                user_id_input = gr.Number(
                    label="User ID",
                    value=1,
                    precision=0,
                    info="Enter a valid user ID from the dataset"
                )
                n_input = gr.Number(
                    label="Number of Recommendations (N)",
                    value=10,
                    precision=0,
                    info="How many movies to recommend (1-20)"
                )
                model_select = gr.Dropdown(
                    choices=['User-Based CF', 'Item-Based CF', 'SVD'],
                    value='SVD',
                    label="Recommendation Algorithm",
                    info="Select which model to use"
                )

        recommend_btn = gr.Button("🎬 Get Recommendations", variant="primary", size="lg")

        recommendations_output = gr.Dataframe(
            label="📋 Recommended Movies",
            wrap=True
        )

        metrics_output = gr.Markdown(label="📊 Model Performance")

        # Wire the button to the gradio recommend_movies handler.
        recommend_btn.click(
            fn=recommend_movies,
            inputs=[user_id_input, n_input, model_select],
            outputs=[recommendations_output, metrics_output]
        )

    with gr.Tab("📊 Model Comparison"):
        gr.Markdown(show_model_comparison())

    with gr.Tab("ℹ️ Documentation"):
        gr.Markdown("""
## Implementation Overview

### Algorithms

**1. User-Based Collaborative Filtering**
- Finds users with similar rating patterns
- Recommends items liked by similar users
- Uses cosine similarity with k=50 neighbors

**2. Item-Based Collaborative Filtering**
- Finds items similar to those the user has rated
- Recommends items similar to user's preferences
- Uses cosine similarity with k=50 neighbors

**3. Singular Value Decomposition (SVD)**
- Matrix factorization with 50 latent factors
- Learns low-dimensional representations of users and items
- Predicts ratings via reconstructed matrix

### Evaluation Metrics

- **Precision@K**: Fraction of recommended items that are relevant
- **Recall@K**: Fraction of relevant items that are recommended
- **NDCG@K**: Normalized Discounted Cumulative Gain (considers ranking order)

### Technical Stack

- Python 3.10+
- NumPy, Pandas for data processing
- SciPy for SVD computation
- Scikit-learn for similarity metrics
- Gradio for web interface

### Dataset

- Source: MovieLens
- Split: 80% training, 20% testing
- Relevance Threshold: 4.0 stars

---

**Project**: DataSynthis ML Job Task
**Task**: Movie Recommendation System
""")

demo.launch()
 
 
 
 
 
 
1
  import pandas as pd
2
+ import numpy as np
3
+ from surprise import SVD, SVDpp, NMF, KNNBasic, Dataset, Reader
4
+ from surprise.model_selection import train_test_split, GridSearchCV
5
+ from collections import defaultdict
6
+ import gradio as gr
7
  import pickle
8
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
class MovieRecommenderEnsemble:
    """Trains and serves several Surprise recommenders over MovieLens data."""

    def __init__(self, ratings_path, movies_path):
        """Load the CSVs, build the Surprise dataset and train every model.

        Args:
            ratings_path: CSV with userId, movieId, rating columns.
            movies_path: CSV with movieId, title (and genres) columns.
        """
        print("Loading data...")
        self.ratings = pd.read_csv(ratings_path)
        self.movies = pd.read_csv(movies_path)

        # Prepare Surprise dataset; the Reader needs the explicit rating
        # scale to interpret the dataframe values.
        reader = Reader(rating_scale=(0.5, 5.0))
        self.data = Dataset.load_from_df(
            self.ratings[['userId', 'movieId', 'rating']],
            reader
        )

        # Train-test split for evaluation (20% held out).
        # NOTE(review): no random_state is passed, so the split — and all
        # downstream metrics — will differ between runs; confirm whether
        # that is intended.
        self.trainset, self.testset = train_test_split(self.data, test_size=0.2)

        # Initialize and immediately train all models (slow for large data).
        self.models = {}
        self.train_all_models()
29
 
30
+ def train_all_models(self):
31
+ """Train all models with optimal hyperparameters for MovieLens 1M"""
 
 
 
 
 
 
 
 
 
 
32
 
33
+ print("\n" + "="*50)
34
+ print("Training User-Based Collaborative Filtering...")
35
+ print("="*50)
 
 
 
36
 
37
+ # User-Based CF - Optimal for 1M dataset
38
+ user_based_options = {
39
+ 'name': 'cosine',
40
+ 'user_based': True,
41
+ 'min_support': 5
42
+ }
43
+ self.models['user_based_cf'] = KNNBasic(
44
+ k=50,
45
+ sim_options=user_based_options
46
+ )
47
+ self.models['user_based_cf'].fit(self.trainset)
48
+ print("✓ User-Based CF trained")
49
+
50
+ print("\n" + "="*50)
51
+ print("Training Item-Based Collaborative Filtering...")
52
+ print("="*50)
53
+
54
+ # Item-Based CF - Optimal for 1M dataset
55
+ item_based_options = {
56
+ 'name': 'cosine',
57
+ 'user_based': False,
58
+ 'min_support': 5
59
+ }
60
+ self.models['item_based_cf'] = KNNBasic(
61
+ k=40,
62
+ sim_options=item_based_options
63
+ )
64
+ self.models['item_based_cf'].fit(self.trainset)
65
+ print("✓ Item-Based CF trained")
66
+
67
+ print("\n" + "="*50)
68
+ print("Training SVD (Matrix Factorization)...")
69
+ print("="*50)
70
+
71
+ # SVD - Tuned for 1M dataset
72
+ self.models['svd'] = SVD(
73
+ n_factors=150,
74
+ n_epochs=30,
75
+ lr_all=0.007,
76
+ reg_all=0.05,
77
+ random_state=42,
78
+ verbose=True
79
+ )
80
+ self.models['svd'].fit(self.trainset)
81
+ print("✓ SVD trained")
82
+
83
+ print("\n" + "="*50)
84
+ print("Training SVD++ (Enhanced Matrix Factorization)...")
85
+ print("="*50)
86
+
87
+ # SVD++ - Includes implicit feedback
88
+ self.models['svdpp'] = SVDpp(
89
+ n_factors=100,
90
+ n_epochs=20,
91
+ lr_all=0.007,
92
+ reg_all=0.05,
93
+ random_state=42,
94
+ verbose=True
95
+ )
96
+ self.models['svdpp'].fit(self.trainset)
97
+ print("✓ SVD++ trained")
98
+
99
+ print("\n" + "="*50)
100
+ print("Training NMF (Non-negative Matrix Factorization)...")
101
+ print("="*50)
102
+
103
+ # NMF - Alternative factorization
104
+ self.models['nmf'] = NMF(
105
+ n_factors=50,
106
+ n_epochs=50,
107
+ random_state=42,
108
+ verbose=True
109
+ )
110
+ self.models['nmf'].fit(self.trainset)
111
+ print("✓ NMF trained")
112
 
113
+ print("\n" + "="*50)
114
+ print("All models trained successfully!")
115
+ print("="*50)
116
 
117
+ def evaluate_models(self):
118
+ """Evaluate all models on test set"""
119
+ print("\n" + "="*50)
120
+ print("EVALUATING ALL MODELS")
121
+ print("="*50)
122
 
123
+ results = {}
124
 
125
+ for name, model in self.models.items():
126
+ print(f"\nEvaluating {name.upper()}...")
 
127
 
128
+ # Get predictions
129
+ predictions = model.test(self.testset)
130
 
131
+ # Calculate RMSE and MAE
132
+ rmse = self.calculate_rmse(predictions)
133
+ mae = self.calculate_mae(predictions)
134
+
135
+ # Calculate Precision@10, Recall@10, NDCG@10
136
+ precision, recall, ndcg = self.calculate_ranking_metrics(predictions, k=10)
137
+
138
+ results[name] = {
139
+ 'RMSE': rmse,
140
+ 'MAE': mae,
141
+ 'Precision@10': precision,
142
+ 'Recall@10': recall,
143
+ 'NDCG@10': ndcg
144
+ }
145
+
146
+ print(f" RMSE: {rmse:.4f}")
147
+ print(f" MAE: {mae:.4f}")
148
+ print(f" Precision@10: {precision:.4f}")
149
+ print(f" Recall@10: {recall:.4f}")
150
+ print(f" NDCG@10: {ndcg:.4f}")
151
+
152
+ # Determine best model
153
+ best_model = max(results.items(), key=lambda x: x[1]['Precision@10'])
154
+ print(f"\n{'='*50}")
155
+ print(f"BEST MODEL: {best_model[0].upper()}")
156
+ print(f"Precision@10: {best_model[1]['Precision@10']:.4f}")
157
+ print(f"{'='*50}\n")
158
+
159
+ return results, best_model[0]
160
+
161
+ def calculate_rmse(self, predictions):
162
+ """Calculate Root Mean Square Error"""
163
+ mse = np.mean([(pred.est - pred.r_ui)**2 for pred in predictions])
164
+ return np.sqrt(mse)
165
+
166
+ def calculate_mae(self, predictions):
167
+ """Calculate Mean Absolute Error"""
168
+ return np.mean([abs(pred.est - pred.r_ui) for pred in predictions])
169
+
170
+ def calculate_ranking_metrics(self, predictions, k=10, threshold=4.0):
171
+ """Calculate Precision@K, Recall@K, and NDCG@K"""
172
+
173
+ # Organize predictions by user
174
+ user_est_true = defaultdict(list)
175
+ for uid, _, true_r, est, _ in predictions:
176
+ user_est_true[uid].append((est, true_r))
177
+
178
+ precisions = []
179
+ recalls = []
180
+ ndcgs = []
181
+
182
+ for uid, user_ratings in user_est_true.items():
183
+ # Sort by estimated rating
184
+ user_ratings.sort(key=lambda x: x[0], reverse=True)
185
+
186
+ # Top k predictions
187
+ top_k = user_ratings[:k]
188
+
189
+ # Calculate metrics
190
+ n_rel = sum(1 for (_, true_r) in user_ratings if true_r >= threshold)
191
+ n_rec_k = sum(1 for (est, _) in top_k if est >= threshold)
192
+ n_rel_and_rec_k = sum(1 for (est, true_r) in top_k
193
+ if true_r >= threshold and est >= threshold)
194
+
195
+ # Precision@K
196
+ precision = n_rel_and_rec_k / k if k > 0 else 0
197
+ precisions.append(precision)
198
+
199
+ # Recall@K
200
+ recall = n_rel_and_rec_k / n_rel if n_rel > 0 else 0
201
+ recalls.append(recall)
202
+
203
+ # NDCG@K
204
+ dcg = sum((2**true_r - 1) / np.log2(i + 2)
205
+ for i, (est, true_r) in enumerate(top_k) if true_r >= threshold)
206
+ ideal_ratings = sorted([true_r for _, true_r in user_ratings], reverse=True)[:k]
207
+ idcg = sum((2**true_r - 1) / np.log2(i + 2)
208
+ for i, true_r in enumerate(ideal_ratings) if true_r >= threshold)
209
+ ndcg = dcg / idcg if idcg > 0 else 0
210
+ ndcgs.append(ndcg)
211
+
212
+ return np.mean(precisions), np.mean(recalls), np.mean(ndcgs)
213
+
214
def recommend_movies(self, user_id, N, model_name='svd'):
    """
    Recommend top N movies for a user using the specified model.

    Args:
        user_id: User ID
        N: Number of recommendations
        model_name: 'user_based_cf', 'item_based_cf', 'svd', 'svdpp', 'nmf',
            or 'ensemble'

    Returns:
        List of dicts with rank/movieId/title/genres/predicted_rating,
        or an error message string when the model name is unknown.
    """
    if model_name == 'ensemble':
        return self.recommend_ensemble(user_id, N)

    if model_name not in self.models:
        return f"Model '{model_name}' not found. Available: {list(self.models.keys())}"

    model = self.models[model_name]

    # Get all movies
    all_movies = self.movies['movieId'].unique()

    # Movies this user already rated — kept as a set because `in` on a numpy
    # array is a linear scan, making the filter below O(movies * ratings).
    rated_movies = set(self.ratings[self.ratings['userId'] == user_id]['movieId'].values)

    # Candidate pool: everything the user has not rated yet.
    unrated_movies = [m for m in all_movies if m not in rated_movies]

    # Predict a rating for every candidate and rank best-first.
    predictions = [(movie_id, model.predict(user_id, movie_id).est)
                   for movie_id in unrated_movies]
    predictions.sort(key=lambda x: x[1], reverse=True)

    # Get top N
    top_n = predictions[:N]

    # Attach title/genres metadata; movies missing from the catalog are skipped.
    results = []
    for i, (movie_id, score) in enumerate(top_n, 1):
        movie_info = self.movies[self.movies['movieId'] == movie_id]
        if len(movie_info) > 0:
            title = movie_info['title'].iloc[0]
            genres = movie_info['genres'].iloc[0] if 'genres' in movie_info else 'N/A'
            results.append({
                'rank': i,
                'movieId': int(movie_id),
                'title': title,
                'genres': genres,
                'predicted_rating': round(score, 2)
            })

    return results
+
270
def recommend_ensemble(self, user_id, N):
    """Recommend top N movies using a weighted average over all models.

    Args:
        user_id: User ID
        N: Number of recommendations

    Returns:
        List of dicts with rank/movieId/title/genres/predicted_rating.
    """
    # Candidate pool: every catalog movie the user has not rated.
    all_movies = self.movies['movieId'].unique()
    # Set membership is O(1); `in` against a numpy array is a linear scan.
    rated_movies = set(self.ratings[self.ratings['userId'] == user_id]['movieId'].values)
    unrated_movies = [m for m in all_movies if m not in rated_movies]

    # Model weights (based on typical performance)
    weights = {
        'user_based_cf': 0.20,
        'item_based_cf': 0.20,
        'svd': 0.25,
        'svdpp': 0.25,
        'nmf': 0.10
    }

    # Blend each model's predicted rating for every candidate movie.
    movie_scores = defaultdict(float)
    for movie_id in unrated_movies:
        weighted_sum = 0
        for model_name, model in self.models.items():
            weighted_sum += model.predict(user_id, movie_id).est * weights[model_name]
        movie_scores[movie_id] = weighted_sum

    # Sort and get top N
    sorted_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)[:N]

    # Attach metadata; movies missing from the catalog are skipped.
    results = []
    for i, (movie_id, score) in enumerate(sorted_movies, 1):
        movie_info = self.movies[self.movies['movieId'] == movie_id]
        if len(movie_info) > 0:
            title = movie_info['title'].iloc[0]
            genres = movie_info['genres'].iloc[0] if 'genres' in movie_info else 'N/A'
            results.append({
                'rank': i,
                'movieId': int(movie_id),
                'title': title,
                'genres': genres,
                'predicted_rating': round(score, 2)
            })

    return results
+
317
# Initialize recommender system
# NOTE: runs at import time — it reads ratings.csv / movies.csv from the
# working directory, so startup fails if those files are missing.
print("Initializing MovieLens Recommendation System...")
recommender = MovieRecommenderEnsemble('ratings.csv', 'movies.csv')

# Evaluate all models once at startup.  Based on how show_evaluation() uses
# these below: evaluation_results maps model name -> {metric: value} and
# best_model_name is the winning model's key.  Presumably evaluate_models()
# also fits the underlying models — TODO confirm against the class definition.
evaluation_results, best_model_name = recommender.evaluate_models()
def recommend_interface(user_id, n_recommendations, model_choice):
    """Gradio callback: return top-N recommendations for a user as plain text.

    Any failure is reported as an "Error: ..." string rather than raised,
    so the UI always receives something displayable.
    """
    # UI display label -> internal model key; unknown labels fall back to SVD.
    display_to_key = {
        'User-Based CF': 'user_based_cf',
        'Item-Based CF': 'item_based_cf',
        'SVD': 'svd',
        'SVD++': 'svdpp',
        'NMF': 'nmf',
        'Ensemble (All Models)': 'ensemble'
    }

    try:
        uid = int(user_id)
        count = int(n_recommendations)

        recs = recommender.recommend_movies(uid, count, display_to_key.get(model_choice, 'svd'))

        # recommend_movies signals "unknown model" by returning a message string.
        if isinstance(recs, str):
            return recs

        # Assemble the report via join instead of repeated concatenation.
        pieces = [f"Top {count} recommendations for User {uid} using {model_choice}:\n\n"]
        for rec in recs:
            pieces.append(f"{rec['rank']}. {rec['title']}\n")
            pieces.append(f" Genres: {rec['genres']}\n")
            pieces.append(f" Predicted Rating: {rec['predicted_rating']}/5.0\n\n")
        return "".join(pieces)

    except ValueError:
        return "Error: Please enter a valid user ID"
    except Exception as e:
        return f"Error: {str(e)}"
360
 
361
def show_evaluation():
    """Build the plain-text model-comparison report for the Evaluation tab."""
    # Collect the report line by line, then join once at the end.
    lines = ["MODEL EVALUATION RESULTS\n", "="*60 + "\n\n"]

    for model_name, metrics in evaluation_results.items():
        lines.append(f"{model_name.upper().replace('_', ' ')}\n")
        lines.append("-"*40 + "\n")
        for metric, value in metrics.items():
            lines.append(f" {metric}: {value:.4f}\n")
        lines.append("\n")

    lines.append("="*60 + "\n")
    lines.append(f"BEST MODEL: {best_model_name.upper().replace('_', ' ')}\n")
    lines.append("="*60)

    return "".join(lines)
378
 
379
# Create Gradio interface
# Three tabs: an interactive recommendation form, a static evaluation report
# (computed once at startup), and a descriptive About page.
with gr.Blocks(title="MovieLens Recommendation System") as demo:
    gr.Markdown("# 🎬 MovieLens Recommendation System")
    gr.Markdown("### Trained on MovieLens 1M Dataset (6,040 users, 3,706 movies)")

    with gr.Tab("Get Recommendations"):
        with gr.Row():
            with gr.Column():
                # Free-text user id; validated and int-converted inside
                # recommend_interface, which returns an error string if invalid.
                user_input = gr.Textbox(
                    label="User ID",
                    placeholder="Enter user ID (1-6040)",
                    value="1"
                )
                n_input = gr.Slider(
                    minimum=1,
                    maximum=20,
                    value=10,
                    step=1,
                    label="Number of Recommendations"
                )
                # Display labels; the callback maps them to internal model keys.
                model_input = gr.Dropdown(
                    choices=[
                        'User-Based CF',
                        'Item-Based CF',
                        'SVD',
                        'SVD++',
                        'NMF',
                        'Ensemble (All Models)'
                    ],
                    value='SVD',
                    label="Select Model"
                )
                recommend_btn = gr.Button("Get Recommendations", variant="primary")

            with gr.Column():
                output = gr.Textbox(
                    label="Recommendations",
                    lines=20,
                    max_lines=30
                )

        # Wire the button to the callback defined above.
        recommend_btn.click(
            fn=recommend_interface,
            inputs=[user_input, n_input, model_input],
            outputs=output
        )

    with gr.Tab("Model Evaluation"):
        gr.Markdown("## Performance Comparison of All Models")
        # Report text is computed once here (value=...), not refreshed live.
        eval_output = gr.Textbox(
            label="Evaluation Metrics",
            lines=25,
            value=show_evaluation()
        )

    with gr.Tab("About"):
        gr.Markdown("""
        ## About This System

        This recommendation system implements multiple collaborative filtering approaches:

        ### Models Implemented:

        1. **User-Based Collaborative Filtering**
        - Finds similar users based on rating patterns
        - k=50 neighbors, cosine similarity

        2. **Item-Based Collaborative Filtering**
        - Recommends items similar to those you liked
        - k=40 neighbors, cosine similarity

        3. **SVD (Singular Value Decomposition)**
        - Matrix factorization with 150 latent factors
        - 30 epochs, optimized for MovieLens 1M

        4. **SVD++ (Enhanced SVD)**
        - Includes implicit feedback signals
        - 100 factors, 20 epochs

        5. **NMF (Non-negative Matrix Factorization)**
        - Alternative factorization method
        - 50 factors, 50 epochs

        6. **Ensemble**
        - Weighted combination of all models
        - Leverages strengths of each approach

        ### Evaluation Metrics:
        - **RMSE/MAE**: Prediction accuracy
        - **Precision@10**: Relevance of top 10 recommendations
        - **Recall@10**: Coverage of relevant items
        - **NDCG@10**: Ranking quality

        ### Dataset:
        MovieLens 1M - 1 million ratings from 6,040 users on 3,706 movies
        """)

# Launch the web app (blocking call).
demo.launch()