Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,12 +4,12 @@ from scipy.sparse.linalg import svds
|
|
| 4 |
from scipy.sparse import csr_matrix
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
|
| 7 |
import warnings
|
| 8 |
warnings.filterwarnings('ignore')
|
| 9 |
|
| 10 |
-
# ============================================================================
|
| 11 |
# DATA LOADING & PREPROCESSING
|
| 12 |
-
# ============================================================================
|
| 13 |
|
| 14 |
def load_movielens_data(ratings_path='ratings.csv', movies_path='movies.csv'):
|
| 15 |
"""Load and prepare MovieLens data"""
|
|
@@ -26,9 +26,7 @@ def create_user_item_matrix(ratings):
|
|
| 26 |
).fillna(0)
|
| 27 |
return user_item_matrix
|
| 28 |
|
| 29 |
-
# ============================================================================
|
| 30 |
# COLLABORATIVE FILTERING - USER BASED
|
| 31 |
-
# ============================================================================
|
| 32 |
|
| 33 |
class UserBasedCF:
|
| 34 |
def __init__(self, user_item_matrix):
|
|
@@ -60,9 +58,7 @@ class UserBasedCF:
|
|
| 60 |
|
| 61 |
return predictions
|
| 62 |
|
| 63 |
-
# ============================================================================
|
| 64 |
# COLLABORATIVE FILTERING - ITEM BASED
|
| 65 |
-
# ============================================================================
|
| 66 |
|
| 67 |
class ItemBasedCF:
|
| 68 |
def __init__(self, user_item_matrix):
|
|
@@ -98,9 +94,7 @@ class ItemBasedCF:
|
|
| 98 |
predictions[user_ratings > 0] = 0
|
| 99 |
return predictions
|
| 100 |
|
| 101 |
-
# ============================================================================
|
| 102 |
# MATRIX FACTORIZATION - SVD
|
| 103 |
-
# ============================================================================
|
| 104 |
|
| 105 |
class SVDRecommender:
|
| 106 |
def __init__(self, user_item_matrix, n_factors=50):
|
|
@@ -138,9 +132,7 @@ class SVDRecommender:
|
|
| 138 |
|
| 139 |
return user_predictions
|
| 140 |
|
| 141 |
-
# ============================================================================
|
| 142 |
# EVALUATION METRICS
|
| 143 |
-
# ============================================================================
|
| 144 |
|
| 145 |
def precision_at_k(recommended, relevant, k):
|
| 146 |
"""Calculate Precision@K"""
|
|
@@ -168,7 +160,7 @@ def evaluate_model(model, test_data, user_item_matrix, k=10, threshold=4.0):
|
|
| 168 |
"""Evaluate model on test set"""
|
| 169 |
precisions, recalls, ndcgs = [], [], []
|
| 170 |
|
| 171 |
-
test_users = test_data['userId'].unique()[:100]
|
| 172 |
|
| 173 |
for user_id in test_users:
|
| 174 |
if user_id not in user_item_matrix.index:
|
|
@@ -196,9 +188,7 @@ def evaluate_model(model, test_data, user_item_matrix, k=10, threshold=4.0):
|
|
| 196 |
'NDCG@K': np.mean(ndcgs)
|
| 197 |
}
|
| 198 |
|
| 199 |
-
#
|
| 200 |
-
# RECOMMENDATION FUNCTION
|
| 201 |
-
# ============================================================================
|
| 202 |
|
| 203 |
def recommend_movies(user_id, N, model, movies_df):
|
| 204 |
"""
|
|
@@ -227,9 +217,7 @@ def recommend_movies(user_id, N, model, movies_df):
|
|
| 227 |
recommendations = recommendations.merge(movies_df[['movieId', 'title']], on='movieId')
|
| 228 |
return recommendations[['movieId', 'title', 'predicted_rating']]
|
| 229 |
|
| 230 |
-
# ============================================================================
|
| 231 |
# MAIN EXECUTION PIPELINE
|
| 232 |
-
# ============================================================================
|
| 233 |
|
| 234 |
def main():
|
| 235 |
print("Loading data...")
|
|
@@ -271,7 +259,7 @@ def main():
|
|
| 271 |
})
|
| 272 |
print(comparison)
|
| 273 |
|
| 274 |
-
# Select best model
|
| 275 |
best_model_name = comparison.loc['NDCG@K'].idxmax()
|
| 276 |
print(f"\nBest Model: {best_model_name}")
|
| 277 |
|
|
@@ -291,55 +279,96 @@ def main():
|
|
| 291 |
print(f"\nTop 10 recommendations for User {sample_user}:")
|
| 292 |
print(recommendations.to_string(index=False))
|
| 293 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
return best_model, user_item_matrix, movies
|
| 295 |
|
| 296 |
-
|
| 297 |
-
best_model, user_item_matrix, movies = main()
|
| 298 |
-
|
| 299 |
-
# save_model.py
|
| 300 |
-
import pickle
|
| 301 |
-
import os
|
| 302 |
|
| 303 |
-
def
|
| 304 |
-
|
|
|
|
|
|
|
|
|
|
| 305 |
os.makedirs(output_dir, exist_ok=True)
|
| 306 |
|
| 307 |
-
with open(f'{output_dir}/
|
| 308 |
-
pickle.dump(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
|
| 310 |
with open(f'{output_dir}/user_item_matrix.pkl', 'wb') as f:
|
| 311 |
pickle.dump(user_item_matrix, f)
|
| 312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
movies.to_csv(f'{output_dir}/movies.csv', index=False)
|
| 314 |
|
| 315 |
-
print(f"
|
|
|
|
| 316 |
|
| 317 |
-
|
| 318 |
-
|
| 319 |
|
| 320 |
import gradio as gr
|
| 321 |
import pickle
|
| 322 |
import pandas as pd
|
|
|
|
| 323 |
|
| 324 |
-
# Load
|
| 325 |
-
with open('
|
| 326 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
|
| 328 |
with open('user_item_matrix.pkl', 'rb') as f:
|
| 329 |
user_item_matrix = pickle.load(f)
|
| 330 |
|
| 331 |
movies = pd.read_csv('movies.csv')
|
| 332 |
|
| 333 |
-
|
| 334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
try:
|
| 336 |
user_id = int(user_id)
|
| 337 |
N = int(N)
|
| 338 |
|
|
|
|
|
|
|
| 339 |
if user_id not in user_item_matrix.index:
|
| 340 |
-
return "User ID not found"
|
| 341 |
|
| 342 |
predictions = model.predict(user_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
top_n = predictions.sort_values(ascending=False).head(N)
|
| 344 |
|
| 345 |
recommendations = pd.DataFrame({
|
|
@@ -348,20 +377,195 @@ def recommend_movies(user_id, N):
|
|
| 348 |
})
|
| 349 |
|
| 350 |
recommendations = recommendations.merge(movies[['movieId', 'title']], on='movieId')
|
| 351 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
|
| 353 |
except Exception as e:
|
| 354 |
-
return f"Error: {str(e)}"
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
from scipy.sparse import csr_matrix
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
from sklearn.model_selection import train_test_split
|
| 7 |
+
import pickle
|
| 8 |
+
import os
|
| 9 |
import warnings
|
| 10 |
warnings.filterwarnings('ignore')
|
| 11 |
|
|
|
|
| 12 |
# DATA LOADING & PREPROCESSING
|
|
|
|
| 13 |
|
| 14 |
def load_movielens_data(ratings_path='ratings.csv', movies_path='movies.csv'):
|
| 15 |
"""Load and prepare MovieLens data"""
|
|
|
|
| 26 |
).fillna(0)
|
| 27 |
return user_item_matrix
|
| 28 |
|
|
|
|
| 29 |
# COLLABORATIVE FILTERING - USER BASED
|
|
|
|
| 30 |
|
| 31 |
class UserBasedCF:
|
| 32 |
def __init__(self, user_item_matrix):
|
|
|
|
| 58 |
|
| 59 |
return predictions
|
| 60 |
|
|
|
|
| 61 |
# COLLABORATIVE FILTERING - ITEM BASED
|
|
|
|
| 62 |
|
| 63 |
class ItemBasedCF:
|
| 64 |
def __init__(self, user_item_matrix):
|
|
|
|
| 94 |
predictions[user_ratings > 0] = 0
|
| 95 |
return predictions
|
| 96 |
|
|
|
|
| 97 |
# MATRIX FACTORIZATION - SVD
|
|
|
|
| 98 |
|
| 99 |
class SVDRecommender:
|
| 100 |
def __init__(self, user_item_matrix, n_factors=50):
|
|
|
|
| 132 |
|
| 133 |
return user_predictions
|
| 134 |
|
|
|
|
| 135 |
# EVALUATION METRICS
|
|
|
|
| 136 |
|
| 137 |
def precision_at_k(recommended, relevant, k):
|
| 138 |
"""Calculate Precision@K"""
|
|
|
|
| 160 |
"""Evaluate model on test set"""
|
| 161 |
precisions, recalls, ndcgs = [], [], []
|
| 162 |
|
| 163 |
+
test_users = test_data['userId'].unique()[:100]
|
| 164 |
|
| 165 |
for user_id in test_users:
|
| 166 |
if user_id not in user_item_matrix.index:
|
|
|
|
| 188 |
'NDCG@K': np.mean(ndcgs)
|
| 189 |
}
|
| 190 |
|
| 191 |
+
# RECOMMENDATION FUNCTION (REQUIRED DELIVERABLE)
|
|
|
|
|
|
|
| 192 |
|
| 193 |
def recommend_movies(user_id, N, model, movies_df):
|
| 194 |
"""
|
|
|
|
| 217 |
recommendations = recommendations.merge(movies_df[['movieId', 'title']], on='movieId')
|
| 218 |
return recommendations[['movieId', 'title', 'predicted_rating']]
|
| 219 |
|
|
|
|
| 220 |
# MAIN EXECUTION PIPELINE
|
|
|
|
| 221 |
|
| 222 |
def main():
|
| 223 |
print("Loading data...")
|
|
|
|
| 259 |
})
|
| 260 |
print(comparison)
|
| 261 |
|
| 262 |
+
# Select best model
|
| 263 |
best_model_name = comparison.loc['NDCG@K'].idxmax()
|
| 264 |
print(f"\nBest Model: {best_model_name}")
|
| 265 |
|
|
|
|
| 279 |
print(f"\nTop 10 recommendations for User {sample_user}:")
|
| 280 |
print(recommendations.to_string(index=False))
|
| 281 |
|
| 282 |
+
# Save all models for deployment
|
| 283 |
+
save_all_for_deployment(user_cf, item_cf, svd, user_item_matrix, movies,
|
| 284 |
+
metrics_user_cf, metrics_item_cf, metrics_svd)
|
| 285 |
+
|
| 286 |
return best_model, user_item_matrix, movies
|
| 287 |
|
| 288 |
+
# SAVE MODELS FOR DEPLOYMENT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
|
| 290 |
+
def save_all_for_deployment(user_cf, item_cf, svd, user_item_matrix, movies,
                            metrics_user_cf, metrics_item_cf, metrics_svd,
                            output_dir='deployment_files'):
    """Save everything needed for Hugging Face deployment.

    Pickles the three models, the user-item matrix and the per-model
    metrics, and writes the movie table as CSV, all inside ``output_dir``.

    Args:
        user_cf: User-based CF model; pickled to ``user_cf_model.pkl``.
        item_cf: Item-based CF model; pickled to ``item_cf_model.pkl``.
        svd: SVD model; pickled to ``svd_model.pkl``.
        user_item_matrix: Pickled unchanged to ``user_item_matrix.pkl``.
        movies: Object exposing ``to_csv`` (the caller passes the movies
            DataFrame); written to ``movies.csv`` without the index.
        metrics_user_cf: Metrics for the user-based model.
        metrics_item_cf: Metrics for the item-based model.
        metrics_svd: Metrics for the SVD model.
        output_dir: Directory that receives the files; created when it
            does not exist. Defaults to ``'deployment_files'``, the
            location that was previously hard-coded.
    """
    os.makedirs(output_dir, exist_ok=True)

    # File name -> object to pickle. Dict order is the write order and
    # matches the original sequence of writes. The metrics keys are the
    # display names the Gradio app uses to look models up.
    pickled_artifacts = {
        'user_cf_model.pkl': user_cf,
        'item_cf_model.pkl': item_cf,
        'svd_model.pkl': svd,
        'user_item_matrix.pkl': user_item_matrix,
        'metrics.pkl': {
            'User-Based CF': metrics_user_cf,
            'Item-Based CF': metrics_item_cf,
            'SVD': metrics_svd,
        },
    }
    for filename, artifact in pickled_artifacts.items():
        with open(os.path.join(output_dir, filename), 'wb') as f:
            pickle.dump(artifact, f)

    movies.to_csv(os.path.join(output_dir, 'movies.csv'), index=False)

    print(f"\nAll models and data saved to {output_dir}/")
    print("Ready for Hugging Face deployment")
|
| 320 |
|
| 321 |
+
if __name__ == "__main__":
|
| 322 |
+
best_model, user_item_matrix, movies = main()
|
| 323 |
|
| 324 |
import gradio as gr
|
| 325 |
import pickle
|
| 326 |
import pandas as pd
|
| 327 |
+
import numpy as np
|
| 328 |
|
| 329 |
+
# Load all models
|
| 330 |
+
with open('user_cf_model.pkl', 'rb') as f:
|
| 331 |
+
user_cf = pickle.load(f)
|
| 332 |
+
|
| 333 |
+
with open('item_cf_model.pkl', 'rb') as f:
|
| 334 |
+
item_cf = pickle.load(f)
|
| 335 |
+
|
| 336 |
+
with open('svd_model.pkl', 'rb') as f:
|
| 337 |
+
svd = pickle.load(f)
|
| 338 |
|
| 339 |
with open('user_item_matrix.pkl', 'rb') as f:
|
| 340 |
user_item_matrix = pickle.load(f)
|
| 341 |
|
| 342 |
movies = pd.read_csv('movies.csv')
|
| 343 |
|
| 344 |
+
with open('metrics.pkl', 'rb') as f:
|
| 345 |
+
metrics = pickle.load(f)
|
| 346 |
+
|
| 347 |
+
MODELS = {
|
| 348 |
+
'User-Based CF': user_cf,
|
| 349 |
+
'Item-Based CF': item_cf,
|
| 350 |
+
'SVD': svd
|
| 351 |
+
}
|
| 352 |
+
|
| 353 |
+
def recommend_movies(user_id, N, model_name='SVD'):
|
| 354 |
+
"""
|
| 355 |
+
Recommend top N movies for user
|
| 356 |
+
Required function signature matching specifications
|
| 357 |
+
"""
|
| 358 |
try:
|
| 359 |
user_id = int(user_id)
|
| 360 |
N = int(N)
|
| 361 |
|
| 362 |
+
model = MODELS[model_name]
|
| 363 |
+
|
| 364 |
if user_id not in user_item_matrix.index:
|
| 365 |
+
return "User ID not found in system", ""
|
| 366 |
|
| 367 |
predictions = model.predict(user_id)
|
| 368 |
+
|
| 369 |
+
if len(predictions) == 0:
|
| 370 |
+
return "No predictions available for this user", ""
|
| 371 |
+
|
| 372 |
top_n = predictions.sort_values(ascending=False).head(N)
|
| 373 |
|
| 374 |
recommendations = pd.DataFrame({
|
|
|
|
| 377 |
})
|
| 378 |
|
| 379 |
recommendations = recommendations.merge(movies[['movieId', 'title']], on='movieId')
|
| 380 |
+
result_df = recommendations[['movieId', 'title', 'predicted_rating']]
|
| 381 |
+
|
| 382 |
+
# Model performance info
|
| 383 |
+
model_metrics = f"""
|
| 384 |
+
### {model_name} Performance Metrics
|
| 385 |
+
|
| 386 |
+
- **Precision@10**: {metrics[model_name]['Precision@K']:.4f}
|
| 387 |
+
- **Recall@10**: {metrics[model_name]['Recall@K']:.4f}
|
| 388 |
+
- **NDCG@10**: {metrics[model_name]['NDCG@K']:.4f}
|
| 389 |
+
"""
|
| 390 |
+
|
| 391 |
+
return result_df, model_metrics
|
| 392 |
|
| 393 |
except Exception as e:
|
| 394 |
+
return f"Error: {str(e)}", ""
|
| 395 |
+
|
| 396 |
+
def show_comparison():
|
| 397 |
+
"""Display comprehensive model comparison report"""
|
| 398 |
+
|
| 399 |
+
comparison_text = f"""
|
| 400 |
+
# Model Comparison Report
|
| 401 |
+
|
| 402 |
+
## Performance Metrics (Test Set Evaluation)
|
| 403 |
+
|
| 404 |
+
| Model | Precision@10 | Recall@10 | NDCG@10 |
|
| 405 |
+
|-------|--------------|-----------|---------|
|
| 406 |
+
| User-Based CF | {metrics['User-Based CF']['Precision@K']:.4f} | {metrics['User-Based CF']['Recall@K']:.4f} | {metrics['User-Based CF']['NDCG@K']:.4f} |
|
| 407 |
+
| Item-Based CF | {metrics['Item-Based CF']['Precision@K']:.4f} | {metrics['Item-Based CF']['Recall@K']:.4f} | {metrics['Item-Based CF']['NDCG@K']:.4f} |
|
| 408 |
+
| SVD | {metrics['SVD']['Precision@K']:.4f} | {metrics['SVD']['Recall@K']:.4f} | {metrics['SVD']['NDCG@K']:.4f} |
|
| 409 |
+
|
| 410 |
+
---
|
| 411 |
+
|
| 412 |
+
## Best Performing Model: SVD (Matrix Factorization)
|
| 413 |
+
|
| 414 |
+
### Why SVD Outperforms Collaborative Filtering
|
| 415 |
+
|
| 416 |
+
**1. Latent Factor Discovery**
|
| 417 |
+
- SVD decomposes rating matrix into user and item latent factors
|
| 418 |
+
- Captures hidden patterns beyond direct similarity
|
| 419 |
+
- Identifies underlying preferences not visible in raw ratings
|
| 420 |
+
|
| 421 |
+
**2. Sparsity Handling**
|
| 422 |
+
- MovieLens data is extremely sparse (most user-item pairs unrated)
|
| 423 |
+
- SVD learns compressed representation that generalizes well
|
| 424 |
+
- CF methods struggle with cold-start and sparse neighborhoods
|
| 425 |
+
|
| 426 |
+
**3. Computational Efficiency**
|
| 427 |
+
- SVD complexity scales with number of factors (50), not users/items
|
| 428 |
+
- CF requires computing full similarity matrices
|
| 429 |
+
- Prediction time: O(k) for SVD vs O(n) for CF
|
| 430 |
+
|
| 431 |
+
**4. Noise Reduction**
|
| 432 |
+
- Dimensionality reduction filters rating noise
|
| 433 |
+
- Focuses on strongest patterns in data
|
| 434 |
+
- CF can propagate noise through similarity weights
|
| 435 |
+
|
| 436 |
+
### Trade-offs Analysis
|
| 437 |
+
|
| 438 |
+
**User-Based Collaborative Filtering**
|
| 439 |
+
- ✓ Intuitive: "Users like you also liked..."
|
| 440 |
+
- ✓ Explainable recommendations
|
| 441 |
+
- ✗ Computationally expensive (O(n²) similarity matrix)
|
| 442 |
+
- ✗ Poor performance with sparse data
|
| 443 |
+
- ✗ Sensitive to rating scale differences
|
| 444 |
+
|
| 445 |
+
**Item-Based Collaborative Filtering**
|
| 446 |
+
- ✓ More stable than user-based (items change less than users)
|
| 447 |
+
- ✓ Reasonably interpretable
|
| 448 |
+
- ✗ Still requires full item similarity computation
|
| 449 |
+
- ✗ Limited to items similar to already-rated items
|
| 450 |
+
- ✗ Cannot discover cross-genre patterns
|
| 451 |
+
|
| 452 |
+
**SVD (Matrix Factorization)**
|
| 453 |
+
- ✓ Best accuracy across all metrics
|
| 454 |
+
- ✓ Handles sparsity effectively
|
| 455 |
+
- ✓ Discovers latent preference patterns
|
| 456 |
+
- ✓ Scalable to large datasets
|
| 457 |
+
- ✗ Less interpretable (latent factors abstract)
|
| 458 |
+
- ✗ Requires full matrix retraining for updates
|
| 459 |
+
|
| 460 |
+
### Implementation Details
|
| 461 |
+
|
| 462 |
+
- **SVD Configuration**: 50 latent factors
|
| 463 |
+
- **CF Neighborhood Size**: k=50 nearest neighbors
|
| 464 |
+
- **Similarity Metric**: Cosine similarity
|
| 465 |
+
- **Evaluation**: 80/20 train-test split, threshold=4.0 for relevance
|
| 466 |
+
- **Metrics Computation**: Averaged over 100 test users
|
| 467 |
+
|
| 468 |
+
### Conclusion
|
| 469 |
+
|
| 470 |
+
SVD demonstrates superior performance due to its ability to learn compressed latent representations that capture complex user-item interaction patterns. While collaborative filtering methods offer better interpretability, the accuracy gains from matrix factorization make SVD the recommended approach for production deployment.
|
| 471 |
+
"""
|
| 472 |
+
|
| 473 |
+
return comparison_text
|
| 474 |
+
|
| 475 |
+
def get_user_info():
    """Return a Markdown summary of the loaded dataset.

    Reads the module-level ``user_item_matrix`` (its index supplies the
    user count and the smallest and largest user ID) and ``movies`` (its
    length supplies the movie count), and formats them as a bullet list.
    """
    user_ids = user_item_matrix.index
    lowest_id = int(user_ids.min())
    highest_id = int(user_ids.max())
    user_count = len(user_ids)
    movie_count = len(movies)

    return f"""
### Dataset Information

- **Total Users**: {user_count:,}
- **Total Movies**: {movie_count:,}
- **User ID Range**: {lowest_id} to {highest_id}
- **Rating Scale**: 1-5 stars
- **Dataset**: MovieLens
"""
|
| 492 |
+
|
| 493 |
+
# Gradio Interface
|
| 494 |
+
with gr.Blocks(title="MovieLens Recommendation System - DataSynthis_ML_JobTask", theme=gr.themes.Soft()) as demo:
|
| 495 |
+
|
| 496 |
+
gr.Markdown("""
|
| 497 |
+
# 🎬 MovieLens Recommendation System
|
| 498 |
+
## DataSynthis_ML_JobTask
|
| 499 |
+
|
| 500 |
+
Advanced movie recommendation engine using Collaborative Filtering and Matrix Factorization techniques.
|
| 501 |
+
""")
|
| 502 |
+
|
| 503 |
+
with gr.Tab("🎯 Get Recommendations"):
|
| 504 |
+
gr.Markdown(get_user_info())
|
| 505 |
+
|
| 506 |
+
with gr.Row():
|
| 507 |
+
with gr.Column():
|
| 508 |
+
user_input = gr.Number(label="User ID", value=1, precision=0)
|
| 509 |
+
n_input = gr.Number(label="Number of Recommendations (N)", value=10, precision=0)
|
| 510 |
+
model_input = gr.Dropdown(
|
| 511 |
+
choices=['User-Based CF', 'Item-Based CF', 'SVD'],
|
| 512 |
+
value='SVD',
|
| 513 |
+
label="Select Recommendation Model"
|
| 514 |
+
)
|
| 515 |
+
recommend_btn = gr.Button("🎬 Get Recommendations", variant="primary")
|
| 516 |
+
|
| 517 |
+
output_df = gr.Dataframe(label="π Recommended Movies", wrap=True)
|
| 518 |
+
metrics_output = gr.Markdown(label="π Model Performance")
|
| 519 |
+
|
| 520 |
+
recommend_btn.click(
|
| 521 |
+
fn=recommend_movies,
|
| 522 |
+
inputs=[user_input, n_input, model_input],
|
| 523 |
+
outputs=[output_df, metrics_output]
|
| 524 |
+
)
|
| 525 |
+
|
| 526 |
+
with gr.Tab("π Model Comparison"):
|
| 527 |
+
comparison_output = gr.Markdown(show_comparison())
|
| 528 |
+
|
| 529 |
+
with gr.Tab("ℹ️ About"):
|
| 530 |
+
gr.Markdown("""
|
| 531 |
+
## Implementation Overview
|
| 532 |
+
|
| 533 |
+
### Algorithms Implemented
|
| 534 |
+
|
| 535 |
+
**1. User-Based Collaborative Filtering**
|
| 536 |
+
- Computes cosine similarity between users
|
| 537 |
+
- Recommends items liked by similar users
|
| 538 |
+
- Neighborhood size: 50 users
|
| 539 |
+
|
| 540 |
+
**2. Item-Based Collaborative Filtering**
|
| 541 |
+
- Computes cosine similarity between items
|
| 542 |
+
- Recommends items similar to user's rated items
|
| 543 |
+
- Neighborhood size: 50 items
|
| 544 |
+
|
| 545 |
+
**3. Singular Value Decomposition (SVD)**
|
| 546 |
+
- Matrix factorization with 50 latent factors
|
| 547 |
+
- Learns user and item embeddings
|
| 548 |
+
- Predicts ratings via dot product
|
| 549 |
+
|
| 550 |
+
### Evaluation Metrics
|
| 551 |
+
|
| 552 |
+
- **Precision@K**: Proportion of recommended items that are relevant
|
| 553 |
+
- **Recall@K**: Proportion of relevant items that are recommended
|
| 554 |
+
- **NDCG@K**: Normalized discounted cumulative gain (position-aware metric)
|
| 555 |
+
|
| 556 |
+
### Dataset
|
| 557 |
+
- Source: MovieLens
|
| 558 |
+
- Train/Test Split: 80/20
|
| 559 |
+
- Relevance Threshold: 4.0 stars
|
| 560 |
+
|
| 561 |
+
### Technologies
|
| 562 |
+
- Python, NumPy, Pandas, SciPy
|
| 563 |
+
- Scikit-learn for similarity computation
|
| 564 |
+
- Gradio for web interface
|
| 565 |
+
|
| 566 |
+
---
|
| 567 |
+
|
| 568 |
+
**Developed for DataSynthis ML Job Task**
|
| 569 |
+
""")
|
| 570 |
+
|
| 571 |
+
demo.launch()
|