LvMAC committed on
Commit
6a8179a
·
verified ·
1 Parent(s): 863f720

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +418 -667
app.py CHANGED
@@ -1,725 +1,476 @@
1
- # ============================================================================
2
- # MOVIELENS RECOMMENDATION SYSTEM - PURE IMPLEMENTATION
3
- # ============================================================================
4
-
5
- import numpy as np
6
  import pandas as pd
7
- from scipy.sparse.linalg import svds
8
- from sklearn.metrics.pairwise import cosine_similarity
9
- from sklearn.model_selection import train_test_split
 
 
10
  import pickle
11
  import os
12
- import warnings
13
- warnings.filterwarnings('ignore')
14
-
15
- # ============================================================================
16
- # DATA LOADING & PREPROCESSING
17
- # ============================================================================
18
-
19
def load_movielens_data(ratings_path='ratings.csv', movies_path='movies.csv'):
    """Read the MovieLens ratings and movies CSV files.

    Prints a short summary of the loaded data before returning.

    Args:
        ratings_path: CSV with columns userId, movieId, rating.
        movies_path: CSV with columns movieId, title (optionally genres).

    Returns:
        Tuple of (ratings DataFrame, movies DataFrame).
    """
    ratings_df = pd.read_csv(ratings_path)
    movies_df = pd.read_csv(movies_path)

    rating_col = ratings_df['rating']
    print(f"Loaded {len(ratings_df)} ratings")
    print(f"Loaded {len(movies_df)} movies")
    print(f"Users: {ratings_df['userId'].nunique()}")
    print(f"Rating distribution:\n{rating_col.value_counts().sort_index()}")
    print(f"Mean rating: {rating_col.mean():.3f}")
    print(f"Median rating: {rating_col.median():.3f}")

    return ratings_df, movies_df
32
-
33
def create_user_item_matrix(ratings):
    """Pivot long-format ratings into a dense users x movies matrix.

    Missing ratings become 0, which downstream code interprets as
    "not rated". Prints the matrix shape and its sparsity percentage.

    Args:
        ratings: DataFrame with userId, movieId and rating columns.

    Returns:
        DataFrame indexed by userId with movieId columns.
    """
    matrix = ratings.pivot_table(
        index='userId',
        columns='movieId',
        values='rating'
    ).fillna(0)

    n_rows, n_cols = matrix.shape
    filled = (matrix > 0).sum().sum()
    sparsity = 100 * (1 - filled / (n_rows * n_cols))
    print(f"Matrix shape: {matrix.shape}")
    print(f"Sparsity: {sparsity:.2f}%")

    return matrix
46
 
47
- # ============================================================================
48
- # USER-BASED COLLABORATIVE FILTERING
49
- # ============================================================================
50
-
51
class UserBasedCF:
    """User-based collaborative filtering on a dense user-item matrix.

    Users are compared by cosine similarity of their rating rows;
    predictions are similarity-weighted averages of neighbours' ratings.
    """

    def __init__(self, user_item_matrix):
        self.matrix = user_item_matrix
        self.user_similarity = None  # filled in by fit()

    def fit(self):
        """Precompute the user-user cosine similarity matrix."""
        print("Computing user similarity matrix...")
        sims = cosine_similarity(self.matrix)
        np.fill_diagonal(sims, 0)  # a user must not be their own neighbour
        self.user_similarity = sims
        print("User similarity matrix computed")

    def predict(self, user_id, k=50):
        """Score every movie for user_id from their k nearest users.

        Returns a Series indexed by movieId; movies the user already
        rated are forced to 0. Unknown users get an empty Series.
        """
        if user_id not in self.matrix.index:
            return pd.Series(dtype=float)

        row = self.matrix.index.get_loc(user_id)
        sims = self.user_similarity[row]

        # k most similar users, keeping only positive similarities.
        neighbours = np.argsort(sims)[::-1][:k]
        weights = sims[neighbours]
        keep = weights > 0
        neighbours, weights = neighbours[keep], weights[keep]

        if len(neighbours) == 0:
            return pd.Series(0, index=self.matrix.columns, dtype=float)

        # Weighted average of the neighbours' ratings per movie.
        neighbour_ratings = self.matrix.iloc[neighbours]
        scores = neighbour_ratings.T.dot(weights) / (np.sum(weights) + 1e-10)

        # Never re-recommend what the user already rated.
        scores[self.matrix.loc[user_id] > 0] = 0
        return scores
100
-
101
- # ============================================================================
102
- # ITEM-BASED COLLABORATIVE FILTERING
103
- # ============================================================================
104
-
105
class ItemBasedCF:
    """Item-based collaborative filtering on a dense user-item matrix.

    Scores unseen movies by accumulating similarity-weighted
    contributions from the movies the user has already rated.
    """

    def __init__(self, user_item_matrix):
        self.matrix = user_item_matrix
        self.item_similarity = None  # filled in by fit()

    def fit(self):
        """Precompute the item-item cosine similarity matrix."""
        print("Computing item similarity matrix...")
        sims = cosine_similarity(self.matrix.T)
        np.fill_diagonal(sims, 0)  # an item must not be its own neighbour
        self.item_similarity = sims
        print("Item similarity matrix computed")

    def predict(self, user_id, k=50):
        """Score every movie for user_id from their rated items.

        Returns a Series indexed by movieId (0 for already-rated
        movies); unknown users get an empty Series.
        """
        if user_id not in self.matrix.index:
            return pd.Series(dtype=float)

        user_ratings = self.matrix.loc[user_id]
        rated = user_ratings[user_ratings > 0]

        if len(rated) == 0:
            return pd.Series(0, index=self.matrix.columns, dtype=float)

        scores = pd.Series(0.0, index=self.matrix.columns)

        for movie_id, rating in rated.items():
            col = self.matrix.columns.get_loc(movie_id)
            sims = self.item_similarity[col]

            # Spread this rating onto the k most similar, still-unseen movies.
            for neighbour in np.argsort(sims)[::-1][:k]:
                neighbour_id = self.matrix.columns[neighbour]
                sim = sims[neighbour]
                if sim > 0 and user_ratings[neighbour_id] == 0:
                    scores[neighbour_id] += sim * rating

        # Never re-recommend what the user already rated.
        scores[user_ratings > 0] = 0
        return scores
150
-
151
- # ============================================================================
152
- # SINGULAR VALUE DECOMPOSITION (SVD)
153
- # ============================================================================
154
-
155
class SVDRecommender:
    """Rating prediction via truncated SVD of the mean-centred matrix."""

    def __init__(self, user_item_matrix, n_factors=50):
        self.matrix = user_item_matrix
        self.n_factors = n_factors  # number of latent factors kept
        self.predictions = None     # filled in by fit()

    def fit(self):
        """Factorise the matrix and cache the reconstructed ratings."""
        print(f"Performing SVD with {self.n_factors} factors...")

        values = self.matrix.values
        # Centre only the observed (non-zero) ratings around the global mean.
        global_mean = np.mean(values[np.where(values != 0)])
        centred = values.copy()
        centred[centred != 0] -= global_mean

        # Truncated SVD keeps the n_factors largest singular values.
        U, sigma, Vt = svds(centred, k=self.n_factors)
        reconstructed = np.dot(U, np.diag(sigma)).dot(Vt) + global_mean

        self.predictions = pd.DataFrame(
            reconstructed,
            index=self.matrix.index,
            columns=self.matrix.columns
        )

        print("SVD decomposition complete")

    def predict(self, user_id):
        """Return predicted ratings for user_id (0 for rated movies)."""
        if user_id not in self.predictions.index:
            return pd.Series(dtype=float)

        scores = self.predictions.loc[user_id].copy()
        # Never re-recommend what the user already rated.
        scores[self.matrix.loc[user_id] > 0] = 0
        return scores
199
-
200
- # ============================================================================
201
- # EVALUATION METRICS
202
- # ============================================================================
203
-
204
def precision_at_k(recommended, relevant, k):
    """Precision@K: share of the top-k recommendations that are relevant.

    Returns 0.0 when k is 0.
    """
    if k == 0:
        return 0.0
    hits = set(recommended[:k]) & set(relevant)
    return len(hits) / k
213
-
214
def recall_at_k(recommended, relevant, k):
    """Recall@K: share of the relevant items found in the top-k.

    Returns 0.0 when there are no relevant items.
    """
    relevant_set = set(relevant)
    if not relevant_set:
        return 0.0
    hits = set(recommended[:k]) & relevant_set
    return len(hits) / len(relevant_set)
223
-
224
def ndcg_at_k(recommended, relevant, k):
    """NDCG@K with binary relevance gains.

    DCG awards gain 1 at rank i discounted by log2(i + 2); the ideal
    DCG assumes all relevant items occupy the first ranks. Returns 0.0
    when the ideal DCG is zero (no relevant items).
    """
    dcg = sum(1.0 / np.log2(pos + 2)
              for pos, item in enumerate(recommended[:k]) if item in relevant)
    ideal_hits = min(len(relevant), k)
    idcg = sum(1.0 / np.log2(pos + 2) for pos in range(ideal_hits))
    return dcg / idcg if idcg != 0 else 0.0
237
-
238
def evaluate_model(model, test_data, user_item_matrix, k=10, threshold=4.0, max_users=100):
    """Evaluate a recommender with ranking metrics on held-out ratings.

    For each test user, items rated at or above ``threshold`` form the
    relevant set and the model's top-k predictions form the
    recommendations.

    Args:
        model: object with predict(user_id) -> Series of scores by movieId.
        test_data: DataFrame with userId, movieId and rating columns.
        user_item_matrix: training matrix; users missing from its index
            are skipped.
        k: cut-off for Precision/Recall/NDCG.
        threshold: minimum held-out rating for an item to count as relevant.
        max_users: stop after this many evaluated users (None = no limit).
            Previously hard-coded to 100; kept as the default so existing
            behaviour is unchanged while large evaluations stay cheap.

    Returns:
        dict with 'Precision@K', 'Recall@K' and 'NDCG@K' means (all 0.0
        when no user could be evaluated).
    """
    precisions = []
    recalls = []
    ndcgs = []

    test_users = test_data['userId'].unique()

    print(f"Evaluating on {len(test_users)} test users...")

    evaluated_count = 0
    for user_id in test_users:
        if user_id not in user_item_matrix.index:
            continue

        # Relevant = items this user rated at or above the threshold.
        user_test_data = test_data[test_data['userId'] == user_id]
        relevant_items = user_test_data[user_test_data['rating'] >= threshold]['movieId'].tolist()

        if len(relevant_items) == 0:
            continue

        predictions = model.predict(user_id)

        # Skip users the model produced nothing (or only zeros) for.
        if len(predictions) == 0 or predictions.sum() == 0:
            continue

        top_k_items = predictions.nlargest(k).index.tolist()

        precisions.append(precision_at_k(top_k_items, relevant_items, k))
        recalls.append(recall_at_k(top_k_items, relevant_items, k))
        ndcgs.append(ndcg_at_k(top_k_items, relevant_items, k))

        evaluated_count += 1

        if max_users is not None and evaluated_count >= max_users:
            break  # cap for computational efficiency

    print(f"Evaluated {evaluated_count} users")

    if len(precisions) == 0:
        return {
            'Precision@K': 0.0,
            'Recall@K': 0.0,
            'NDCG@K': 0.0
        }

    return {
        'Precision@K': np.mean(precisions),
        'Recall@K': np.mean(recalls),
        'NDCG@K': np.mean(ndcgs)
    }
293
-
294
- # ============================================================================
295
- # RECOMMENDATION FUNCTION
296
- # ============================================================================
297
-
298
def recommend_movies(user_id, N, model, movies_df):
    """Return the top-N movie recommendations for one user.

    Args:
        user_id: user to recommend for.
        N: number of movies to return.
        model: object with predict(user_id) -> Series of scores by movieId.
        movies_df: DataFrame with at least movieId and title columns.

    Returns:
        DataFrame with movieId, title and predicted_rating columns
        (empty when the model has no predictions for this user).
    """
    scores = model.predict(user_id)

    if len(scores) == 0:
        return pd.DataFrame(columns=['movieId', 'title', 'predicted_rating'])

    best = scores.nlargest(N)
    ranked = pd.DataFrame({
        'movieId': best.index,
        'predicted_rating': best.values
    })

    # Attach human-readable titles; left join keeps unknown ids as NaN.
    ranked = ranked.merge(
        movies_df[['movieId', 'title']],
        on='movieId',
        how='left'
    )

    return ranked[['movieId', 'title', 'predicted_rating']]
332
-
333
- # ============================================================================
334
- # MAIN EXECUTION
335
- # ============================================================================
336
-
337
def _print_metric_block(title, metric_values):
    """Print a '<title> Results:' header and one line per metric."""
    print(f"{title} Results:")
    for metric, value in metric_values.items():
        print(f" {metric}: {value:.4f}")

def main():
    """Run the full pipeline: load, split, train, evaluate, demo, save.

    Returns:
        (best_model, user_item_matrix, movies) for interactive use.
    """
    print("="*70)
    print("MOVIELENS RECOMMENDATION SYSTEM")
    print("="*70)

    print("\n[1/6] Loading data...")
    ratings, movies = load_movielens_data()

    print("\n[2/6] Splitting data (80% train, 20% test)...")
    train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)
    print(f"Training set: {len(train_data)} ratings")
    print(f"Test set: {len(test_data)} ratings")

    print("\n[3/6] Creating user-item matrix...")
    user_item_matrix = create_user_item_matrix(train_data)

    print("\n[4/6] Training User-Based Collaborative Filtering...")
    user_cf = UserBasedCF(user_item_matrix)
    user_cf.fit()
    print("Evaluating User-Based CF...")
    metrics_user_cf = evaluate_model(user_cf, test_data, user_item_matrix)
    _print_metric_block("User-Based CF", metrics_user_cf)

    print("\n[5/6] Training Item-Based Collaborative Filtering...")
    item_cf = ItemBasedCF(user_item_matrix)
    item_cf.fit()
    print("Evaluating Item-Based CF...")
    metrics_item_cf = evaluate_model(item_cf, test_data, user_item_matrix)
    _print_metric_block("Item-Based CF", metrics_item_cf)

    print("\n[6/6] Training SVD (Matrix Factorization)...")
    svd = SVDRecommender(user_item_matrix, n_factors=50)
    svd.fit()
    print("Evaluating SVD...")
    metrics_svd = evaluate_model(svd, test_data, user_item_matrix)
    _print_metric_block("SVD", metrics_svd)

    print("\n" + "="*70)
    print("MODEL COMPARISON")
    print("="*70)

    comparison_df = pd.DataFrame({
        'User-Based CF': metrics_user_cf,
        'Item-Based CF': metrics_item_cf,
        'SVD': metrics_svd
    })
    print(comparison_df.to_string())

    # Winner is whichever column maximises the NDCG@K row.
    best_model_name = comparison_df.loc['NDCG@K'].idxmax()
    print(f"\n*** Best Model (by NDCG@K): {best_model_name} ***")

    trained = {
        'User-Based CF': user_cf,
        'Item-Based CF': item_cf,
        'SVD': svd
    }
    best_model = trained[best_model_name]

    print("\n" + "="*70)
    print("EXAMPLE RECOMMENDATIONS")
    print("="*70)

    sample_user_id = user_item_matrix.index[0]
    print(f"\nTop 10 recommendations for User {sample_user_id} using {best_model_name}:")

    recommendations = recommend_movies(sample_user_id, 10, best_model, movies)
    print(recommendations.to_string(index=False))

    print("\n" + "="*70)
    print("SAVING MODELS FOR DEPLOYMENT")
    print("="*70)

    save_models_for_deployment(
        user_cf, item_cf, svd,
        user_item_matrix, movies,
        metrics_user_cf, metrics_item_cf, metrics_svd
    )

    return best_model, user_item_matrix, movies
432
-
433
def save_models_for_deployment(user_cf, item_cf, svd, user_item_matrix, movies,
                               metrics_user_cf, metrics_item_cf, metrics_svd,
                               output_dir='deployment_files'):
    """Pickle the trained models, matrix and metrics for deployment.

    Args:
        user_cf, item_cf, svd: trained recommender objects (picklable).
        user_item_matrix: training user-item matrix.
        movies: movies DataFrame (written as CSV, not pickled).
        metrics_user_cf, metrics_item_cf, metrics_svd: metric dicts.
        output_dir: target directory, created if missing. Defaults to
            the historical 'deployment_files' location so existing
            callers are unaffected.
    """
    os.makedirs(output_dir, exist_ok=True)

    print(f"Saving models to {output_dir}/...")

    with open(f'{output_dir}/user_cf_model.pkl', 'wb') as f:
        pickle.dump(user_cf, f)
    print(" ✓ User-Based CF model saved")

    with open(f'{output_dir}/item_cf_model.pkl', 'wb') as f:
        pickle.dump(item_cf, f)
    print(" ✓ Item-Based CF model saved")

    with open(f'{output_dir}/svd_model.pkl', 'wb') as f:
        pickle.dump(svd, f)
    print(" ✓ SVD model saved")

    with open(f'{output_dir}/user_item_matrix.pkl', 'wb') as f:
        pickle.dump(user_item_matrix, f)
    print(" ✓ User-item matrix saved")

    # Bundle all metric dicts under the UI-facing model names.
    metrics = {
        'User-Based CF': metrics_user_cf,
        'Item-Based CF': metrics_item_cf,
        'SVD': metrics_svd
    }

    with open(f'{output_dir}/metrics.pkl', 'wb') as f:
        pickle.dump(metrics, f)
    print(" ✓ Metrics saved")

    movies.to_csv(f'{output_dir}/movies.csv', index=False)
    print(" ✓ Movies data saved")

    print("\nAll files ready for Hugging Face deployment!")
472
-
473
# Run the full training pipeline when executed as a script; the gradio
# app below then serves the artifacts it just wrote.
if __name__ == "__main__":
    best_model, user_item_matrix, movies = main()

import gradio as gr
import pickle
import pandas as pd
import numpy as np
import os

# Prefer the packaged deployment artifacts; fall back to the working dir.
BASE_DIR = 'deployment_files' if os.path.exists('deployment_files') else '.'

print("Loading models...")

def _load_pickle(filename):
    """Unpickle a single artifact from BASE_DIR."""
    with open(f'{BASE_DIR}/{filename}', 'rb') as f:
        return pickle.load(f)

user_cf = _load_pickle('user_cf_model.pkl')
item_cf = _load_pickle('item_cf_model.pkl')
svd = _load_pickle('svd_model.pkl')
user_item_matrix = _load_pickle('user_item_matrix.pkl')

movies = pd.read_csv(f'{BASE_DIR}/movies.csv')

metrics = _load_pickle('metrics.pkl')

# UI-facing model registry keyed by display name.
MODELS = {
    'User-Based CF': user_cf,
    'Item-Based CF': item_cf,
    'SVD': svd
}

print("Models loaded successfully!")
511
-
512
def recommend_movies(user_id, N, model_name='SVD'):
    """Gradio handler: top-N recommendations plus a metrics summary.

    NOTE(review): this intentionally shadows the earlier standalone
    recommend_movies helper once the app section is loaded.

    Returns:
        (DataFrame, markdown str); failures come back as a one-column
        'Error' DataFrame instead of raising.
    """
    try:
        user_id = int(user_id)
        N = int(N)

        if user_id not in user_item_matrix.index:
            return pd.DataFrame({'Error': ['User ID not found in system']}), ""

        scores = MODELS[model_name].predict(user_id)

        if len(scores) == 0 or scores.sum() == 0:
            return pd.DataFrame({'Error': ['No predictions available for this user']}), ""

        best = scores.nlargest(N)
        table = pd.DataFrame({
            'movieId': best.index,
            'predicted_rating': best.values
        })

        # Attach titles for display.
        table = table.merge(
            movies[['movieId', 'title']],
            on='movieId',
            how='left'
        )
        table = table[['movieId', 'title', 'predicted_rating']]

        model_metrics = metrics[model_name]
        metrics_text = f"""
### {model_name} Performance Metrics

- **Precision@10**: {model_metrics['Precision@K']:.4f}
- **Recall@10**: {model_metrics['Recall@K']:.4f}
- **NDCG@10**: {model_metrics['NDCG@K']:.4f}

*Metrics evaluated on test set with relevance threshold = 4.0*
"""

        return table, metrics_text

    except Exception as e:
        return pd.DataFrame({'Error': [f'Error: {str(e)}']}), ""
559
 
560
def show_model_comparison():
    """Build the markdown comparison report for the comparison tab."""
    # Winner by NDCG@10 across the loaded metrics dict.
    best_model = max(metrics, key=lambda name: metrics[name]['NDCG@K'])

    report = f"""
# Model Comparison Report

## Performance Metrics

| Model | Precision@10 | Recall@10 | NDCG@10 |
|-------|--------------|-----------|---------|
| User-Based CF | {metrics['User-Based CF']['Precision@K']:.4f} | {metrics['User-Based CF']['Recall@K']:.4f} | {metrics['User-Based CF']['NDCG@K']:.4f} |
| Item-Based CF | {metrics['Item-Based CF']['Precision@K']:.4f} | {metrics['Item-Based CF']['Recall@K']:.4f} | {metrics['Item-Based CF']['NDCG@K']:.4f} |
| SVD | {metrics['SVD']['Precision@K']:.4f} | {metrics['SVD']['Recall@K']:.4f} | {metrics['SVD']['NDCG@K']:.4f} |

## Best Model: {best_model}

### Why {best_model} Performs Best

**Matrix Factorization (SVD) Advantages:**
- Captures latent factors in user-movie interactions
- Handles sparse data through dimensionality reduction
- Generalizes better than similarity-based methods
- Computationally efficient for prediction

**Collaborative Filtering Trade-offs:**
- **User-Based**: Intuitive but computationally expensive, struggles with sparsity
- **Item-Based**: More stable than user-based, but limited to similar items
- **SVD**: Best balance of accuracy and efficiency

### Implementation Details

- **SVD**: 50 latent factors via Singular Value Decomposition
- **CF**: Cosine similarity with k=50 neighbors
- **Evaluation**: 80/20 train-test split, threshold=4.0 for relevance
- **Metrics**: Precision, Recall, and NDCG at K=10

### Conclusion

SVD achieves the best performance by learning compressed representations of user preferences
and movie characteristics, making it the recommended approach for production deployment.
"""

    return report
607
 
608
def get_dataset_info():
    """Markdown summary of the dataset backing the demo UI."""
    user_ids = user_item_matrix.index
    num_users = len(user_ids)
    num_movies = len(movies)
    lo, hi = int(user_ids.min()), int(user_ids.max())

    return f"""
### Dataset Information

- **Total Users**: {num_users:,}
- **Total Movies**: {num_movies:,}
- **User ID Range**: {lo} to {hi}
- **Rating Scale**: 0.5 to 5.0 stars
- **Source**: MovieLens Dataset
"""
625
-
626
# Build Gradio Interface
# Three tabs: interactive recommendations, a static comparison report,
# and static documentation. The comparison/documentation markdown is
# rendered once at build time, not per request.
with gr.Blocks(title="MovieLens Recommendation System", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
# 🎬 MovieLens Recommendation System
## DataSynthis_ML_JobTask

Compare three recommendation algorithms: User-Based CF, Item-Based CF, and SVD Matrix Factorization
""")

    with gr.Tab("🎯 Get Recommendations"):
        # Dataset stats are computed once from the loaded matrix/movies.
        gr.Markdown(get_dataset_info())

        with gr.Row():
            with gr.Column():
                user_id_input = gr.Number(
                    label="User ID",
                    value=1,
                    precision=0,
                    info="Enter a valid user ID from the dataset"
                )
                n_input = gr.Number(
                    label="Number of Recommendations (N)",
                    value=10,
                    precision=0,
                    info="How many movies to recommend (1-20)"
                )
                model_select = gr.Dropdown(
                    choices=['User-Based CF', 'Item-Based CF', 'SVD'],
                    value='SVD',
                    label="Recommendation Algorithm",
                    info="Select which model to use"
                )

        recommend_btn = gr.Button("🎬 Get Recommendations", variant="primary", size="lg")

        recommendations_output = gr.Dataframe(
            label="📋 Recommended Movies",
            wrap=True
        )

        metrics_output = gr.Markdown(label="📊 Model Performance")

        # Wire the button to the gradio recommend_movies handler.
        recommend_btn.click(
            fn=recommend_movies,
            inputs=[user_id_input, n_input, model_select],
            outputs=[recommendations_output, metrics_output]
        )

    with gr.Tab("📊 Model Comparison"):
        gr.Markdown(show_model_comparison())

    with gr.Tab("ℹ️ Documentation"):
        gr.Markdown("""
## Implementation Overview

### Algorithms

**1. User-Based Collaborative Filtering**
- Finds users with similar rating patterns
- Recommends items liked by similar users
- Uses cosine similarity with k=50 neighbors

**2. Item-Based Collaborative Filtering**
- Finds items similar to those the user has rated
- Recommends items similar to user's preferences
- Uses cosine similarity with k=50 neighbors

**3. Singular Value Decomposition (SVD)**
- Matrix factorization with 50 latent factors
- Learns low-dimensional representations of users and items
- Predicts ratings via reconstructed matrix

### Evaluation Metrics

- **Precision@K**: Fraction of recommended items that are relevant
- **Recall@K**: Fraction of relevant items that are recommended
- **NDCG@K**: Normalized Discounted Cumulative Gain (considers ranking order)

### Technical Stack

- Python 3.10+
- NumPy, Pandas for data processing
- SciPy for SVD computation
- Scikit-learn for similarity metrics
- Gradio for web interface

### Dataset

- Source: MovieLens
- Split: 80% training, 20% testing
- Relevance Threshold: 4.0 stars

---

**Project**: DataSynthis ML Job Task
**Task**: Movie Recommendation System
""")

demo.launch()
 
 
 
 
 
 
1
  import pandas as pd
2
+ import numpy as np
3
+ from surprise import SVD, SVDpp, NMF, KNNBasic, Dataset, Reader
4
+ from surprise.model_selection import train_test_split, GridSearchCV
5
+ from collections import defaultdict
6
+ import gradio as gr
7
  import pickle
8
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
class MovieRecommenderEnsemble:
    """Trains and serves several Surprise recommenders over MovieLens data."""

    def __init__(self, ratings_path, movies_path):
        """Load the CSVs, build the Surprise dataset and train every model.

        Args:
            ratings_path: CSV with userId, movieId, rating columns.
            movies_path: CSV with movieId, title (and genres) columns.
        """
        print("Loading data...")
        self.ratings = pd.read_csv(ratings_path)
        self.movies = pd.read_csv(movies_path)

        # Prepare Surprise dataset; the Reader needs the explicit rating
        # scale to interpret the dataframe values.
        reader = Reader(rating_scale=(0.5, 5.0))
        self.data = Dataset.load_from_df(
            self.ratings[['userId', 'movieId', 'rating']],
            reader
        )

        # Train-test split for evaluation (20% held out).
        # NOTE(review): no random_state is passed, so the split — and all
        # downstream metrics — will differ between runs; confirm whether
        # that is intended.
        self.trainset, self.testset = train_test_split(self.data, test_size=0.2)

        # Initialize and immediately train all models (slow for large data).
        self.models = {}
        self.train_all_models()
29
 
30
+ def train_all_models(self):
31
+ """Train all models with optimal hyperparameters for MovieLens 1M"""
 
 
 
 
 
 
 
 
 
 
32
 
33
+ print("\n" + "="*50)
34
+ print("Training User-Based Collaborative Filtering...")
35
+ print("="*50)
 
 
 
36
 
37
+ # User-Based CF - Optimal for 1M dataset
38
+ user_based_options = {
39
+ 'name': 'cosine',
40
+ 'user_based': True,
41
+ 'min_support': 5
42
+ }
43
+ self.models['user_based_cf'] = KNNBasic(
44
+ k=50,
45
+ sim_options=user_based_options
46
+ )
47
+ self.models['user_based_cf'].fit(self.trainset)
48
+ print("✓ User-Based CF trained")
49
+
50
+ print("\n" + "="*50)
51
+ print("Training Item-Based Collaborative Filtering...")
52
+ print("="*50)
53
+
54
+ # Item-Based CF - Optimal for 1M dataset
55
+ item_based_options = {
56
+ 'name': 'cosine',
57
+ 'user_based': False,
58
+ 'min_support': 5
59
+ }
60
+ self.models['item_based_cf'] = KNNBasic(
61
+ k=40,
62
+ sim_options=item_based_options
63
+ )
64
+ self.models['item_based_cf'].fit(self.trainset)
65
+ print("✓ Item-Based CF trained")
66
+
67
+ print("\n" + "="*50)
68
+ print("Training SVD (Matrix Factorization)...")
69
+ print("="*50)
70
+
71
+ # SVD - Tuned for 1M dataset
72
+ self.models['svd'] = SVD(
73
+ n_factors=150,
74
+ n_epochs=30,
75
+ lr_all=0.007,
76
+ reg_all=0.05,
77
+ random_state=42,
78
+ verbose=True
79
+ )
80
+ self.models['svd'].fit(self.trainset)
81
+ print("✓ SVD trained")
82
+
83
+ print("\n" + "="*50)
84
+ print("Training SVD++ (Enhanced Matrix Factorization)...")
85
+ print("="*50)
86
+
87
+ # SVD++ - Includes implicit feedback
88
+ self.models['svdpp'] = SVDpp(
89
+ n_factors=100,
90
+ n_epochs=20,
91
+ lr_all=0.007,
92
+ reg_all=0.05,
93
+ random_state=42,
94
+ verbose=True
95
+ )
96
+ self.models['svdpp'].fit(self.trainset)
97
+ print("✓ SVD++ trained")
98
+
99
+ print("\n" + "="*50)
100
+ print("Training NMF (Non-negative Matrix Factorization)...")
101
+ print("="*50)
102
+
103
+ # NMF - Alternative factorization
104
+ self.models['nmf'] = NMF(
105
+ n_factors=50,
106
+ n_epochs=50,
107
+ random_state=42,
108
+ verbose=True
109
+ )
110
+ self.models['nmf'].fit(self.trainset)
111
+ print("✓ NMF trained")
112
 
113
+ print("\n" + "="*50)
114
+ print("All models trained successfully!")
115
+ print("="*50)
116
 
117
+ def evaluate_models(self):
118
+ """Evaluate all models on test set"""
119
+ print("\n" + "="*50)
120
+ print("EVALUATING ALL MODELS")
121
+ print("="*50)
122
 
123
+ results = {}
124
 
125
+ for name, model in self.models.items():
126
+ print(f"\nEvaluating {name.upper()}...")
 
127
 
128
+ # Get predictions
129
+ predictions = model.test(self.testset)
130
 
131
+ # Calculate RMSE and MAE
132
+ rmse = self.calculate_rmse(predictions)
133
+ mae = self.calculate_mae(predictions)
134
+
135
+ # Calculate Precision@10, Recall@10, NDCG@10
136
+ precision, recall, ndcg = self.calculate_ranking_metrics(predictions, k=10)
137
+
138
+ results[name] = {
139
+ 'RMSE': rmse,
140
+ 'MAE': mae,
141
+ 'Precision@10': precision,
142
+ 'Recall@10': recall,
143
+ 'NDCG@10': ndcg
144
+ }
145
+
146
+ print(f" RMSE: {rmse:.4f}")
147
+ print(f" MAE: {mae:.4f}")
148
+ print(f" Precision@10: {precision:.4f}")
149
+ print(f" Recall@10: {recall:.4f}")
150
+ print(f" NDCG@10: {ndcg:.4f}")
151
+
152
+ # Determine best model
153
+ best_model = max(results.items(), key=lambda x: x[1]['Precision@10'])
154
+ print(f"\n{'='*50}")
155
+ print(f"BEST MODEL: {best_model[0].upper()}")
156
+ print(f"Precision@10: {best_model[1]['Precision@10']:.4f}")
157
+ print(f"{'='*50}\n")
158
+
159
+ return results, best_model[0]
160
+
161
+ def calculate_rmse(self, predictions):
162
+ """Calculate Root Mean Square Error"""
163
+ mse = np.mean([(pred.est - pred.r_ui)**2 for pred in predictions])
164
+ return np.sqrt(mse)
165
+
166
+ def calculate_mae(self, predictions):
167
+ """Calculate Mean Absolute Error"""
168
+ return np.mean([abs(pred.est - pred.r_ui) for pred in predictions])
169
+
170
+ def calculate_ranking_metrics(self, predictions, k=10, threshold=4.0):
171
+ """Calculate Precision@K, Recall@K, and NDCG@K"""
172
+
173
+ # Organize predictions by user
174
+ user_est_true = defaultdict(list)
175
+ for uid, _, true_r, est, _ in predictions:
176
+ user_est_true[uid].append((est, true_r))
177
+
178
+ precisions = []
179
+ recalls = []
180
+ ndcgs = []
181
+
182
+ for uid, user_ratings in user_est_true.items():
183
+ # Sort by estimated rating
184
+ user_ratings.sort(key=lambda x: x[0], reverse=True)
185
+
186
+ # Top k predictions
187
+ top_k = user_ratings[:k]
188
+
189
+ # Calculate metrics
190
+ n_rel = sum(1 for (_, true_r) in user_ratings if true_r >= threshold)
191
+ n_rec_k = sum(1 for (est, _) in top_k if est >= threshold)
192
+ n_rel_and_rec_k = sum(1 for (est, true_r) in top_k
193
+ if true_r >= threshold and est >= threshold)
194
+
195
+ # Precision@K
196
+ precision = n_rel_and_rec_k / k if k > 0 else 0
197
+ precisions.append(precision)
198
+
199
+ # Recall@K
200
+ recall = n_rel_and_rec_k / n_rel if n_rel > 0 else 0
201
+ recalls.append(recall)
202
+
203
+ # NDCG@K
204
+ dcg = sum((2**true_r - 1) / np.log2(i + 2)
205
+ for i, (est, true_r) in enumerate(top_k) if true_r >= threshold)
206
+ ideal_ratings = sorted([true_r for _, true_r in user_ratings], reverse=True)[:k]
207
+ idcg = sum((2**true_r - 1) / np.log2(i + 2)
208
+ for i, true_r in enumerate(ideal_ratings) if true_r >= threshold)
209
+ ndcg = dcg / idcg if idcg > 0 else 0
210
+ ndcgs.append(ndcg)
211
+
212
+ return np.mean(precisions), np.mean(recalls), np.mean(ndcgs)
213
+
214
def recommend_movies(self, user_id, N, model_name='svd'):
    """
    Recommend top N movies for a user using the specified model.

    Args:
        user_id: User ID
        N: Number of recommendations
        model_name: 'user_based_cf', 'item_based_cf', 'svd', 'svdpp', 'nmf',
            or 'ensemble'

    Returns:
        List of dicts with rank/movieId/title/genres/predicted_rating,
        or an error message string when the model name is unknown.
    """
    if model_name == 'ensemble':
        return self.recommend_ensemble(user_id, N)

    if model_name not in self.models:
        return f"Model '{model_name}' not found. Available: {list(self.models.keys())}"

    model = self.models[model_name]

    # Get all movies
    all_movies = self.movies['movieId'].unique()

    # Movies this user already rated — kept as a set because `in` on a numpy
    # array is a linear scan, making the filter below O(movies * ratings).
    rated_movies = set(self.ratings[self.ratings['userId'] == user_id]['movieId'].values)

    # Candidate pool: everything the user has not rated yet.
    unrated_movies = [m for m in all_movies if m not in rated_movies]

    # Predict a rating for every candidate and rank best-first.
    predictions = [(movie_id, model.predict(user_id, movie_id).est)
                   for movie_id in unrated_movies]
    predictions.sort(key=lambda x: x[1], reverse=True)

    # Get top N
    top_n = predictions[:N]

    # Attach title/genres metadata; movies missing from the catalog are skipped.
    results = []
    for i, (movie_id, score) in enumerate(top_n, 1):
        movie_info = self.movies[self.movies['movieId'] == movie_id]
        if len(movie_info) > 0:
            title = movie_info['title'].iloc[0]
            genres = movie_info['genres'].iloc[0] if 'genres' in movie_info else 'N/A'
            results.append({
                'rank': i,
                'movieId': int(movie_id),
                'title': title,
                'genres': genres,
                'predicted_rating': round(score, 2)
            })

    return results
+
270
def recommend_ensemble(self, user_id, N):
    """Recommend top N movies using a weighted average over all models.

    Args:
        user_id: User ID
        N: Number of recommendations

    Returns:
        List of dicts with rank/movieId/title/genres/predicted_rating.
    """
    # Candidate pool: every catalog movie the user has not rated.
    all_movies = self.movies['movieId'].unique()
    # Set membership is O(1); `in` against a numpy array is a linear scan.
    rated_movies = set(self.ratings[self.ratings['userId'] == user_id]['movieId'].values)
    unrated_movies = [m for m in all_movies if m not in rated_movies]

    # Model weights (based on typical performance)
    weights = {
        'user_based_cf': 0.20,
        'item_based_cf': 0.20,
        'svd': 0.25,
        'svdpp': 0.25,
        'nmf': 0.10
    }

    # Blend each model's predicted rating for every candidate movie.
    movie_scores = defaultdict(float)
    for movie_id in unrated_movies:
        weighted_sum = 0
        for model_name, model in self.models.items():
            weighted_sum += model.predict(user_id, movie_id).est * weights[model_name]
        movie_scores[movie_id] = weighted_sum

    # Sort and get top N
    sorted_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)[:N]

    # Attach metadata; movies missing from the catalog are skipped.
    results = []
    for i, (movie_id, score) in enumerate(sorted_movies, 1):
        movie_info = self.movies[self.movies['movieId'] == movie_id]
        if len(movie_info) > 0:
            title = movie_info['title'].iloc[0]
            genres = movie_info['genres'].iloc[0] if 'genres' in movie_info else 'N/A'
            results.append({
                'rank': i,
                'movieId': int(movie_id),
                'title': title,
                'genres': genres,
                'predicted_rating': round(score, 2)
            })

    return results
+
317
# Initialize recommender system
# NOTE: runs at import time — it reads ratings.csv / movies.csv from the
# working directory, so startup fails if those files are missing.
print("Initializing MovieLens Recommendation System...")
recommender = MovieRecommenderEnsemble('ratings.csv', 'movies.csv')

# Evaluate all models once at startup.  Based on how show_evaluation() uses
# these below: evaluation_results maps model name -> {metric: value} and
# best_model_name is the winning model's key.  Presumably evaluate_models()
# also fits the underlying models — TODO confirm against the class definition.
evaluation_results, best_model_name = recommender.evaluate_models()
def recommend_interface(user_id, n_recommendations, model_choice):
    """Gradio callback: return top-N recommendations for a user as plain text.

    Any failure is reported as an "Error: ..." string rather than raised,
    so the UI always receives something displayable.
    """
    # UI display label -> internal model key; unknown labels fall back to SVD.
    display_to_key = {
        'User-Based CF': 'user_based_cf',
        'Item-Based CF': 'item_based_cf',
        'SVD': 'svd',
        'SVD++': 'svdpp',
        'NMF': 'nmf',
        'Ensemble (All Models)': 'ensemble'
    }

    try:
        uid = int(user_id)
        count = int(n_recommendations)

        recs = recommender.recommend_movies(uid, count, display_to_key.get(model_choice, 'svd'))

        # recommend_movies signals "unknown model" by returning a message string.
        if isinstance(recs, str):
            return recs

        # Assemble the report via join instead of repeated concatenation.
        pieces = [f"Top {count} recommendations for User {uid} using {model_choice}:\n\n"]
        for rec in recs:
            pieces.append(f"{rec['rank']}. {rec['title']}\n")
            pieces.append(f" Genres: {rec['genres']}\n")
            pieces.append(f" Predicted Rating: {rec['predicted_rating']}/5.0\n\n")
        return "".join(pieces)

    except ValueError:
        return "Error: Please enter a valid user ID"
    except Exception as e:
        return f"Error: {str(e)}"
360
 
361
def show_evaluation():
    """Build the plain-text model-comparison report for the Evaluation tab."""
    # Collect the report line by line, then join once at the end.
    lines = ["MODEL EVALUATION RESULTS\n", "="*60 + "\n\n"]

    for model_name, metrics in evaluation_results.items():
        lines.append(f"{model_name.upper().replace('_', ' ')}\n")
        lines.append("-"*40 + "\n")
        for metric, value in metrics.items():
            lines.append(f" {metric}: {value:.4f}\n")
        lines.append("\n")

    lines.append("="*60 + "\n")
    lines.append(f"BEST MODEL: {best_model_name.upper().replace('_', ' ')}\n")
    lines.append("="*60)

    return "".join(lines)
378
 
379
# Create Gradio interface
# Three tabs: an interactive recommendation form, a static evaluation report
# (computed once at startup), and a descriptive About page.
with gr.Blocks(title="MovieLens Recommendation System") as demo:
    gr.Markdown("# 🎬 MovieLens Recommendation System")
    gr.Markdown("### Trained on MovieLens 1M Dataset (6,040 users, 3,706 movies)")

    with gr.Tab("Get Recommendations"):
        with gr.Row():
            with gr.Column():
                # Free-text user id; validated and int-converted inside
                # recommend_interface, which returns an error string if invalid.
                user_input = gr.Textbox(
                    label="User ID",
                    placeholder="Enter user ID (1-6040)",
                    value="1"
                )
                n_input = gr.Slider(
                    minimum=1,
                    maximum=20,
                    value=10,
                    step=1,
                    label="Number of Recommendations"
                )
                # Display labels; the callback maps them to internal model keys.
                model_input = gr.Dropdown(
                    choices=[
                        'User-Based CF',
                        'Item-Based CF',
                        'SVD',
                        'SVD++',
                        'NMF',
                        'Ensemble (All Models)'
                    ],
                    value='SVD',
                    label="Select Model"
                )
                recommend_btn = gr.Button("Get Recommendations", variant="primary")

            with gr.Column():
                output = gr.Textbox(
                    label="Recommendations",
                    lines=20,
                    max_lines=30
                )

        # Wire the button to the callback defined above.
        recommend_btn.click(
            fn=recommend_interface,
            inputs=[user_input, n_input, model_input],
            outputs=output
        )

    with gr.Tab("Model Evaluation"):
        gr.Markdown("## Performance Comparison of All Models")
        # Report text is computed once here (value=...), not refreshed live.
        eval_output = gr.Textbox(
            label="Evaluation Metrics",
            lines=25,
            value=show_evaluation()
        )

    with gr.Tab("About"):
        gr.Markdown("""
        ## About This System

        This recommendation system implements multiple collaborative filtering approaches:

        ### Models Implemented:

        1. **User-Based Collaborative Filtering**
        - Finds similar users based on rating patterns
        - k=50 neighbors, cosine similarity

        2. **Item-Based Collaborative Filtering**
        - Recommends items similar to those you liked
        - k=40 neighbors, cosine similarity

        3. **SVD (Singular Value Decomposition)**
        - Matrix factorization with 150 latent factors
        - 30 epochs, optimized for MovieLens 1M

        4. **SVD++ (Enhanced SVD)**
        - Includes implicit feedback signals
        - 100 factors, 20 epochs

        5. **NMF (Non-negative Matrix Factorization)**
        - Alternative factorization method
        - 50 factors, 50 epochs

        6. **Ensemble**
        - Weighted combination of all models
        - Leverages strengths of each approach

        ### Evaluation Metrics:
        - **RMSE/MAE**: Prediction accuracy
        - **Precision@10**: Relevance of top 10 recommendations
        - **Recall@10**: Coverage of relevant items
        - **NDCG@10**: Ranking quality

        ### Dataset:
        MovieLens 1M - 1 million ratings from 6,040 users on 3,706 movies
        """)

# Launch the web app (blocking call).
demo.launch()