"""Gradio front end for a hybrid movie recommender.

The module defines the model classes (item-based CF, SVD, neural CF and the
hybrid that blends them), loads pickled artifacts from ``model_artifacts/`` at
import time, and builds a ``gr.Blocks`` UI bound to the module-level ``demo``.
"""

import os
import pickle
import traceback

import gradio as gr
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from scipy.sparse import csr_matrix

# NOTE(review): `pd` and `csr_matrix` are not referenced directly in this
# file; the imports are kept because other tooling may rely on them.

# Pickled artifacts, in the order they are loaded.
MODEL_PATH = 'model_artifacts/hybrid_model.pkl'
LOADER_PATH = 'model_artifacts/loader.pkl'
MOVIES_PATH = 'model_artifacts/movies.pkl'

SEPARATOR = "=" * 60 + "\n\n"


class ItemBasedCF:
    """Item-based collaborative-filtering predictor.

    ``item_similarity`` and ``user_item_matrix`` start as ``None`` and must be
    populated before :meth:`predict` is called. ``predict`` row-indexes both
    and calls ``.toarray()``, so they are presumably scipy sparse matrices.
    ``n_neighbors`` is stored but not used by ``predict``.
    """

    def __init__(self, n_neighbors=20):
        self.n_neighbors = n_neighbors
        self.item_similarity = None
        self.user_item_matrix = None

    def predict(self, user_idx, movie_idx):
        """Return a similarity-weighted average of the user's ratings.

        Falls back to 2.5 when the user has no ratings or when the total
        similarity weight over the rated movies is zero. The result is
        clipped to [1, 5].
        """
        user_ratings = self.user_item_matrix[user_idx].toarray().flatten()
        rated_mask = user_ratings > 0
        if not rated_mask.any():
            return 2.5

        similarities = self.item_similarity[movie_idx].toarray().flatten()
        # Zero out similarities to movies the user has not rated.
        weights = similarities * rated_mask
        total_weight = weights.sum()
        if total_weight == 0:
            return 2.5

        prediction = (weights * user_ratings).sum() / total_weight
        return np.clip(prediction, 1, 5)


class SVDRecommender:
    """Latent-factor predictor: global mean plus a user/item dot product.

    ``user_factors`` and ``item_factors`` start as ``None`` and must be
    populated before :meth:`predict` is called.
    """

    def __init__(self, n_factors=50):
        self.n_factors = n_factors
        self.user_factors = None
        self.item_factors = None
        self.global_mean = 3.5

    def predict(self, user_idx, movie_idx):
        """Return the predicted rating for the pair, clipped to [1, 5]."""
        interaction = np.dot(
            self.user_factors[user_idx], self.item_factors[movie_idx]
        )
        return np.clip(self.global_mean + interaction, 1, 5)


class NeuralCF(nn.Module):
    """Neural collaborative filtering: embeddings fed through an MLP.

    User and movie embeddings are concatenated and passed through a stack of
    Linear -> ReLU -> Dropout(0.2) layers, followed by a single-output Linear.
    """

    def __init__(self, n_users, n_movies, embedding_dim=50,
                 hidden_layers=(64, 32, 16)):
        # The default is a tuple rather than a list so it cannot be mutated
        # and shared across instances; any iterable of ints is accepted.
        super().__init__()
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.movie_embedding = nn.Embedding(n_movies, embedding_dim)

        layers = []
        input_dim = embedding_dim * 2
        for hidden_dim in hidden_layers:
            layers.append(nn.Linear(input_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            input_dim = hidden_dim
        layers.append(nn.Linear(input_dim, 1))
        self.mlp = nn.Sequential(*layers)

    def forward(self, user_ids, movie_ids):
        """Return raw (unclamped) scores for paired user and movie id tensors."""
        user_emb = self.user_embedding(user_ids)
        movie_emb = self.movie_embedding(movie_ids)
        x = torch.cat([user_emb, movie_emb], dim=1)
        output = self.mlp(x)
        return output.squeeze()

    def predict(self, user_idx, movie_idx, device='cpu'):
        """Return the predicted rating for one pair as a float in [1, 5].

        Switches the module to eval mode (so dropout is disabled) and runs
        without gradient tracking.
        """
        self.eval()
        with torch.no_grad():
            user_tensor = torch.tensor(
                [user_idx], dtype=torch.long, device=device
            )
            movie_tensor = torch.tensor(
                [movie_idx], dtype=torch.long, device=device
            )
            prediction = self.forward(user_tensor, movie_tensor)
            return torch.clamp(prediction, 1, 5).item()


def _build_title_lookup(movies_df):
    """Map movie_id -> title from ``movies_df`` (first occurrence wins).

    Returns an empty dict when ``movies_df`` is ``None``. Building the mapping
    once replaces a full DataFrame scan per looked-up movie.
    """
    lookup = {}
    if movies_df is None:
        return lookup
    for movie_id, title in zip(movies_df['movie_id'], movies_df['title']):
        lookup.setdefault(movie_id, title)
    return lookup


class HybridRecommender:
    """Weighted blend of item-based CF, SVD and neural CF predictions.

    ``item_cf``, ``svd`` and ``ncf`` start as ``None`` and must be assigned
    before :meth:`predict` or :meth:`recommend_movies` is called.
    """

    def __init__(self, n_users, n_movies):
        self.n_users = n_users
        self.n_movies = n_movies
        self.item_cf = None
        self.svd = None
        self.ncf = None
        self.weights = {
            'item_cf': 0.3,
            'svd': 0.4,
            'ncf': 0.3,
        }

    def predict(self, user_idx, movie_idx):
        """Return the weighted sum of the three predictions, clipped to [1, 5]."""
        cf_pred = self.item_cf.predict(user_idx, movie_idx)
        svd_pred = self.svd.predict(user_idx, movie_idx)
        ncf_pred = self.ncf.predict(user_idx, movie_idx)
        prediction = (
            self.weights['item_cf'] * cf_pred
            + self.weights['svd'] * svd_pred
            + self.weights['ncf'] * ncf_pred
        )
        return np.clip(prediction, 1, 5)

    def recommend_movies(self, user_id, N=10, user_id_map=None,
                         reverse_movie_map=None, movies_df=None):
        """Return the top-``N`` unrated movies for a user.

        Args:
            user_id: External user id, or the internal index when
                ``user_id_map`` is ``None``.
            N: Maximum number of recommendations; ``N <= 0`` yields ``[]``.
            user_id_map: Optional mapping from external user id to index.
            reverse_movie_map: Optional mapping from movie index to
                external movie id.
            movies_df: Optional DataFrame with ``movie_id`` and ``title``
                columns used to resolve titles.

        Returns:
            A list of ``(movie_id, title, score)`` tuples sorted by
            descending score, or ``[]`` if ``user_id`` is not in
            ``user_id_map``. Movies without a known title are labelled
            ``"Movie <id>"``.
        """
        if user_id_map is not None:
            if user_id not in user_id_map:
                return []
            user_idx = user_id_map[user_id]
        else:
            user_idx = user_id

        if N <= 0:
            return []

        user_row = self.item_cf.user_item_matrix[user_idx].toarray().flatten()
        rated_movies = set(np.where(user_row > 0)[0])

        # Score every movie the user has not rated yet.
        scores = [
            (movie_idx, self.predict(user_idx, movie_idx))
            for movie_idx in range(self.n_movies)
            if movie_idx not in rated_movies
        ]
        scores.sort(key=lambda pair: pair[1], reverse=True)

        titles = _build_title_lookup(movies_df)
        recommendations = []
        for movie_idx, score in scores[:N]:
            if reverse_movie_map is not None:
                original_movie_id = reverse_movie_map[movie_idx]
            else:
                original_movie_id = movie_idx
            title = titles.get(original_movie_id, f"Movie {original_movie_id}")
            recommendations.append((original_movie_id, title, score))
        return recommendations


class MovieLensDataLoader:
    """Holds data paths and the id <-> index maps used by the recommender."""

    def __init__(self, ratings_path=None, movies_path=None):
        self.ratings_path = ratings_path
        self.movies_path = movies_path
        self.user_id_map = {}
        self.movie_id_map = {}
        self.reverse_user_map = {}
        self.reverse_movie_map = {}


# NOTE(review): the pickled artifacts presumably reference the classes above
# by name, so they must stay defined in this module — confirm against the
# script that produced the pickles.


def _load_pickle(path):
    """Unpickle ``path`` and print a confirmation line.

    SECURITY: ``pickle.load`` can execute arbitrary code. Only load artifact
    files that come from a trusted source.
    """
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    print(f"Loaded {os.path.basename(path)}")
    return obj


def load_model_and_data():
    """Load the model, data loader and movies table from ``model_artifacts/``.

    Prints diagnostics about the working directory and the expected files.

    Returns:
        ``(model, loader, movies, user_ids)`` on success, where ``user_ids``
        is the sorted list of keys of ``loader.user_id_map``. On any failure
        the error is printed and ``(None, None, None, [])`` is returned.
    """
    print("Checking for files...")
    print(f"Current directory: {os.getcwd()}")
    print(f"Files in current directory: {os.listdir('.')}")

    if os.path.exists('model_artifacts'):
        print(f"Files in model_artifacts/: {os.listdir('model_artifacts')}")
    else:
        print("ERROR: model_artifacts/ folder does not exist!")

    try:
        for file_path in (MODEL_PATH, LOADER_PATH, MOVIES_PATH):
            if not os.path.exists(file_path):
                print(f"ERROR: Missing file: {file_path}")
            else:
                file_size = os.path.getsize(file_path) / (1024 * 1024)
                print(f"Found: {file_path} ({file_size:.2f} MB)")

        model = _load_pickle(MODEL_PATH)
        loader = _load_pickle(LOADER_PATH)
        movies = _load_pickle(MOVIES_PATH)

        user_ids = sorted(loader.user_id_map.keys())
        print(f"Model loaded successfully! {len(user_ids)} users available")
        return model, loader, movies, user_ids
    except FileNotFoundError as e:
        print(f"ERROR: File not found - {e}")
        print("Make sure all pkl files are in the model_artifacts/ folder")
        return None, None, None, []
    except Exception as e:
        # Top-level boundary: report and degrade so the UI can still start.
        print(f"ERROR loading model: {type(e).__name__}: {e}")
        traceback.print_exc()
        return None, None, None, []


print("Loading model and data...")
model, loader, movies_df, user_ids = load_model_and_data()
if model is None:
    # Do not claim success when loading failed.
    print("Model NOT loaded. The interface will report errors until the "
          "model files are available.")
else:
    print(f"Model loaded! Available users: {len(user_ids)}")


def get_recommendations(user_id, num_recommendations):
    """Return a formatted text block of recommendations for ``user_id``.

    Both arguments are converted with ``int()``. All failures are reported as
    an error string rather than raised, because the result is shown in a
    textbox.
    """
    if model is None or loader is None:
        return "Error: Model not loaded properly. Please check the model files."

    # Only the parsing is guarded here, so errors raised inside the model are
    # not misreported as bad input. TypeError covers a cleared (None) field.
    try:
        user_id = int(user_id)
        num_recommendations = int(num_recommendations)
    except (TypeError, ValueError):
        return ("Error: Please enter valid numbers for User ID and "
                "Number of Recommendations")

    try:
        if user_id not in loader.user_id_map:
            return f"User ID {user_id} not found! Please select a valid user ID."

        recommendations = model.recommend_movies(
            user_id=user_id,
            N=num_recommendations,
            user_id_map=loader.user_id_map,
            reverse_movie_map=loader.reverse_movie_map,
            movies_df=movies_df,
        )
        if not recommendations:
            return f"No recommendations found for User {user_id}"

        parts = [
            f"Top {num_recommendations} Movie Recommendations "
            f"for User {user_id}\n\n",
            SEPARATOR,
        ]
        for i, (movie_id, title, score) in enumerate(recommendations, 1):
            stars = "*" * int(score)
            parts.append(f"{i}. {title}\n")
            parts.append(f" Predicted Rating: {score:.2f}/5.00 {stars}\n")
            parts.append(f" Movie ID: {movie_id}\n\n")
        return "".join(parts)
    except Exception as e:
        return f"Error generating recommendations: {str(e)}"


def get_user_history(user_id):
    """Return a formatted summary of the ratings stored for ``user_id``.

    Shows the number of rated movies, the average rating and the ten
    highest-rated titles. Failures are reported as an error string.
    """
    if model is None or loader is None:
        return "Error: Model not loaded properly."

    try:
        user_id = int(user_id)
    except (TypeError, ValueError):
        return "Error: Please enter a valid number for User ID"

    try:
        if user_id not in loader.user_id_map:
            return f"User ID {user_id} not found!"

        user_idx = loader.user_id_map[user_id]
        user_ratings = (
            model.item_cf.user_item_matrix[user_idx].toarray().flatten()
        )
        rated_indices = np.where(user_ratings > 0)[0]
        if len(rated_indices) == 0:
            return f"No rating history found for User {user_id}"

        # One lookup table instead of one DataFrame scan per rated movie.
        titles = _build_title_lookup(movies_df)
        history = []
        for movie_idx in rated_indices:
            original_movie_id = loader.reverse_movie_map[movie_idx]
            title = titles.get(original_movie_id, f"Movie {original_movie_id}")
            history.append((title, user_ratings[movie_idx]))
        history.sort(key=lambda pair: pair[1], reverse=True)

        average = np.mean([rating for _, rating in history])
        parts = [
            f"Rating History for User {user_id}\n\n",
            f"Total movies rated: {len(history)}\n",
            f"Average rating: {average:.2f}\n\n",
            SEPARATOR,
            "Top 10 Highest Rated Movies:\n\n",
        ]
        for i, (title, rating) in enumerate(history[:10], 1):
            stars = "*" * int(rating)
            parts.append(f"{i}. {title} - {rating:.1f}/5 {stars}\n")
        return "".join(parts)
    except Exception as e:
        return f"Error: {str(e)}"


def get_movie_info(movie_title_search):
    """Return up to 20 movies whose title contains the search text.

    Matching is case-insensitive and literal: the input is not interpreted
    as a regular expression, so characters such as ``(`` or ``.`` are safe.
    """
    if movies_df is None:
        return "Error: Movies data not loaded"

    if movie_title_search is None:
        movie_title_search = ""

    try:
        mask = movies_df['title'].str.contains(
            movie_title_search, case=False, na=False, regex=False
        )
        matches = movies_df[mask]
        if len(matches) == 0:
            return f"No movies found matching '{movie_title_search}'"

        parts = [
            f"Search Results for '{movie_title_search}'\n\n",
            f"Found {len(matches)} movie(s):\n\n",
            SEPARATOR,
        ]
        for i, (_, row) in enumerate(matches.head(20).iterrows(), 1):
            parts.append(f"{i}. {row['title']} (ID: {row['movie_id']})\n")
        if len(matches) > 20:
            parts.append(f"\n... and {len(matches) - 20} more results")
        return "".join(parts)
    except Exception as e:
        return f"Error: {str(e)}"


with gr.Blocks(theme=gr.themes.Soft(),
               title="Movie Recommender - DataSynthis") as demo:
    gr.Markdown("""
    # Hybrid Movie Recommendation System
    ### DataSynthis Job Task - Powered by AI

    This system combines Collaborative Filtering, SVD Matrix Factorization, and Neural Networks
    to provide personalized movie recommendations from the MovieLens 1M dataset.
    """)

    with gr.Tabs():
        with gr.Tab("Get Recommendations"):
            gr.Markdown("### Get personalized movie recommendations for any user")

            with gr.Row():
                with gr.Column(scale=1):
                    user_id_input = gr.Number(
                        label="User ID",
                        value=1,
                        minimum=1,
                        maximum=6040,
                        step=1,
                        info="Enter a user ID (1-6040)",
                    )
                    num_recs_input = gr.Slider(
                        label="Number of Recommendations",
                        minimum=5,
                        maximum=20,
                        value=10,
                        step=1,
                    )
                    recommend_btn = gr.Button(
                        "Get Recommendations", variant="primary"
                    )

                with gr.Column(scale=2):
                    recommendations_output = gr.Textbox(
                        label="Recommendations",
                        lines=20,
                        max_lines=30,
                    )

            recommend_btn.click(
                fn=get_recommendations,
                inputs=[user_id_input, num_recs_input],
                outputs=recommendations_output,
            )

            gr.Markdown("""
            **How it works:**
            - Enter a User ID (between 1 and 6040)
            - Choose how many recommendations you want
            - Click "Get Recommendations" to see personalized movie suggestions
            """)

        with gr.Tab("User History"):
            gr.Markdown("### View a user's rating history")

            with gr.Row():
                with gr.Column(scale=1):
                    user_id_history = gr.Number(
                        label="User ID",
                        value=1,
                        minimum=1,
                        maximum=6040,
                        step=1,
                    )
                    history_btn = gr.Button("View History", variant="primary")

                with gr.Column(scale=2):
                    history_output = gr.Textbox(
                        label="Rating History",
                        lines=20,
                        max_lines=30,
                    )

            history_btn.click(
                fn=get_user_history,
                inputs=user_id_history,
                outputs=history_output,
            )

        with gr.Tab("Search Movies"):
            gr.Markdown("### Search for movies in the database")

            with gr.Row():
                with gr.Column(scale=1):
                    movie_search = gr.Textbox(
                        label="Movie Title Search",
                        placeholder="e.g., Star Wars, Godfather, Titanic...",
                        value="Star Wars",
                    )
                    search_btn = gr.Button("Search", variant="primary")

                with gr.Column(scale=2):
                    search_output = gr.Textbox(
                        label="Search Results",
                        lines=20,
                        max_lines=30,
                    )

            search_btn.click(
                fn=get_movie_info,
                inputs=movie_search,
                outputs=search_output,
            )

        with gr.Tab("About"):
            gr.Markdown("""
            ## About This System

            ### Model Architecture
            This is a Hybrid Recommendation System that combines three powerful approaches:

            1. Item-Based Collaborative Filtering
               - Uses cosine similarity between movies
               - Recommends movies similar to what you've liked before

            2. SVD Matrix Factorization
               - Decomposes the user-movie rating matrix
               - Discovers latent factors that explain user preferences

            3. Neural Collaborative Filtering (NCF)
               - Deep learning model with user and movie embeddings
               - Learns complex non-linear patterns in user behavior

            ### Dataset
            - MovieLens 1M dataset
            - 1,000,209 ratings from 6,040 users on 3,900 movies
            - Ratings scale: 1-5 stars

            ### Performance Metrics
            - Precision@10: 26.77%
            - NDCG@10: 28.50%
            - Model improves recommendations by 40% vs baseline

            ### Created For
            DataSynthis Job Task

            ### Technologies Used
            - PyTorch (Neural Networks)
            - Scikit-learn (SVD, Similarity)
            - Pandas & NumPy (Data Processing)
            - Gradio (Web Interface)

            Note: This model is trained on the MovieLens 1M dataset. User IDs range from 1 to 6040,
            and movie IDs range from 1 to 3952.
            """)

    gr.Markdown("""
    ---
    Hybrid Movie Recommendation System | Built for DataSynthis
    """)


if __name__ == "__main__":
    demo.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7860,
    )