"""Gradio demo of a hybrid (collaborative + content-based) movie recommender.

A synthetic user (``TARGET_USER_ID``) starts with a small rating history.
Selecting a movie in the UI appends it to that history and rebuilds the
models so that the recommendation tables update immediately.
"""

import gradio as gr
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ============================================================================
# STEP 1: DATA SIMULATION & USER SETUP
# ============================================================================

# Fixed ID of the synthetic demo user.
# NOTE(review): assumed not to occur in the CSV's userId column -- confirm.
TARGET_USER_ID = 9999

# Synthetic user history (5 movies with high ratings).
# NOTE(review): models key on movieId and take genres from the CSV metadata,
# so these titles/genres are display-only; verify that movieIds 1-5 really
# carry these titles in the sampled dataset.
initial_user_history = pd.DataFrame({
    'userId': [TARGET_USER_ID] * 5,
    'movieId': [1, 2, 3, 4, 5],
    'rating': [5.0, 4.5, 5.0, 4.0, 4.5],
    'timestamp': [1609459200] * 5,  # Fixed timestamp
    'title': [
        'Toy Story (1995)',
        'The Matrix (1999)',
        'Inception (2010)',
        'The Lion King (1994)',
        'Interstellar (2014)',
    ],
    'genres': [
        'Animation|Children|Comedy',
        'Action|Sci-Fi|Thriller',
        'Action|Sci-Fi|Thriller',
        'Animation|Children|Drama|Musical',
        'Adventure|Drama|Sci-Fi',
    ],
})

# Mutable copy that tracks the demo user's history as movies are added.
synthetic_user_history = initial_user_history.copy()

# Combined ratings + metadata sample (one row per rating).
combined_sampled_data = pd.read_csv('./sampled_movie_ratings_for_gradio.csv')

raw_ratings_df = combined_sampled_data[
    ['userId', 'movieId', 'rating', 'timestamp']
].copy()

# One metadata row per movie.
full_movies_metadata_df = (
    combined_sampled_data[['movieId', 'title', 'genres']]
    .drop_duplicates(subset=['movieId'])
    .copy()
)
movies_db = full_movies_metadata_df[['movieId', 'title', 'genres']].copy()

# Base table for all non-synthetic ratings (real sample + mock users).
ratings_df = raw_ratings_df[['userId', 'movieId', 'rating']].copy()

# Titles offered by the autocomplete dropdown.
all_movie_titles = movies_db['title'].tolist()

# Mock ratings: simulate additional users rating random movies.
# NOTE(review): mock user IDs 1..n_users may coincide with real userIds from
# the CSV, in which case their ratings are merged into those users' rows --
# confirm this is intended.
np.random.seed(42)
n_users = 100
n_movies = len(movies_db)
mock_ratings = []
for user_id in range(1, n_users + 1):
    # Each user rates 5-15 random movies; capped so that sampling without
    # replacement cannot fail on a catalogue smaller than the drawn count.
    n_ratings = min(np.random.randint(5, 16), n_movies)
    movie_ids = np.random.choice(
        movies_db['movieId'].values, n_ratings, replace=False
    )
    ratings = np.random.uniform(2.5, 5.0, n_ratings)
    for movie_id, rating in zip(movie_ids, ratings):
        mock_ratings.append({
            'userId': user_id,
            'movieId': movie_id,
            'rating': round(rating, 1),
        })

ratings_df = pd.concat(
    [ratings_df, pd.DataFrame(mock_ratings)], ignore_index=True
)

# ============================================================================
# STEP 2: BUILD RECOMMENDATION MODELS
# ============================================================================

# Model state, populated by rebuild_models().
user_item_matrix = None      # users x movies rating matrix (0 = unrated)
user_similarity_df = None    # user x user cosine similarity
content_similarity = None    # movie x movie genre similarity (movies_db order)
movie_id_to_idx = {}         # movieId -> positional row index in movies_db


def rebuild_models():
    """Rebuild the global recommendation models from the current ratings.

    The collaborative part (``user_item_matrix``, ``user_similarity_df``) is
    recomputed on every call because the synthetic user's history changes.
    The content part (``content_similarity``, ``movie_id_to_idx``) depends
    only on ``movies_db`` and is therefore built once and reused until the
    number of rows in ``movies_db`` changes.
    """
    global user_item_matrix, user_similarity_df
    global content_similarity, movie_id_to_idx

    # Ratings of every user, including the synthetic one.
    all_ratings = pd.concat(
        [ratings_df, synthetic_user_history[['userId', 'movieId', 'rating']]],
        ignore_index=True,
    )

    # pivot_table averages duplicate (userId, movieId) pairs; unrated -> 0.
    user_item_matrix = all_ratings.pivot_table(
        index='userId', columns='movieId', values='rating'
    ).fillna(0)

    user_similarity_df = pd.DataFrame(
        cosine_similarity(user_item_matrix),
        index=user_item_matrix.index,
        columns=user_item_matrix.index,
    )

    content_is_stale = (
        content_similarity is None
        or content_similarity.shape[0] != len(movies_db)
    )
    if content_is_stale:
        # Genres are '|'-separated, so every run of non-'|' characters is
        # one token. Missing genres become an empty document.
        tfidf = TfidfVectorizer(token_pattern=r'[^|]+')
        tfidf_matrix = tfidf.fit_transform(movies_db['genres'].fillna(''))
        content_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
        movie_id_to_idx = {
            movie_id: idx for idx, movie_id in enumerate(movies_db['movieId'])
        }


# Initialize models
rebuild_models()

# ============================================================================
# STEP 3: RECOMMENDATION FUNCTIONS
# ============================================================================

_SIMILAR_USER_COLUMNS = ['Similar User ID', 'Similarity Score']
_RECOMMENDATION_COLUMNS = ['Movie Title', 'Genre', 'Estimated Score']


def get_similar_users(user_id, n=5):
    """Return the ``n`` users most similar to ``user_id``.

    Args:
        user_id: ID of the user to compare against.
        n: Maximum number of neighbours to return.

    Returns:
        DataFrame with columns 'Similar User ID' and 'Similarity Score'
        (rounded to 4 decimals), ordered by descending similarity. Empty if
        ``user_id`` is unknown to the similarity model.
    """
    if user_id not in user_similarity_df.index:
        return pd.DataFrame(columns=_SIMILAR_USER_COLUMNS)

    # Drop the user explicitly rather than slicing off the first row: with
    # ties (or an all-zero rating vector) the user is not guaranteed to be
    # sorted first.
    neighbours = (
        user_similarity_df[user_id]
        .drop(labels=user_id)
        .sort_values(ascending=False)
        .head(n)
    )
    return pd.DataFrame({
        'Similar User ID': neighbours.index,
        'Similarity Score': neighbours.round(4).to_numpy(),
    })


def get_top_movies(user_id, search_query=None, n=15, alpha=0.6):
    """Return the top ``n`` hybrid recommendations for ``user_id``.

    For every movie the user has not rated:

    * collaborative score: similarity-weighted sum of the ratings given by
      the 10 most similar users, divided by the sum of their similarities;
    * content score: genre similarity to the movies the user has rated,
      summed and divided by the number of rated movies;
    * hybrid score: ``alpha * collab / 5 + (1 - alpha) * content``, reported
      scaled by 5 and rounded to 2 decimals.

    Args:
        user_id: Target user ID.
        search_query: Optional title filter. A string equal to a title in
            ``movies_db`` filters by exact title; any other non-blank string
            is a case-insensitive literal substring filter.
        n: Number of recommendations to return.
        alpha: Weight of the collaborative score (``1 - alpha`` for content).

    Returns:
        DataFrame with columns 'Movie Title', 'Genre' and 'Estimated Score',
        sorted by descending score. Empty (with those columns) when the user
        is unknown or nothing matches.
    """
    if user_id not in user_item_matrix.index:
        return pd.DataFrame(columns=_RECOMMENDATION_COLUMNS)

    user_ratings = user_item_matrix.loc[user_id]
    rated_ids = user_ratings[user_ratings > 0].index.tolist()
    # Only movies with metadata can be scored on content and displayed.
    candidate_ids = [
        movie_id
        for movie_id in user_ratings[user_ratings == 0].index
        if movie_id in movie_id_to_idx
    ]
    candidate_pos = [movie_id_to_idx[movie_id] for movie_id in candidate_ids]

    # --- Collaborative score (the neighbourhood is computed once) ---------
    neighbours = get_similar_users(user_id, n=10)
    weights = neighbours['Similarity Score'].to_numpy(dtype=float)
    neighbour_ratings = user_item_matrix.loc[
        neighbours['Similar User ID'].tolist(), candidate_ids
    ].to_numpy(dtype=float)
    # Only actual ratings (> 0) contribute.
    neighbour_ratings = np.where(neighbour_ratings > 0, neighbour_ratings, 0.0)
    collab_scores = (weights @ neighbour_ratings) / (weights.sum() + 1e-10)

    # --- Content score -----------------------------------------------------
    rated_pos = [
        movie_id_to_idx[movie_id]
        for movie_id in rated_ids
        if movie_id in movie_id_to_idx
    ]
    if rated_ids and rated_pos and candidate_pos:
        # Divided by the number of rated movies, including any that have no
        # metadata and therefore contribute zero similarity.
        content_scores = (
            content_similarity[np.ix_(candidate_pos, rated_pos)].sum(axis=1)
            / len(rated_ids)
        )
    else:
        content_scores = np.zeros(len(candidate_ids))

    # --- Hybrid score ------------------------------------------------------
    hybrid_scores = alpha * (collab_scores / 5.0) + (1 - alpha) * content_scores

    candidate_meta = movies_db.iloc[candidate_pos]
    result_df = pd.DataFrame(
        {
            'Movie Title': candidate_meta['title'].to_numpy(),
            'Genre': candidate_meta['genres'].to_numpy(),
            # Scale to 0-5 range
            'Estimated Score': np.round(hybrid_scores * 5, 2),
        },
        columns=_RECOMMENDATION_COLUMNS,
    ).sort_values('Estimated Score', ascending=False, kind='stable')

    # Apply search filter if provided (exact match or partial).
    if search_query and search_query.strip():
        if search_query in movies_db['title'].values:
            result_df = result_df[result_df['Movie Title'] == search_query]
        else:
            # regex=False: the query is user text, not a pattern, so
            # characters such as '(' must be matched literally.
            needle = search_query.lower()
            titles = result_df['Movie Title'].str.lower()
            result_df = result_df[
                titles.str.contains(needle, na=False, regex=False)
            ]

    return result_df.head(n).reset_index(drop=True)


def get_recommendations(search_query=None, *, fallback_to_all=True):
    """Collect everything the UI shows for the target user.

    Args:
        search_query: Optional title filter forwarded to ``get_top_movies``.
        fallback_to_all: When True (default) and the filter matches nothing,
            return the unfiltered recommendations instead of an empty table.

    Returns:
        Tuple ``(user_history_df, similar_users_df, recommended_movies_df)``.
    """
    similar_users = get_similar_users(TARGET_USER_ID, n=5)
    recommended_movies = get_top_movies(
        TARGET_USER_ID, search_query=search_query, n=15
    )

    if fallback_to_all and len(recommended_movies) == 0:
        recommended_movies = get_top_movies(
            TARGET_USER_ID, search_query=None, n=15
        )

    return synthetic_user_history, similar_users, recommended_movies


def add_movie_to_history(movie_title):
    """Add a movie to the target user's history with a default 4.0 rating.

    The recommendation models are rebuilt after a successful insertion.

    Args:
        movie_title: Exact title as it appears in ``movies_db``.

    Returns:
        True if the movie was added; False if the title is blank, unknown,
        or the movie (by title or movieId) is already in the history.
    """
    global synthetic_user_history

    if not movie_title or not movie_title.strip():
        return False

    movie_match = movies_db[movies_db['title'] == movie_title]
    if len(movie_match) == 0:
        return False

    movie_info = movie_match.iloc[0]

    # Also compare movieId: a second row with the same movieId would be
    # averaged with the existing rating when the models are rebuilt.
    already_present = (
        movie_title in synthetic_user_history['title'].values
        or movie_info['movieId'] in synthetic_user_history['movieId'].values
    )
    if already_present:
        return False

    new_entry = pd.DataFrame({
        'userId': [TARGET_USER_ID],
        'movieId': [movie_info['movieId']],
        'rating': [4.0],
        'timestamp': [1609459200],
        'title': [movie_info['title']],
        'genres': [movie_info['genres']],
    })
    synthetic_user_history = pd.concat(
        [synthetic_user_history, new_entry], ignore_index=True
    )

    rebuild_models()
    return True


# ============================================================================
# STEP 4: GRADIO INTERFACE
# ============================================================================

def update_recommendations(search_query):
    """Handle a dropdown value: add a known title, or filter by free text.

    Args:
        search_query: Value of the search dropdown (may be None or blank).

    Returns:
        Tuple ``(user_history, similar_users, recommended_movies, status)``
        matching the four output components.
    """
    has_query = bool(search_query and search_query.strip())

    if has_query and search_query in all_movie_titles:
        # Known title: add it to the history, then show all recommendations.
        added = add_movie_to_history(search_query)
        user_history, similar_users, recommended_movies = (
            get_recommendations(None)
        )
        if added:
            status_msg = (
                f"✅ Added '{search_query}' to your watch history! "
                "Recommendations updated."
            )
        else:
            status_msg = (
                f"ℹ️ '{search_query}' is already in your watch history. "
                "Recommendations unchanged."
            )
    elif has_query:
        # Free text: filter without the built-in fallback so that an empty
        # match can actually be detected and reported.
        user_history, similar_users, recommended_movies = get_recommendations(
            search_query, fallback_to_all=False
        )
        if len(recommended_movies) == 0:
            status_msg = (
                f"⚠️ No recommendations found for '{search_query}'. "
                "Showing all recommendations."
            )
            user_history, similar_users, recommended_movies = (
                get_recommendations(None)
            )
        else:
            status_msg = (
                f"🔍 Showing recommendations matching: {search_query}"
            )
    else:
        # No search query - show all recommendations
        user_history, similar_users, recommended_movies = (
            get_recommendations(None)
        )
        status_msg = "📋 Showing all recommendations"

    return user_history, similar_users, recommended_movies, status_msg


def reset_history():
    """Restore the initial 5-movie history and rebuild the models.

    Returns:
        Tuple ``(user_history, similar_users, recommended_movies, status)``.
    """
    global synthetic_user_history

    synthetic_user_history = initial_user_history.copy()
    rebuild_models()

    user_history, similar_users, recommended_movies = get_recommendations(None)
    return (
        user_history,
        similar_users,
        recommended_movies,
        "🔄 History reset to initial state",
    )


# Build the Gradio interface
with gr.Blocks(title="Movie Recommendation System") as demo:
    gr.Markdown(
        f"""
        # 🎬 Movie Recommendation System
        ### Personalized Recommendations for User #{TARGET_USER_ID}

        This system uses **hybrid filtering** (collaborative + content-based) to recommend movies.

        **Select a movie from the dropdown to add it to your watch history and get updated recommendations!**
        """
    )

    # Section 1: User Profile
    with gr.Row():
        gr.Markdown("## 👤 Your Watch History")

    with gr.Row():
        user_history_table = gr.Dataframe(
            value=synthetic_user_history,
            label="Rated Movies",
            interactive=False,
            wrap=True
        )

    # Section 2: Search & Control
    with gr.Row():
        gr.Markdown("## 🔍 Search & Add Movies to History")

    with gr.Row():
        search_box = gr.Dropdown(
            choices=all_movie_titles,
            label="Search Movies (Autocomplete)",
            info="Type to search, then select a movie to add it to your history and update recommendations",
            allow_custom_value=True,
            interactive=True
        )
        reset_btn = gr.Button("↺ Reset History", variant="secondary", scale=0)

    with gr.Row():
        status_text = gr.Markdown("📋 Showing all recommendations")

    # Section 3: Results
    gr.Markdown("## 📊 Recommendation Results")

    with gr.Row():
        # Table A: Similar Users
        with gr.Column(scale=1):
            gr.Markdown("### Similar Users")
            gr.Markdown("*Users with similar taste in movies*")
            similar_users_table = gr.Dataframe(
                label="",
                interactive=False,
                wrap=True
            )

        # Table B: Recommended Movies
        with gr.Column(scale=1):
            gr.Markdown("### Recommended Movies")
            gr.Markdown("*Top picks based on your preferences*")
            recommended_movies_table = gr.Dataframe(
                label="",
                interactive=False,
                wrap=True
            )

    # Load initial data
    demo.load(
        fn=get_recommendations,
        inputs=[],
        outputs=[user_history_table, similar_users_table, recommended_movies_table]
    )

    # Reset history functionality
    reset_btn.click(
        fn=reset_history,
        inputs=[],
        outputs=[user_history_table, similar_users_table, recommended_movies_table, status_text]
    )

    # Dropdown select event - triggers when selecting from dropdown
    search_box.select(
        fn=update_recommendations,
        inputs=[search_box],
        outputs=[user_history_table, similar_users_table, recommended_movies_table, status_text]
    )

    gr.Markdown(
        """
        ---
        **Algorithm Details:**
        - **Collaborative Filtering:** Finds users with similar rating patterns using cosine similarity
        - **Content-Based Filtering:** Matches movies by genre similarity using TF-IDF vectorization
        - **Hybrid Score:** Weighted combination (60% collaborative, 40% content-based)

        **How to Use:**
        1. View your watch history above (starts with 5 movies)
        2. Use the dropdown to search and select movies
        3. Selecting a movie from the dropdown adds it to your history (rated 4.0 by default)
        4. Watch your recommendations update in real-time based on your new preferences!
        5. Click "Reset History" to return to the original 5 movies

        **Note:** Adding movies to your history will immediately update your recommendations based on your expanded taste profile.
        """
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(debug=True)