Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| import gradio as gr | |
| # ============================================================================ | |
| # STEP 1: DATA SIMULATION & USER SETUP | |
| # ============================================================================ | |
# The demo always recommends for this single synthetic user id.
TARGET_USER_ID = 9999

# Seed watch history, one tuple per movie: (movieId, rating, title, genres).
_seed_rows = [
    (1, 5.0, 'Toy Story (1995)', 'Animation|Children|Comedy'),
    (2, 4.5, 'The Matrix (1999)', 'Action|Sci-Fi|Thriller'),
    (3, 5.0, 'Inception (2010)', 'Action|Sci-Fi|Thriller'),
    (4, 4.0, 'The Lion King (1994)', 'Animation|Children|Drama|Musical'),
    (5, 4.5, 'Interstellar (2014)', 'Adventure|Drama|Sci-Fi'),
]

# Every seed row shares the target user id and one fixed timestamp.
initial_user_history = pd.DataFrame(
    [
        (TARGET_USER_ID, movie_id, rating, 1609459200, title, genres)
        for movie_id, rating, title, genres in _seed_rows
    ],
    columns=['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres'],
)

# Mutable working copy; it is rebound whenever the history changes, while
# initial_user_history stays pristine so the history can be reset.
synthetic_user_history = initial_user_history.copy()
# Single CSV holding both the ratings and the movie metadata columns.
combined_sampled_data = pd.read_csv('./sampled_movie_ratings_for_gradio.csv')

# Rating events exactly as stored in the file.
raw_ratings_df = combined_sampled_data.loc[
    :, ['userId', 'movieId', 'rating', 'timestamp']
].copy()

# One metadata row per movie (the first occurrence of each movieId is kept).
full_movies_metadata_df = (
    combined_sampled_data.loc[:, ['movieId', 'title', 'genres']]
    .drop_duplicates(subset=['movieId'])
    .copy()
)
movies_db = full_movies_metadata_df.loc[:, ['movieId', 'title', 'genres']].copy()

# Base ratings table; mock and synthetic-user ratings are appended to it later.
ratings_df = raw_ratings_df.loc[:, ['userId', 'movieId', 'rating']].copy()

# Titles offered by the autocomplete dropdown.
all_movie_titles = movies_db['title'].tolist()
# Simulate additional users so collaborative filtering has neighbours to use.
# The global NumPy RNG is seeded so the mock ratings are reproducible.
np.random.seed(42)
n_users = 100
n_movies = len(movies_db)  # catalogue size; upper bound for one user's sample

# NOTE(review): mock user ids run 1..n_users. If the CSV already contains any
# of those ids, the mock ratings are attributed to those existing users --
# confirm whether that overlap is intended.
mock_ratings = []
for user_id in range(1, n_users + 1):
    # Each user rates 5-15 distinct movies. Sampling without replacement
    # raises ValueError when asked for more items than exist, so the draw is
    # capped at the catalogue size (no effect when there are >= 15 movies).
    n_ratings = min(np.random.randint(5, 16), n_movies)
    movie_ids = np.random.choice(movies_db['movieId'].values, n_ratings, replace=False)
    ratings = np.random.uniform(2.5, 5.0, n_ratings)
    mock_ratings.extend(
        {'userId': user_id, 'movieId': movie_id, 'rating': round(rating, 1)}
        for movie_id, rating in zip(movie_ids, ratings)
    )

# Append the mock ratings to the ratings loaded from the CSV.
ratings_df = pd.concat([ratings_df, pd.DataFrame(mock_ratings)], ignore_index=True)
# movieId -> 0-based row position in movies_db (filled in by rebuild_models).
movie_id_to_idx = {}


def rebuild_models():
    """Recompute every recommendation model from the current ratings.

    Rebinds four module-level names:
      * user_item_matrix   -- users x movies rating table, 0 where unrated
      * user_similarity_df -- user-to-user cosine similarity, labelled by userId
      * content_similarity -- movie-to-movie cosine similarity of TF-IDF genre
                              vectors, in movies_db row order
      * movie_id_to_idx    -- movieId -> row position in movies_db
    """
    global user_item_matrix, user_similarity_df, content_similarity, movie_id_to_idx

    # Base ratings plus whatever the synthetic user has rated so far.
    target_rows = synthetic_user_history[['userId', 'movieId', 'rating']]
    all_ratings = pd.concat([ratings_df, target_rows], ignore_index=True)

    # Left merge keeps every rating row, including movies missing from movies_db.
    full_data = pd.merge(all_ratings, movies_db, on='movieId', how='left')

    # Collaborative model: pivot to users x movies, then cosine similarity.
    pivoted = full_data.pivot_table(index='userId', columns='movieId', values='rating')
    user_item_matrix = pivoted.fillna(0)

    user_ids = user_item_matrix.index
    user_similarity_df = pd.DataFrame(
        cosine_similarity(user_item_matrix),
        index=user_ids,
        columns=user_ids,
    )

    # Content model: each '|'-separated genre is one TF-IDF token.
    genre_vectorizer = TfidfVectorizer(token_pattern=r'[^|]+')
    genre_vectors = genre_vectorizer.fit_transform(movies_db['genres'])
    content_similarity = cosine_similarity(genre_vectors, genre_vectors)

    # Row positions here line up with rows/columns of content_similarity.
    movie_id_to_idx = dict(zip(movies_db['movieId'], range(len(movies_db))))


# Build the models once at import time.
rebuild_models()
| # ============================================================================ | |
| # STEP 3: RECOMMENDATION FUNCTIONS | |
| # ============================================================================ | |
def get_similar_users(user_id, n=5):
    """Return the ``n`` users most similar to ``user_id``.

    Similarities are read from the module-level ``user_similarity_df``.

    Args:
        user_id: Label of the user to find neighbours for.
        n: Maximum number of neighbours to return.

    Returns:
        DataFrame with columns 'Similar User ID' and 'Similarity Score'
        (rounded to 4 decimals), ordered by descending similarity. It is
        empty when ``user_id`` is not present in the similarity matrix.
    """
    if user_id not in user_similarity_df.index:
        return pd.DataFrame(columns=['Similar User ID', 'Similarity Score'])

    # Remove the user by label instead of assuming it sorts into position 0:
    # another user can tie with (or beat) the self-similarity, in which case
    # positional slicing would return the user as its own neighbour.
    similar_users = (
        user_similarity_df[user_id]
        .drop(labels=user_id)
        .sort_values(ascending=False)
        .head(n)
    )

    return pd.DataFrame({
        'Similar User ID': similar_users.index,
        'Similarity Score': [round(score, 4) for score in similar_users.values],
    })
def get_top_movies(user_id, search_query=None, n=15, alpha=0.6):
    """Rank the movies ``user_id`` has not rated with a hybrid score.

    For every unrated movie two scores are blended:
      * collaborative -- similarity-weighted sum of the ratings given by the
        10 nearest users, divided by the sum of those users' similarities;
      * content-based -- genre similarity to the movies the user already
        rated, summed and divided by the number of rated movies.

    Args:
        user_id: Target user ID.
        search_query: Optional title filter. A value equal to a title in
            ``movies_db`` is matched exactly; anything else is matched as a
            case-insensitive literal substring.
        n: Maximum number of rows to return.
        alpha: Weight of the collaborative score (``1 - alpha`` goes to the
            content-based score).

    Returns:
        DataFrame with columns 'Movie Title', 'Genre' and 'Estimated Score',
        sorted by descending score. It is empty (same columns) when the user
        is unknown or no movie can be scored.
    """
    result_columns = ['Movie Title', 'Genre', 'Estimated Score']
    if user_id not in user_item_matrix.index:
        return pd.DataFrame(columns=result_columns)

    # A 0 in the user-item matrix means "not rated".
    user_ratings = user_item_matrix.loc[user_id]
    unrated_movies = user_ratings[user_ratings == 0].index.tolist()
    user_rated_movies = user_ratings[user_ratings > 0].index.tolist()

    # --- Collaborative part: identical for every candidate, so do it once ---
    similar_users = get_similar_users(user_id, n=10)
    collab_by_movie = {}
    if len(similar_users) > 0:
        known = similar_users[
            similar_users['Similar User ID'].isin(user_item_matrix.index)
        ]
        neighbour_ratings = user_item_matrix.loc[
            known['Similar User ID'].tolist()
        ].to_numpy()
        weights = known['Similarity Score'].to_numpy(dtype=float)
        # Only positive ratings count; unrated cells contribute nothing.
        positive = np.where(neighbour_ratings > 0, neighbour_ratings, 0.0)
        # The epsilon keeps the division defined when all similarities are 0.
        normaliser = similar_users['Similarity Score'].sum() + 1e-10
        collab_by_movie = dict(
            zip(user_item_matrix.columns, (weights @ positive) / normaliser)
        )

    # --- Content part: positions of the rated movies in content_similarity ---
    n_rows, n_cols = content_similarity.shape
    rated_positions = []
    for rated_movie_id in user_rated_movies:
        rated_idx = movie_id_to_idx.get(rated_movie_id)
        if rated_idx is None:
            continue  # rated movie has no metadata row, hence no genre vector
        if rated_idx < n_cols:
            rated_positions.append(rated_idx)
        else:
            # Only possible if the mapping and the matrix are out of sync.
            print(f"Warning: Content similarity index out of bounds for rated_movie_id={rated_movie_id}")
    rated_positions = np.array(rated_positions, dtype=int)

    # movieId -> (title, genres); the first row wins if an id repeats.
    catalog = {}
    for catalog_id, title, genres in zip(
        movies_db['movieId'], movies_db['title'], movies_db['genres']
    ):
        catalog.setdefault(catalog_id, (title, genres))

    movie_scores = []
    for movie_id in unrated_movies:
        if movie_id not in catalog:
            continue  # no metadata to display for this movie

        collab_score = collab_by_movie.get(movie_id, 0.0)

        content_score = 0.0
        movie_idx = movie_id_to_idx.get(movie_id)
        if user_rated_movies and movie_idx is not None:
            if movie_idx < n_rows:
                content_score = (
                    content_similarity[movie_idx, rated_positions].sum()
                    / len(user_rated_movies)
                )
            else:
                print(f"Warning: Content similarity index out of bounds for movie_id={movie_id}")

        # The collaborative score is divided by 5 before blending, and the
        # blend is multiplied by 5 again for display.
        hybrid_score = alpha * (collab_score / 5.0) + (1 - alpha) * content_score

        title, genres = catalog[movie_id]
        movie_scores.append({
            'Movie Title': title,
            'Genre': genres,
            'Estimated Score': round(hybrid_score * 5, 2),
        })

    # Without any rows the frame would have no 'Estimated Score' column and
    # sorting by it would raise KeyError.
    if not movie_scores:
        return pd.DataFrame(columns=result_columns)

    result_df = pd.DataFrame(movie_scores).sort_values(
        'Estimated Score', ascending=False
    )

    if search_query and search_query.strip():
        if search_query in movies_db['title'].values:
            # Exact title, e.g. one picked from the autocomplete list.
            result_df = result_df[result_df['Movie Title'] == search_query]
        else:
            # Literal substring match: user text such as "Toy Story (" must
            # not be compiled as a regular expression.
            query = search_query.lower()
            result_df = result_df[
                result_df['Movie Title'].str.lower().str.contains(
                    query, na=False, regex=False
                )
            ]

    return result_df.head(n).reset_index(drop=True)
def get_recommendations(search_query=None):
    """Collect everything the UI shows for the target user.

    Args:
        search_query: Optional title filter passed on to ``get_top_movies``.

    Returns:
        Tuple ``(user_history_df, similar_users_df, recommended_movies_df)``.
        If the filter leaves no recommendations, the unfiltered top list is
        returned in its place.
    """
    neighbours = get_similar_users(TARGET_USER_ID, n=5)

    picks = get_top_movies(TARGET_USER_ID, search_query=search_query, n=15)
    if not len(picks):
        # Nothing matched the filter: fall back to the unfiltered ranking.
        picks = get_top_movies(TARGET_USER_ID, search_query=None, n=15)

    return synthetic_user_history, neighbours, picks
def add_movie_to_history(movie_title):
    """Append ``movie_title`` to the target user's history and refresh models.

    The movie is stored with a rating of 4.0 and the same fixed timestamp as
    the seed history. Nothing happens (and ``None`` is returned) when the
    title is blank, is not an exact title in ``movies_db``, or is already
    present in the history.
    """
    global synthetic_user_history

    # Blank or whitespace-only input is ignored.
    if not (movie_title and movie_title.strip()):
        return

    # The title must match a catalogue entry exactly.
    candidates = movies_db.loc[movies_db['title'] == movie_title]
    if not len(candidates):
        return

    # Never add the same title twice.
    if movie_title in synthetic_user_history['title'].values:
        return

    chosen = candidates.iloc[0]
    new_entry = pd.DataFrame({
        'userId': [TARGET_USER_ID],
        'movieId': [chosen['movieId']],
        'rating': [4.0],
        'timestamp': [1609459200],
        'title': [chosen['title']],
        'genres': [chosen['genres']]
    })

    synthetic_user_history = pd.concat(
        [synthetic_user_history, new_entry], ignore_index=True
    )

    # The new rating changes the user vector, so the models are recomputed.
    rebuild_models()
| # ============================================================================ | |
| # STEP 4: GRADIO INTERFACE | |
| # ============================================================================ | |
def update_recommendations(search_query):
    """Handle a dropdown selection or free-text query from the UI.

    * An exact catalogue title is added to the watch history (unless it is
      already there) and the unfiltered recommendations are shown.
    * Any other non-blank text filters the recommendations; if nothing
      matches, the unfiltered list is shown with a warning.
    * Blank input shows the unfiltered recommendations.

    Args:
        search_query: Text currently held by the search dropdown (may be None).

    Returns:
        Tuple ``(user_history_df, similar_users_df, recommended_movies_df,
        status_message)``.
    """
    has_query = bool(search_query and search_query.strip())

    if has_query and search_query in all_movie_titles:
        # add_movie_to_history skips titles already in the history, so check
        # first in order to report what actually happened.
        already_present = search_query in synthetic_user_history['title'].values
        add_movie_to_history(search_query)
        user_history, similar_users, recommended_movies = get_recommendations(None)
        if already_present:
            status_msg = f"'{search_query}' is already in your watch history. Recommendations unchanged."
        else:
            status_msg = f"β Added '{search_query}' to your watch history! Recommendations updated."
        return user_history, similar_users, recommended_movies, status_msg

    if has_query:
        # Query get_top_movies directly: get_recommendations replaces an empty
        # filtered result with the unfiltered list, which would make the
        # "nothing found" case impossible to detect here.
        matches = get_top_movies(TARGET_USER_ID, search_query=search_query, n=15)
        if len(matches) == 0:
            status_msg = f"β οΈ No recommendations found for '{search_query}'. Showing all recommendations."
            user_history, similar_users, recommended_movies = get_recommendations(None)
            return user_history, similar_users, recommended_movies, status_msg
        status_msg = f"π Showing recommendations matching: {search_query}"
        similar_users = get_similar_users(TARGET_USER_ID, n=5)
        return synthetic_user_history, similar_users, matches, status_msg

    # No search query - show all recommendations.
    user_history, similar_users, recommended_movies = get_recommendations(None)
    status_msg = "π Showing all recommendations"
    return user_history, similar_users, recommended_movies, status_msg
def reset_history():
    """Restore the seed watch history and recompute everything.

    Returns:
        Tuple ``(user_history_df, similar_users_df, recommended_movies_df,
        status_message)`` for the UI.
    """
    global synthetic_user_history

    # Work on a fresh copy so the pristine seed frame is never mutated.
    synthetic_user_history = initial_user_history.copy()
    rebuild_models()

    refreshed = get_recommendations(None)
    return (*refreshed, "π History reset to initial state")
# Build the Gradio interface: watch history on top, a search dropdown with a
# reset button below it, then the two result tables side by side.
with gr.Blocks(title="Movie Recommendation System") as demo:
    gr.Markdown(
        f"""
        # π¬ Movie Recommendation System

        ### Personalized Recommendations for User #{TARGET_USER_ID}

        This system uses **hybrid filtering** (collaborative + content-based) to recommend movies.

        **Select a movie from the dropdown to add it to your watch history and get updated recommendations!**
        """
    )

    # Section 1: User Profile
    with gr.Row():
        gr.Markdown("## π€ Your Watch History")
    with gr.Row():
        user_history_table = gr.Dataframe(
            value=synthetic_user_history,
            label="Rated Movies",
            interactive=False,
            wrap=True
        )

    # Section 2: Search & Control
    with gr.Row():
        gr.Markdown("## π Search & Add Movies to History")
    with gr.Row():
        # allow_custom_value lets the user type text that is not in the list.
        search_box = gr.Dropdown(
            choices=all_movie_titles,
            label="Search Movies (Autocomplete)",
            info="Type to search, then select a movie to add it to your history and update recommendations",
            allow_custom_value=True,
            interactive=True
        )
        reset_btn = gr.Button("βΊ Reset History", variant="secondary", scale=0)
    with gr.Row():
        status_text = gr.Markdown("π Showing all recommendations")

    # Section 3: Results
    gr.Markdown("## π Recommendation Results")
    with gr.Row():
        # Table A: Similar Users
        with gr.Column(scale=1):
            gr.Markdown("### Similar Users")
            gr.Markdown("*Users with similar taste in movies*")
            similar_users_table = gr.Dataframe(
                label="",
                interactive=False,
                wrap=True
            )
        # Table B: Recommended Movies
        with gr.Column(scale=1):
            gr.Markdown("### Recommended Movies")
            gr.Markdown("*Top picks based on your preferences*")
            recommended_movies_table = gr.Dataframe(
                label="",
                interactive=False,
                wrap=True
            )

    # Fill the three tables when the page loads.
    demo.load(
        fn=get_recommendations,
        inputs=[],
        outputs=[user_history_table, similar_users_table, recommended_movies_table]
    )

    # Reset history functionality
    reset_btn.click(
        fn=reset_history,
        inputs=[],
        outputs=[user_history_table, similar_users_table, recommended_movies_table, status_text]
    )

    # Dropdown select event - fires when the user picks an entry.
    search_box.select(
        fn=update_recommendations,
        inputs=[search_box],
        outputs=[user_history_table, similar_users_table, recommended_movies_table, status_text]
    )

    gr.Markdown(
        """
        ---

        **Algorithm Details:**

        - **Collaborative Filtering:** Finds users with similar rating patterns using cosine similarity
        - **Content-Based Filtering:** Matches movies by genre similarity using TF-IDF vectorization
        - **Hybrid Score:** Weighted combination (60% collaborative, 40% content-based)

        **How to Use:**

        1. View your watch history above (starts with 5 movies)
        2. Use the dropdown to search for a movie
        3. Select a movie from the dropdown to add it to your history (rated 4.0 by default)
        4. Watch your recommendations update in real-time based on your new preferences!
        5. Click "Reset History" to return to the original 5 movies

        **Note:** Adding movies to your history will immediately update your recommendations based on your expanded taste profile.
        """
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(debug=True)