# Provenance: app.py from a Hugging Face Space, uploaded by prince4332.
# Commit message: "Update app.py"
# Commit: c88e583 (verified)
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import gradio as gr
# ============================================================================
# STEP 1: DATA SIMULATION & USER SETUP
# ============================================================================

# Id of the synthetic demo user whose history and recommendations are shown.
# NOTE(review): assumed to be absent from the ratings CSV -- confirm.
TARGET_USER_ID = 9999

# Every seed rating carries the same fixed timestamp.
_SEED_TIMESTAMP = 1609459200

# (movieId, rating, title, genres) for the five movies the demo user starts with.
# NOTE(review): the movieId/title pairs are hard-coded here and are not checked
# against the titles in the ratings CSV -- verify they refer to the same movies.
_SEED_MOVIES = [
    (1, 5.0, 'Toy Story (1995)', 'Animation|Children|Comedy'),
    (2, 4.5, 'The Matrix (1999)', 'Action|Sci-Fi|Thriller'),
    (3, 5.0, 'Inception (2010)', 'Action|Sci-Fi|Thriller'),
    (4, 4.0, 'The Lion King (1994)', 'Animation|Children|Drama|Musical'),
    (5, 4.5, 'Interstellar (2014)', 'Adventure|Drama|Sci-Fi'),
]

# Immutable starting point; reset_history() copies it back into place.
initial_user_history = pd.DataFrame(
    [
        {
            'userId': TARGET_USER_ID,
            'movieId': seed_movie_id,
            'rating': seed_rating,
            'timestamp': _SEED_TIMESTAMP,
            'title': seed_title,
            'genres': seed_genres,
        }
        for seed_movie_id, seed_rating, seed_title, seed_genres in _SEED_MOVIES
    ]
)

# Mutable working copy that grows as the user adds movies.
synthetic_user_history = initial_user_history.copy()
# Read the combined ratings + metadata sample shipped next to this script.
combined_sampled_data = pd.read_csv('./sampled_movie_ratings_for_gradio.csv')

_RATING_COLUMNS = ['userId', 'movieId', 'rating', 'timestamp']
_METADATA_COLUMNS = ['movieId', 'title', 'genres']

# Rating events exactly as they appear in the CSV.
raw_ratings_df = combined_sampled_data[_RATING_COLUMNS].copy()

# One metadata row per movieId (first occurrence wins).
_movie_metadata = combined_sampled_data[_METADATA_COLUMNS]
full_movies_metadata_df = _movie_metadata.drop_duplicates(subset=['movieId']).copy()
movies_db = full_movies_metadata_df[_METADATA_COLUMNS].copy()

# Base table for every user's ratings; mock and synthetic ratings are added later.
ratings_df = raw_ratings_df[_RATING_COLUMNS[:3]].copy()

# Titles offered by the autocomplete dropdown.
all_movie_titles = movies_db['title'].tolist()
# Create a mock ratings dataset (simulate other users rating movies).
# The legacy global seed is kept so the generated ratings stay reproducible.
np.random.seed(42)
n_users = 100
n_movies = len(movies_db)  # Use the number of movies in our metadata DB

# Pick the first `n_users` positive ids that are not already taken, so a mock
# user never shares an id with a user from the CSV or with the demo user.
# Sharing an id would merge two unrelated rating profiles in the user-item
# matrix. When nothing collides this yields 1..n_users, exactly as before.
_taken_user_ids = set(ratings_df['userId'].unique().tolist()) | {TARGET_USER_ID}
_mock_user_ids = []
_candidate_user_id = 1
while len(_mock_user_ids) < n_users:
    if _candidate_user_id not in _taken_user_ids:
        _mock_user_ids.append(_candidate_user_id)
    _candidate_user_id += 1

mock_ratings = []
for user_id in _mock_user_ids:
    # Each user rates 5-15 random movies, capped at the catalogue size because
    # sampling without replacement cannot draw more items than exist.
    n_ratings = min(np.random.randint(5, 16), n_movies)
    # Sample movieId from movies_db to ensure consistency
    movie_ids = np.random.choice(movies_db['movieId'].values, n_ratings, replace=False)
    ratings = np.random.uniform(2.5, 5.0, n_ratings)
    mock_ratings.extend(
        {'userId': user_id, 'movieId': movie_id, 'rating': round(rating, 1)}
        for movie_id, rating in zip(movie_ids, ratings)
    )

# Append mock ratings to the initial ratings_df
ratings_df = pd.concat([ratings_df, pd.DataFrame(mock_ratings)], ignore_index=True)
# movieId -> 0-based row position in movies_db; filled in by rebuild_models().
movie_id_to_idx = {}


# Function to rebuild recommendation models based on current history
def rebuild_models():
    """Recompute every model the recommender reads from module globals.

    Combines ``ratings_df`` with the current ``synthetic_user_history`` and
    rebinds four globals:

    * ``user_item_matrix`` -- users x movies rating table, 0 where unrated
      (repeated ratings of the same movie by the same user are averaged by
      ``pivot_table``).
    * ``user_similarity_df`` -- user-user cosine similarity of that table.
    * ``content_similarity`` -- movie-movie cosine similarity of TF-IDF
      vectors built from the ``|``-separated genre strings in ``movies_db``.
    * ``movie_id_to_idx`` -- movieId to row position in ``movies_db``, i.e.
      the row/column position of that movie in ``content_similarity``.
    """
    global user_item_matrix, user_similarity_df, content_similarity, movie_id_to_idx

    # Merge synthetic user into the ratings dataset
    all_ratings = pd.concat(
        [ratings_df, synthetic_user_history[['userId', 'movieId', 'rating']]],
        ignore_index=True,
    )

    # ============================================================================
    # STEP 2: BUILD RECOMMENDATION MODELS
    # ============================================================================
    # Collaborative filtering only needs (userId, movieId, rating), so the
    # ratings are pivoted directly instead of first being joined to metadata.
    user_item_matrix = all_ratings.pivot_table(
        index='userId',
        columns='movieId',
        values='rating',
    ).fillna(0)

    # Calculate user-user similarity matrix
    user_similarity_df = pd.DataFrame(
        cosine_similarity(user_item_matrix),
        index=user_item_matrix.index,
        columns=user_item_matrix.index,
    )

    # Content-based model: TF-IDF over genres, each '|'-separated label being
    # one token. Missing genres become an empty document, because the
    # vectorizer rejects NaN input.
    genre_documents = movies_db['genres'].fillna('').astype(str)
    tfidf = TfidfVectorizer(token_pattern=r'[^|]+')
    tfidf_matrix = tfidf.fit_transform(genre_documents)
    content_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Create a mapping from movieId to its 0-based positional index in movies_db
    movie_id_to_idx = {movie_id: idx for idx, movie_id in enumerate(movies_db['movieId'])}


# Initialize models
rebuild_models()
# ============================================================================
# STEP 3: RECOMMENDATION FUNCTIONS
# ============================================================================
def get_similar_users(user_id, n=5):
    """Return the ``n`` users most similar to ``user_id``.

    Similarities are read from the global ``user_similarity_df``.

    Args:
        user_id: User whose neighbours are wanted.
        n: Maximum number of neighbours to return; values below 1 give an
            empty result.

    Returns:
        DataFrame with columns ``'Similar User ID'`` and
        ``'Similarity Score'`` (rounded to 4 decimals), most similar first.
        Empty, with the same columns, when ``user_id`` is unknown.
    """
    if user_id not in user_similarity_df.index:
        return pd.DataFrame(columns=['Similar User ID', 'Similarity Score'])

    # Remove the user explicitly rather than skipping the first sorted entry:
    # when another user ties with the self-similarity (or all similarities are
    # 0), the user is not guaranteed to be sorted first.
    neighbour_scores = user_similarity_df[user_id].drop(labels=user_id, errors='ignore')
    top_neighbours = neighbour_scores.sort_values(ascending=False, kind='stable').head(max(n, 0))

    return pd.DataFrame({
        'Similar User ID': top_neighbours.index,
        'Similarity Score': [round(score, 4) for score in top_neighbours.values],
    })
def _collaborative_scores(user_id, candidate_ids):
    """Similarity-weighted neighbour rating for each candidate movie."""
    similar_users = get_similar_users(user_id, n=10)
    if len(similar_users) == 0:
        return np.zeros(len(candidate_ids))

    # Only neighbours that have a row in the rating matrix can contribute.
    known = similar_users[similar_users['Similar User ID'].isin(user_item_matrix.index)]
    weights = known['Similarity Score'].to_numpy(dtype=float)
    neighbour_ratings = user_item_matrix.loc[
        known['Similar User ID'].tolist(), candidate_ids
    ].to_numpy(dtype=float)
    # 0 means "not rated", so such cells add nothing to the weighted sum.
    neighbour_ratings = np.where(neighbour_ratings > 0, neighbour_ratings, 0.0)

    # The denominator is the total similarity of *all* returned neighbours,
    # including those who did not rate a given movie; the epsilon avoids 0/0.
    total_similarity = similar_users['Similarity Score'].sum() + 1e-10
    return weights @ neighbour_ratings / total_similarity


def _content_scores(candidate_ids, rated_ids):
    """Genre similarity between each candidate and the user's rated movies."""
    rated_positions = [movie_id_to_idx[m] for m in rated_ids if m in movie_id_to_idx]
    if not rated_positions:
        return np.zeros(len(candidate_ids))

    candidate_positions = [movie_id_to_idx[m] for m in candidate_ids]
    pairwise = np.asarray(content_similarity[np.ix_(candidate_positions, rated_positions)])
    # Divide by every rated movie, including any missing from movies_db.
    return pairwise.sum(axis=1) / len(rated_ids)


def get_top_movies(user_id, search_query=None, n=15, alpha=0.6):
    """Get the top ``n`` recommended movies using hybrid filtering.

    Candidates are the movies the user has not rated (value 0 in the global
    ``user_item_matrix``) that also exist in ``movies_db``. Each one is scored
    as ``alpha * collaborative / 5 + (1 - alpha) * content`` and the result is
    scaled by 5 for display.

    Args:
        user_id: Target user ID.
        search_query: Optional title filter. A string equal to a title in
            ``movies_db`` keeps only that title; any other non-blank string is
            matched as a case-insensitive literal substring.
        n: Number of recommendations.
        alpha: Weight for collaborative filtering (1 - alpha for content-based).

    Returns:
        DataFrame with columns ``'Movie Title'``, ``'Genre'`` and
        ``'Estimated Score'``, best first. Empty, with the same columns, when
        the user is unknown or has no unrated movies.
    """
    columns = ['Movie Title', 'Genre', 'Estimated Score']
    if user_id not in user_item_matrix.index:
        return pd.DataFrame(columns=columns)

    user_ratings = user_item_matrix.loc[user_id]
    rated_ids = user_ratings[user_ratings > 0].index.tolist()
    candidate_ids = [
        movie_id
        for movie_id in user_ratings[user_ratings == 0].index.tolist()
        if movie_id in movie_id_to_idx
    ]
    if not candidate_ids:
        # Nothing left to recommend; sorting an empty, column-less frame
        # would raise a KeyError.
        return pd.DataFrame(columns=columns)

    # Neighbours and the rated list are the same for every candidate, so both
    # scores are computed once for all candidates instead of once per movie.
    collab_scores = _collaborative_scores(user_id, candidate_ids)
    content_scores = _content_scores(candidate_ids, rated_ids)
    hybrid_scores = alpha * (collab_scores / 5.0) + (1 - alpha) * content_scores

    # movie_id_to_idx holds row positions in movies_db, hence iloc.
    details = movies_db.iloc[[movie_id_to_idx[m] for m in candidate_ids]]
    result_df = pd.DataFrame({
        'Movie Title': details['title'].to_numpy(),
        'Genre': details['genres'].to_numpy(),
        # Scale to 0-5 range
        'Estimated Score': [round(float(score) * 5, 2) for score in hybrid_scores],
    }).sort_values('Estimated Score', ascending=False, kind='stable')

    # Apply search filter if provided (exact match or partial)
    if search_query and search_query.strip():
        if search_query in movies_db['title'].values:
            result_df = result_df[result_df['Movie Title'] == search_query]
        else:
            # Literal substring match: titles and user input routinely contain
            # regex metacharacters such as '(' that must not be interpreted.
            needle = search_query.strip().lower()
            result_df = result_df[
                result_df['Movie Title'].str.lower().str.contains(needle, regex=False, na=False)
            ]

    return result_df.head(n).reset_index(drop=True)
def get_recommendations(search_query=None):
    """Collect everything the UI shows for the target user.

    Args:
        search_query: Optional title filter forwarded to ``get_top_movies``.

    Returns:
        Tuple ``(user_history_df, similar_users_df, recommended_movies_df)``.
        When a filter is given but matches nothing, the unfiltered
        recommendations are returned instead.
    """
    similar_users = get_similar_users(TARGET_USER_ID, n=5)
    recommended_movies = get_top_movies(TARGET_USER_ID, search_query=search_query, n=15)

    # Retry without the filter only if a filter was actually applied; with no
    # (or a blank) query the retry would repeat the identical computation.
    filter_applied = bool(search_query and search_query.strip())
    if filter_applied and len(recommended_movies) == 0:
        recommended_movies = get_top_movies(TARGET_USER_ID, search_query=None, n=15)

    return synthetic_user_history, similar_users, recommended_movies
def add_movie_to_history(movie_title):
    """Add a movie to the target user's history with a default rating of 4.0.

    The movie is looked up in ``movies_db`` by exact title. On success the
    global ``synthetic_user_history`` is replaced by an extended copy and the
    recommendation models are rebuilt.

    Args:
        movie_title: Exact title of the movie to add.

    Returns:
        ``True`` if the movie was added; ``False`` if the title is blank, is
        not in ``movies_db``, or is already in the history (nothing changes
        and the models are not rebuilt in those cases).
    """
    global synthetic_user_history

    if not movie_title or not movie_title.strip():
        return False

    # Check if movie exists in database
    movie_match = movies_db[movies_db['title'] == movie_title]
    if movie_match.empty:
        return False

    # Check if movie is already in user's history.
    # NOTE(review): the check is by title only; a history row with the same
    # movieId but a different title is not detected -- confirm whether the
    # movieId should be checked as well.
    if movie_title in synthetic_user_history['title'].values:
        return False

    movie_info = movie_match.iloc[0]
    new_entry = pd.DataFrame({
        'userId': [TARGET_USER_ID],
        'movieId': [movie_info['movieId']],
        'rating': [4.0],  # default rating for movies added from the UI
        'timestamp': [1609459200],
        'title': [movie_info['title']],
        'genres': [movie_info['genres']],
    })
    synthetic_user_history = pd.concat([synthetic_user_history, new_entry], ignore_index=True)

    # The models depend on the history, so they must be refreshed.
    rebuild_models()
    return True
# ============================================================================
# STEP 4: GRADIO INTERFACE
# ============================================================================
def update_recommendations(search_query):
    """Handle a dropdown selection or free-text search.

    * Blank or missing query: show the unfiltered recommendations.
    * Query equal to a known movie title: add that movie to the history
      (unless it is already there) and show refreshed recommendations.
    * Any other text: show the recommendations whose title matches it,
      falling back to the unfiltered list when nothing matches.

    Args:
        search_query: Value of the search dropdown.

    Returns:
        Tuple ``(user_history, similar_users, recommended_movies, status_msg)``
        matching the four Gradio outputs.
    """
    has_query = isinstance(search_query, str) and bool(search_query.strip())

    if not has_query:
        # No search query - show all recommendations
        user_history, similar_users, recommended_movies = get_recommendations(None)
        return user_history, similar_users, recommended_movies, "📋 Showing all recommendations"

    if search_query in all_movie_titles:
        # Record this before adding, so the status reflects what really happened.
        already_in_history = search_query in synthetic_user_history['title'].values
        add_movie_to_history(search_query)
        user_history, similar_users, recommended_movies = get_recommendations(None)
        if already_in_history:
            status_msg = f"ℹ️ '{search_query}' is already in your watch history."
        else:
            status_msg = f"✅ Added '{search_query}' to your watch history! Recommendations updated."
        return user_history, similar_users, recommended_movies, status_msg

    # Custom search term (partial match). The filtered list is requested
    # directly because get_recommendations() silently falls back to the
    # unfiltered list, which would hide the "no match" case from this function.
    recommended_movies = get_top_movies(TARGET_USER_ID, search_query=search_query, n=15)
    if len(recommended_movies) == 0:
        status_msg = f"⚠️ No recommendations found for '{search_query}'. Showing all recommendations."
        user_history, similar_users, recommended_movies = get_recommendations(None)
    else:
        status_msg = f"🔍 Showing recommendations matching: {search_query}"
        user_history = synthetic_user_history
        similar_users = get_similar_users(TARGET_USER_ID, n=5)
    return user_history, similar_users, recommended_movies, status_msg
def reset_history():
    """Reset the user's history to its initial state.

    Replaces the global ``synthetic_user_history`` with a fresh copy of
    ``initial_user_history`` and rebuilds the recommendation models.

    Returns:
        Tuple ``(user_history, similar_users, recommended_movies, status_msg)``
        matching the four Gradio outputs.
    """
    global synthetic_user_history
    # Copy so later additions never modify the pristine initial history.
    synthetic_user_history = initial_user_history.copy()
    rebuild_models()
    user_history, similar_users, recommended_movies = get_recommendations(None)
    return user_history, similar_users, recommended_movies, "🔄 History reset to initial state"
# Build the Gradio interface.
# The Markdown literals are written flush-left on purpose: leading spaces
# inside the string would be rendered by Markdown as a code block.
with gr.Blocks(title="Movie Recommendation System") as demo:
    gr.Markdown(
        f"""
# 🎬 Movie Recommendation System
### Personalized Recommendations for User #{TARGET_USER_ID}

This system uses **hybrid filtering** (collaborative + content-based) to recommend movies.

**Select a movie from the dropdown to add it to your watch history and get updated recommendations!**
"""
    )

    # Section 1: User Profile
    with gr.Row():
        gr.Markdown("## 👀 Your Watch History")
    with gr.Row():
        user_history_table = gr.Dataframe(
            value=synthetic_user_history,
            label="Rated Movies",
            interactive=False,
            wrap=True,
        )

    # Section 2: Search & Control
    with gr.Row():
        gr.Markdown("## 🔍 Search & Add Movies to History")
    with gr.Row():
        search_box = gr.Dropdown(
            choices=all_movie_titles,
            label="Search Movies (Autocomplete)",
            info="Type to search, then select a movie to add it to your history and update recommendations",
            allow_custom_value=True,
            interactive=True,
        )
        reset_btn = gr.Button("↺ Reset History", variant="secondary", scale=0)
    with gr.Row():
        status_text = gr.Markdown("📋 Showing all recommendations")

    # Section 3: Results
    gr.Markdown("## 📊 Recommendation Results")
    with gr.Row():
        # Table A: Similar Users
        with gr.Column(scale=1):
            gr.Markdown("### Similar Users")
            gr.Markdown("*Users with similar taste in movies*")
            similar_users_table = gr.Dataframe(
                label="",
                interactive=False,
                wrap=True,
            )
        # Table B: Recommended Movies
        with gr.Column(scale=1):
            gr.Markdown("### Recommended Movies")
            gr.Markdown("*Top picks based on your preferences*")
            recommended_movies_table = gr.Dataframe(
                label="",
                interactive=False,
                wrap=True,
            )

    # Load initial data
    demo.load(
        fn=get_recommendations,
        inputs=[],
        outputs=[user_history_table, similar_users_table, recommended_movies_table],
    )

    # Reset history functionality
    reset_btn.click(
        fn=reset_history,
        inputs=[],
        outputs=[user_history_table, similar_users_table, recommended_movies_table, status_text],
    )

    # Dropdown select event - triggers when an entry is chosen from the dropdown
    search_box.select(
        fn=update_recommendations,
        inputs=[search_box],
        outputs=[user_history_table, similar_users_table, recommended_movies_table, status_text],
    )

    gr.Markdown(
        """
---
**Algorithm Details:**

- **Collaborative Filtering:** Finds users with similar rating patterns using cosine similarity
- **Content-Based Filtering:** Matches movies by genre similarity using TF-IDF vectorization
- **Hybrid Score:** Weighted combination (60% collaborative, 40% content-based)

**How to Use:**

1. View your watch history above (starts with 5 movies)
2. Use the dropdown to search for a movie
3. Select it from the dropdown to add it to your history (rated 4.0 by default)
4. Watch your recommendations update in real-time based on your new preferences!
5. Click "Reset History" to return to the original 5 movies

**Note:** Adding movies to your history will immediately update your recommendations based on your expanded taste profile.
"""
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(debug=True)