Spaces:

prince4332
/

RecommendationSystem

Sleeping

App Files Files Community

RecommendationSystem / app.py

prince4332

Update app.py

c88e583 verified about 2 months ago

raw

history blame contribute delete

17.9 kB

	import pandas as pd
	import numpy as np
	from sklearn.metrics.pairwise import cosine_similarity
	from sklearn.feature_extraction.text import TfidfVectorizer
	import gradio as gr

	# ============================================================================
	# STEP 1: DATA SIMULATION & USER SETUP
	# ============================================================================

	# Fixed ID of the demo user. It is intended not to collide with any user in
	# the loaded ratings data.
	TARGET_USER_ID = 9999

	# Seed watch history for the demo user: five highly rated movies.
	# Genres are pipe-delimited ("A|B|C"), the same delimiter the TF-IDF
	# tokenizer in rebuild_models() splits on. The previous literals used "\|",
	# an invalid escape sequence that left literal backslashes in the table.
	initial_user_history = pd.DataFrame({
	    'userId': [TARGET_USER_ID] * 5,
	    'movieId': [1, 2, 3, 4, 5],
	    'rating': [5.0, 4.5, 5.0, 4.0, 4.5],
	    'timestamp': [1609459200] * 5,  # Fixed timestamp
	    'title': [
	        'Toy Story (1995)',
	        'The Matrix (1999)',
	        'Inception (2010)',
	        'The Lion King (1994)',
	        'Interstellar (2014)',
	    ],
	    'genres': [
	        'Animation|Children|Comedy',
	        'Action|Sci-Fi|Thriller',
	        'Action|Sci-Fi|Thriller',
	        'Animation|Children|Drama|Musical',
	        'Adventure|Drama|Sci-Fi',
	    ],
	})

	# Mutable working copy of the history. It is a separate object so the seed
	# frame above stays untouched and can be restored by reset_history().
	synthetic_user_history = initial_user_history.copy()

	# Read the sampled table that carries both rating events and movie metadata.
	combined_sampled_data = pd.read_csv('./sampled_movie_ratings_for_gradio.csv')

	# Rating events only: who rated which movie, the score, and when.
	raw_ratings_df = combined_sampled_data.loc[
	    :, ['userId', 'movieId', 'rating', 'timestamp']
	].copy()

	# Movie metadata, reduced to one row per movieId (the first occurrence wins).
	full_movies_metadata_df = (
	    combined_sampled_data.loc[:, ['movieId', 'title', 'genres']]
	    .drop_duplicates(subset='movieId')
	    .copy()
	)
	movies_db = full_movies_metadata_df.loc[:, ['movieId', 'title', 'genres']].copy()

	# Base ratings table; simulated and demo-user ratings are added to it later.
	ratings_df = raw_ratings_df.loc[:, ['userId', 'movieId', 'rating']].copy()

	# Titles offered by the search dropdown.
	all_movie_titles = movies_db['title'].to_list()

	# Simulate extra users who each rate a handful of random catalogue movies.
	# The seed is fixed so the simulated crowd is identical on every start-up.
	# NOTE(review): the simulated IDs 1..n_users may overlap with real userIds
	# from the CSV, in which case their ratings are merged into those users -
	# confirm against the data.
	np.random.seed(42)
	n_users = 100
	n_movies = len(movies_db)  # Number of movies in the metadata DB

	mock_ratings = []
	for user_id in range(1, n_users + 1):
	    # Each user rates 5-15 movies. The count is clamped to the catalogue size
	    # because sampling without replacement raises when asked for more items
	    # than exist.
	    n_ratings = min(np.random.randint(5, 16), n_movies)
	    # Sample movieIds from movies_db so every mock rating has metadata.
	    movie_ids = np.random.choice(movies_db['movieId'].values, n_ratings, replace=False)
	    ratings = np.random.uniform(2.5, 5.0, n_ratings)

	    mock_ratings.extend(
	        {'userId': user_id, 'movieId': movie_id, 'rating': round(rating, 1)}
	        for movie_id, rating in zip(movie_ids, ratings)
	    )

	# Append the simulated ratings to the base ratings table.
	ratings_df = pd.concat([ratings_df, pd.DataFrame(mock_ratings)], ignore_index=True)

	# Maps each movieId to its 0-based row position in movies_db, which is also
	# its row/column position in content_similarity.
	movie_id_to_idx = {}

	# Genre-similarity matrix; stays None until rebuild_models() first fills it.
	content_similarity = None


	def rebuild_models():
	    """
	    Recompute the recommendation models from the current watch history.

	    Writes these module-level globals:
	        user_item_matrix: users x movies rating table, unrated cells set to 0.
	        user_similarity_df: user-user cosine similarity over that table.
	        content_similarity: movie-movie cosine similarity of TF-IDF genre vectors.
	        movie_id_to_idx: movieId -> row position in movies_db.

	    The two content-based objects depend only on movies_db, which this module
	    never modifies after loading, so they are computed once and reused on
	    later calls instead of being rebuilt for every history change.
	    """
	    global user_item_matrix, user_similarity_df, content_similarity, movie_id_to_idx

	    # Combine the base ratings with the demo user's current history.
	    all_ratings = pd.concat([
	        ratings_df,
	        synthetic_user_history[['userId', 'movieId', 'rating']],
	    ], ignore_index=True)

	    # ========================================================================
	    # STEP 2: BUILD RECOMMENDATION MODELS
	    # ========================================================================

	    # Collaborative filtering: pivot straight from the ratings. Only userId,
	    # movieId and rating are needed, so no metadata merge is required.
	    user_item_matrix = all_ratings.pivot_table(
	        index='userId',
	        columns='movieId',
	        values='rating',
	    ).fillna(0)

	    user_similarity_df = pd.DataFrame(
	        cosine_similarity(user_item_matrix),
	        index=user_item_matrix.index,
	        columns=user_item_matrix.index,
	    )

	    # Content-based filtering: build once, or again if the catalogue size no
	    # longer matches the cached matrix.
	    if content_similarity is None or content_similarity.shape[0] != len(movies_db):
	        # Each pipe-separated genre is one token.
	        tfidf = TfidfVectorizer(token_pattern=r'[^\|]+')
	        tfidf_matrix = tfidf.fit_transform(movies_db['genres'])
	        content_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
	        movie_id_to_idx = {
	            movie_id: idx for idx, movie_id in enumerate(movies_db['movieId'])
	        }


	# Build the models once at import time so the first request can be served.
	rebuild_models()

	# ============================================================================
	# STEP 3: RECOMMENDATION FUNCTIONS
	# ============================================================================

	def get_similar_users(user_id, n=5):
	    """
	    Get the top N users most similar to ``user_id``.

	    Similarities are read from the module-level ``user_similarity_df``.

	    Args:
	        user_id: User to find neighbours for.
	        n: Maximum number of neighbours to return.

	    Returns:
	        DataFrame with columns 'Similar User ID' and 'Similarity Score'
	        (rounded to 4 decimals), ordered from most to least similar. It is
	        empty, with the same columns, when ``user_id`` is unknown.
	    """
	    if user_id not in user_similarity_df.index:
	        return pd.DataFrame(columns=['Similar User ID', 'Similarity Score'])

	    # Remove the user explicitly rather than skipping the first sorted entry:
	    # when another user ties with the self-similarity, the user is not
	    # guaranteed to sort first and could be reported as their own neighbour.
	    neighbours = (
	        user_similarity_df[user_id]
	        .drop(labels=user_id)
	        .sort_values(ascending=False, kind='stable')
	        .head(n)
	    )

	    return pd.DataFrame({
	        'Similar User ID': neighbours.index,
	        'Similarity Score': neighbours.round(4).to_numpy(),
	    })

	def get_top_movies(user_id, search_query=None, n=15, alpha=0.6):
	    """
	    Get top N recommended movies using hybrid filtering.

	    For every movie the user has not rated, two scores are combined:

	    * Collaborative: similarity-weighted sum of the ratings given by the
	      user's 10 nearest neighbours, divided by the total similarity of all
	      those neighbours (whether or not they rated the movie).
	    * Content-based: mean genre similarity between the movie and the movies
	      the user has rated.

	    Args:
	        user_id: Target user ID.
	        search_query: Optional title filter. A string equal to a catalogue
	            title selects exactly that title; anything else is matched as a
	            case-insensitive literal substring.
	        n: Number of recommendations.
	        alpha: Weight for collaborative filtering (1 - alpha for content-based).

	    Returns:
	        DataFrame with columns 'Movie Title', 'Genre' and 'Estimated Score',
	        sorted by score in descending order. It is empty, with the same
	        columns, when the user is unknown or nothing can be scored.
	    """
	    if user_id not in user_item_matrix.index:
	        return pd.DataFrame(columns=['Movie Title', 'Genre', 'Estimated Score'])

	    # rebuild_models() fills missing ratings with 0, so 0 means "not rated".
	    user_ratings = user_item_matrix.loc[user_id]
	    user_rated_movies = user_ratings[user_ratings > 0].index.tolist()
	    unrated_movies = user_ratings[user_ratings == 0].index.tolist()

	    # Only movies present in the metadata can be displayed. movie_id_to_idx
	    # holds each movie's row position in movies_db, which is also its position
	    # in content_similarity (both are built together in rebuild_models()).
	    candidates = [movie_id for movie_id in unrated_movies if movie_id in movie_id_to_idx]
	    candidate_idx = [movie_id_to_idx[movie_id] for movie_id in candidates]

	    # ---- Collaborative score -------------------------------------------------
	    # The neighbours do not depend on the candidate movie, so fetch them once.
	    similar_users = get_similar_users(user_id, n=10)
	    known = similar_users[similar_users['Similar User ID'].isin(user_item_matrix.index)]
	    weights = known['Similarity Score'].to_numpy(dtype=float)
	    neighbour_ratings = user_item_matrix.loc[
	        known['Similar User ID'].tolist(), candidates
	    ].to_numpy(dtype=float)
	    # Only real ratings (> 0) contribute to the weighted sum.
	    neighbour_ratings = np.where(neighbour_ratings > 0, neighbour_ratings, 0.0)
	    # The epsilon keeps the division defined when all similarities are zero.
	    similarity_total = similar_users['Similarity Score'].sum() + 1e-10
	    collab_scores = weights @ neighbour_ratings / similarity_total

	    # ---- Content score -------------------------------------------------------
	    rated_idx = [
	        movie_id_to_idx[movie_id]
	        for movie_id in user_rated_movies
	        if movie_id in movie_id_to_idx
	    ]
	    if user_rated_movies:
	        # Rated movies without metadata add nothing to the sum but still count
	        # in the divisor.
	        content_scores = (
	            content_similarity[np.ix_(candidate_idx, rated_idx)].sum(axis=1)
	            / len(user_rated_movies)
	        )
	    else:
	        content_scores = np.zeros(len(candidates))

	    # ---- Hybrid score --------------------------------------------------------
	    # Ratings are brought to 0-1 before blending, then scaled back to 0-5.
	    hybrid_scores = alpha * (collab_scores / 5.0) + (1 - alpha) * content_scores

	    metadata = movies_db.iloc[candidate_idx]
	    # Building the frame from columns keeps 'Estimated Score' present even with
	    # zero candidates, so the sort below cannot fail on a missing column.
	    result_df = pd.DataFrame({
	        'Movie Title': metadata['title'].to_numpy(),
	        'Genre': metadata['genres'].to_numpy(),
	        'Estimated Score': np.round(hybrid_scores * 5, 2),
	    }).sort_values('Estimated Score', ascending=False, kind='stable')

	    # Apply search filter if provided (exact match or partial)
	    if search_query and search_query.strip():
	        if search_query in movies_db['title'].values:
	            # Exact title, e.g. chosen from the autocomplete list.
	            result_df = result_df[result_df['Movie Title'] == search_query]
	        else:
	            # Literal substring match: titles and user input often contain
	            # regex metacharacters such as "(".
	            query = search_query.strip().lower()
	            result_df = result_df[
	                result_df['Movie Title'].str.lower().str.contains(
	                    query, na=False, regex=False
	                )
	            ]

	    return result_df.head(n).reset_index(drop=True)

	def get_recommendations(search_query=None):
	    """
	    Collect everything the UI shows for the target user.

	    Args:
	        search_query: Optional title filter passed on to get_top_movies().

	    Returns:
	        Tuple (user_history_df, similar_users_df, recommended_movies_df).
	        When the filter leaves no movie, the unfiltered top list is
	        returned instead.
	    """
	    neighbours = get_similar_users(TARGET_USER_ID, n=5)
	    picks = get_top_movies(TARGET_USER_ID, search_query=search_query, n=15)

	    # Never hand the UI an empty table if an unfiltered list is available.
	    if picks.shape[0] == 0:
	        picks = get_top_movies(TARGET_USER_ID, search_query=None, n=15)

	    return synthetic_user_history, neighbours, picks

	def add_movie_to_history(movie_title):
	    """
	    Append a catalogue movie to the demo user's history, rated 4.0.

	    Nothing happens when the title is blank, is not in movies_db, or is
	    already in the history. Otherwise the module-level
	    synthetic_user_history is replaced by an extended copy and the
	    recommendation models are rebuilt.

	    Args:
	        movie_title: Exact movie title as stored in movies_db.
	    """
	    global synthetic_user_history

	    title_is_blank = not movie_title or not movie_title.strip()
	    if title_is_blank:
	        return

	    # The title must exist in the catalogue.
	    matches = movies_db.loc[movies_db['title'] == movie_title]
	    if matches.shape[0] == 0:
	        return

	    # NOTE(review): duplicates are detected by title only; a movie whose
	    # movieId is already in the history under a different title would still
	    # be appended - confirm whether that is intended.
	    if movie_title in synthetic_user_history['title'].values:
	        return

	    chosen = matches.iloc[0]
	    addition = pd.DataFrame({
	        'userId': [TARGET_USER_ID],
	        'movieId': [chosen['movieId']],
	        'rating': [4.0],  # default rating for added movies
	        'timestamp': [1609459200],
	        'title': [chosen['title']],
	        'genres': [chosen['genres']],
	    })

	    synthetic_user_history = pd.concat(
	        [synthetic_user_history, addition], ignore_index=True
	    )

	    # The models must reflect the new rating.
	    rebuild_models()

	# ============================================================================
	# STEP 4: GRADIO INTERFACE
	# ============================================================================

	def update_recommendations(search_query):
	    """
	    Handle a dropdown value and refresh every table plus the status line.

	    * An exact catalogue title is added to the watch history (if it is not
	      already there) and the unfiltered recommendations are shown.
	    * Any other non-blank text filters the recommendations by title; when
	      nothing matches, the unfiltered list is shown with a warning.
	    * Blank or missing input shows the unfiltered recommendations.

	    Args:
	        search_query: Text selected or typed in the search dropdown.

	    Returns:
	        Tuple (user_history, similar_users, recommended_movies, status_msg).
	    """
	    has_query = bool(search_query and search_query.strip())

	    if has_query and search_query in all_movie_titles:
	        # add_movie_to_history() silently ignores movies that are already
	        # present, so compare the history size to report what really happened.
	        size_before = len(synthetic_user_history)
	        add_movie_to_history(search_query)
	        was_added = len(synthetic_user_history) > size_before

	        user_history, similar_users, recommended_movies = get_recommendations(None)
	        if was_added:
	            status_msg = f"✅ Added '{search_query}' to your watch history! Recommendations updated."
	        else:
	            status_msg = f"ℹ️ '{search_query}' is already in your watch history. Showing all recommendations."
	        return user_history, similar_users, recommended_movies, status_msg

	    if has_query:
	        # Run the filter directly: get_recommendations() falls back to the
	        # unfiltered list by itself, which would hide the "no match" case and
	        # produce a misleading status message.
	        filtered = get_top_movies(TARGET_USER_ID, search_query=search_query, n=15)
	        if len(filtered) == 0:
	            status_msg = f"⚠️ No recommendations found for '{search_query}'. Showing all recommendations."
	            user_history, similar_users, recommended_movies = get_recommendations(None)
	        else:
	            status_msg = f"🔍 Showing recommendations matching: {search_query}"
	            user_history = synthetic_user_history
	            similar_users = get_similar_users(TARGET_USER_ID, n=5)
	            recommended_movies = filtered
	        return user_history, similar_users, recommended_movies, status_msg

	    # No search query - show all recommendations
	    user_history, similar_users, recommended_movies = get_recommendations(None)
	    status_msg = "📋 Showing all recommendations"
	    return user_history, similar_users, recommended_movies, status_msg

	def reset_history():
	    """
	    Restore the seed watch history and refresh all outputs.

	    Replaces the module-level synthetic_user_history with a fresh copy of
	    initial_user_history, rebuilds the models, and returns
	    (user_history, similar_users, recommended_movies, status message).
	    """
	    global synthetic_user_history

	    synthetic_user_history = initial_user_history.copy()
	    rebuild_models()

	    return (*get_recommendations(None), "🔄 History reset to initial state")

	# Build the Gradio interface: watch history on top, then the search
	# dropdown with a reset button, then the two result tables.
	with gr.Blocks(title="Movie Recommendation System") as demo:

	    gr.Markdown(
	        f"""
	# 🎬 Movie Recommendation System
	### Personalized Recommendations for User #{TARGET_USER_ID}

	This system uses hybrid filtering (collaborative + content-based) to recommend movies.
	Select a movie from the dropdown to add it to your watch history and get updated recommendations!
	"""
	    )

	    # Section 1: User Profile
	    with gr.Row():
	        gr.Markdown("## 👤 Your Watch History")

	    with gr.Row():
	        user_history_table = gr.Dataframe(
	            value=synthetic_user_history,
	            label="Rated Movies",
	            interactive=False,
	            wrap=True
	        )

	    # Section 2: Search & Control
	    with gr.Row():
	        gr.Markdown("## 🔍 Search & Add Movies to History")

	    with gr.Row():
	        search_box = gr.Dropdown(
	            choices=all_movie_titles,
	            label="Search Movies (Autocomplete)",
	            info="Type to search, then select a movie to add it to your history and update recommendations",
	            allow_custom_value=True,
	            interactive=True
	        )
	        reset_btn = gr.Button("↺ Reset History", variant="secondary", scale=0)

	    with gr.Row():
	        status_text = gr.Markdown("📋 Showing all recommendations")

	    # Section 3: Results
	    gr.Markdown("## 📊 Recommendation Results")

	    with gr.Row():
	        # Table A: Similar Users
	        with gr.Column(scale=1):
	            gr.Markdown("### Similar Users")
	            gr.Markdown("Users with similar taste in movies")
	            similar_users_table = gr.Dataframe(
	                label="",
	                interactive=False,
	                wrap=True
	            )

	        # Table B: Recommended Movies
	        with gr.Column(scale=1):
	            gr.Markdown("### Recommended Movies")
	            gr.Markdown("Top picks based on your preferences")
	            recommended_movies_table = gr.Dataframe(
	                label="",
	                interactive=False,
	                wrap=True
	            )

	    # Fill the three tables when the page loads.
	    demo.load(
	        fn=get_recommendations,
	        inputs=[],
	        outputs=[user_history_table, similar_users_table, recommended_movies_table]
	    )

	    # Reset history functionality
	    reset_btn.click(
	        fn=reset_history,
	        inputs=[],
	        outputs=[user_history_table, similar_users_table, recommended_movies_table, status_text]
	    )

	    # Dropdown select event - runs when an entry is picked from the dropdown.
	    search_box.select(
	        fn=update_recommendations,
	        inputs=[search_box],
	        outputs=[user_history_table, similar_users_table, recommended_movies_table, status_text]
	    )

	    # The usage steps below describe only controls that exist in this layout
	    # (there is no separate "Add to History" button).
	    gr.Markdown(
	        """
	---
	Algorithm Details:
	- Collaborative Filtering: Finds users with similar rating patterns using cosine similarity
	- Content-Based Filtering: Matches movies by genre similarity using TF-IDF vectorization
	- Hybrid Score: Weighted combination (60% collaborative, 40% content-based)

	How to Use:
	1. View your watch history above (starts with 5 movies)
	2. Use the dropdown to search and select movies
	3. Select a movie from the dropdown to add it to your history (rated 4.0 by default)
	4. Watch your recommendations update in real-time based on your new preferences!
	5. Click "Reset History" to return to the original 5 movies

	Note: Adding movies to your history will immediately update your recommendations based on your expanded taste profile.
	"""
	    )

	# Launch the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch(debug=True)