# Hugging Face Space: prince4332 / RecommendationSystem (status: Sleeping)
# File size: 17,946 Bytes

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import gradio as gr

# ============================================================================
# STEP 1: DATA SIMULATION & USER SETUP
# ============================================================================

# ID of the demo user whose profile is edited through the UI. It is meant to be
# an ID that does not occur in the ratings data loaded further down.
TARGET_USER_ID = 9999

# Starting profile for the demo user: five highly rated movies, one row each.
# Every row carries the same fixed placeholder timestamp.
initial_user_history = pd.DataFrame(
    [
        (TARGET_USER_ID, 1, 5.0, 1609459200, 'Toy Story (1995)', 'Animation|Children|Comedy'),
        (TARGET_USER_ID, 2, 4.5, 1609459200, 'The Matrix (1999)', 'Action|Sci-Fi|Thriller'),
        (TARGET_USER_ID, 3, 5.0, 1609459200, 'Inception (2010)', 'Action|Sci-Fi|Thriller'),
        (TARGET_USER_ID, 4, 4.0, 1609459200, 'The Lion King (1994)', 'Animation|Children|Drama|Musical'),
        (TARGET_USER_ID, 5, 4.5, 1609459200, 'Interstellar (2014)', 'Adventure|Drama|Sci-Fi'),
    ],
    columns=['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres'],
)

# Mutable working copy of the profile; the pristine frame above is kept so the
# history can be reset later.
synthetic_user_history = initial_user_history.copy()

# Read the pre-sampled ratings export that sits next to this script.
combined_sampled_data = pd.read_csv('./sampled_movie_ratings_for_gradio.csv')

# Rating events as stored in the file, timestamp included.
raw_ratings_df = combined_sampled_data.loc[
    :, ['userId', 'movieId', 'rating', 'timestamp']
].copy()

# Movie metadata: keep the first row seen for each movieId so every movie
# appears exactly once.
full_movies_metadata_df = (
    combined_sampled_data
    .drop_duplicates(subset=['movieId'])
    .loc[:, ['movieId', 'title', 'genres']]
    .copy()
)
movies_db = full_movies_metadata_df.loc[:, ['movieId', 'title', 'genres']].copy()

# Base ratings table (no timestamp); simulated and synthetic ratings are added
# to this later.
ratings_df = raw_ratings_df.drop(columns=['timestamp'])

# Titles offered by the search dropdown.
all_movie_titles = movies_db['title'].to_list()

# Create a mock ratings dataset (simulate other users rating movies).
# The global NumPy seed is fixed so the simulated ratings are identical on
# every start of the app.
np.random.seed(42)
n_users = 100
n_movies = len(movies_db)  # Number of distinct movies in the metadata DB

# NOTE(review): the simulated users reuse userIds 1..n_users. If the CSV already
# contains those IDs, real and simulated ratings end up under the same user --
# confirm this is intended.
mock_ratings = []
for user_id in range(1, n_users + 1):
    # Each user rates 5-15 random movies. The count is capped at the catalogue
    # size because sampling without replacement cannot draw more items than
    # exist (np.random.choice would raise ValueError).
    n_ratings = min(np.random.randint(5, 16), n_movies)
    # Sample movieId from movies_db so every mock rating refers to a known movie
    movie_ids = np.random.choice(movies_db['movieId'].values, n_ratings, replace=False)
    ratings = np.random.uniform(2.5, 5.0, n_ratings)

    mock_ratings.extend(
        {'userId': user_id, 'movieId': movie_id, 'rating': round(rating, 1)}
        for movie_id, rating in zip(movie_ids, ratings)
    )

# Append mock ratings to the initial ratings_df
ratings_df = pd.concat([ratings_df, pd.DataFrame(mock_ratings)], ignore_index=True)

# Global movieId -> positional index mapping; filled in by rebuild_models()
movie_id_to_idx = {}

# Function to rebuild recommendation models based on current history
def rebuild_models():
    """
    Recompute every recommendation model from the current ratings.

    Reads the module-level ``ratings_df``, ``synthetic_user_history`` and
    ``movies_db`` and (re)assigns these module-level globals:

    - ``user_item_matrix``: users x movies rating table, 0 where unrated
    - ``user_similarity_df``: user-user cosine similarity, labelled by userId
    - ``content_similarity``: movie-movie cosine similarity of genre TF-IDF
      vectors, in the row order of ``movies_db``
    - ``movie_id_to_idx``: movieId -> row/column position in
      ``content_similarity``
    """
    global user_item_matrix, user_similarity_df, content_similarity, movie_id_to_idx

    # Add the synthetic user's ratings to everybody else's ratings
    all_ratings = pd.concat([
        ratings_df,
        synthetic_user_history[['userId', 'movieId', 'rating']]
    ], ignore_index=True)

    # ============================================================================
    # STEP 2: BUILD RECOMMENDATION MODELS
    # ============================================================================

    # User-item matrix for collaborative filtering. The pivot only needs
    # userId/movieId/rating, so no join with the movie metadata is required.
    # Unrated cells become 0.
    user_item_matrix = all_ratings.pivot_table(
        index='userId',
        columns='movieId',
        values='rating'
    ).fillna(0)

    # User-user similarity matrix
    user_similarity_df = pd.DataFrame(
        cosine_similarity(user_item_matrix),
        index=user_item_matrix.index,
        columns=user_item_matrix.index
    )

    # Content-based model: TF-IDF over the '|'-separated genre string, one
    # token per genre. Missing genres are treated as an empty string because
    # the vectorizer rejects NaN documents.
    tfidf = TfidfVectorizer(token_pattern=r'[^|]+')
    tfidf_matrix = tfidf.fit_transform(movies_db['genres'].fillna(''))
    content_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Map each movieId to its 0-based position in movies_db, which is also its
    # row/column in content_similarity
    movie_id_to_idx = {movie_id: idx for idx, movie_id in enumerate(movies_db['movieId'])}

# Initialize models
rebuild_models()

# ============================================================================
# STEP 3: RECOMMENDATION FUNCTIONS
# ============================================================================

def get_similar_users(user_id, n=5):
    """
    Get the top N users most similar to ``user_id``.

    Similarities are read from the module-level ``user_similarity_df``.

    Args:
        user_id: User to find neighbours for.
        n: Maximum number of neighbours to return.

    Returns:
        DataFrame with columns 'Similar User ID' and 'Similarity Score'
        (rounded to 4 decimals), most similar first. The frame is empty when
        ``user_id`` is not in the similarity matrix.
    """
    if user_id not in user_similarity_df.index:
        return pd.DataFrame(columns=['Similar User ID', 'Similarity Score'])

    # Remove the user explicitly instead of skipping the first sorted entry:
    # another user with an equal similarity could sort ahead of the user, which
    # would list the user as their own neighbour. The stable sort keeps the
    # order of tied users deterministic.
    neighbours = (
        user_similarity_df[user_id]
        .drop(labels=user_id)
        .sort_values(ascending=False, kind='stable')
        .head(max(n, 0))
    )

    return pd.DataFrame({
        'Similar User ID': neighbours.index,
        'Similarity Score': [round(score, 4) for score in neighbours.values]
    })

def get_top_movies(user_id, search_query=None, n=15, alpha=0.6):
    """
    Get top N recommended movies using hybrid filtering.

    Only movies the user has not rated (value 0 in ``user_item_matrix``) and
    that have metadata in ``movies_db`` are scored.

    Args:
        user_id: Target user ID
        search_query: Optional title filter. An exact title from ``movies_db``
            keeps only that movie; anything else is a case-insensitive
            literal substring match.
        n: Number of recommendations
        alpha: Weight for collaborative filtering (1-alpha for content-based)

    Returns DataFrame with Movie Title, Genre, and Estimated Score (0-5 scale),
    best first. The frame is empty when the user is unknown, nothing is left
    to recommend, or the filter matches nothing.
    """
    result_columns = ['Movie Title', 'Genre', 'Estimated Score']

    if user_id not in user_item_matrix.index:
        return pd.DataFrame(columns=result_columns)

    user_ratings = user_item_matrix.loc[user_id]
    rated_movie_ids = user_ratings[user_ratings > 0].index.tolist()

    # Metadata keyed by movieId, so lookups do not scan movies_db per movie
    movie_meta = movies_db.drop_duplicates(subset='movieId').set_index('movieId')
    candidate_ids = [
        movie_id
        for movie_id in user_ratings[user_ratings == 0].index
        if movie_id in movie_meta.index
    ]
    if not candidate_ids:
        # Sorting an empty, column-less frame by score would raise KeyError
        return pd.DataFrame(columns=result_columns)

    # ---- Collaborative score -------------------------------------------------
    # The neighbours do not depend on the candidate movie, so fetch them once.
    similar_users = get_similar_users(user_id, n=10)
    collab_scores = np.zeros(len(candidate_ids))

    if len(similar_users) > 0:
        # The weighted sum is divided by the similarity of *all* neighbours,
        # not only those who rated the movie, so movies rated by few
        # neighbours get a lower score.
        total_similarity = similar_users['Similarity Score'].astype(float).sum() + 1e-10

        in_matrix = similar_users['Similar User ID'].isin(user_item_matrix.index)
        neighbours = similar_users[in_matrix]
        weights = neighbours['Similarity Score'].to_numpy(dtype=float)
        neighbour_ratings = user_item_matrix.loc[
            neighbours['Similar User ID'].tolist(), candidate_ids
        ].to_numpy(dtype=float)
        # Only actual ratings (> 0) contribute
        neighbour_ratings = np.where(neighbour_ratings > 0, neighbour_ratings, 0.0)

        collab_scores = (weights @ neighbour_ratings) / total_similarity

    # ---- Content-based score -------------------------------------------------
    # Mean genre similarity between each candidate and the movies the user has
    # rated. Movies without a valid position in content_similarity add nothing
    # to the sum.
    n_indexed = content_similarity.shape[0]

    def _position(movie_id):
        """Return the movie's row in content_similarity, or -1 if unavailable."""
        pos = movie_id_to_idx.get(movie_id, -1)
        return pos if pos < n_indexed else -1

    content_scores = np.zeros(len(candidate_ids))
    rated_positions = [pos for pos in map(_position, rated_movie_ids) if pos >= 0]

    if rated_movie_ids and rated_positions:
        candidate_positions = np.array(
            [_position(movie_id) for movie_id in candidate_ids], dtype=int
        )
        known = candidate_positions >= 0
        if known.any():
            similarities = content_similarity[
                np.ix_(candidate_positions[known], rated_positions)
            ]
            # Divide by the number of rated movies, including any that have
            # no content vector.
            content_scores[known] = similarities.sum(axis=1) / len(rated_movie_ids)

    # ---- Hybrid score --------------------------------------------------------
    # Collaborative scores are on the rating scale and are brought to 0-1
    # before mixing; the blend is then scaled back to 0-5.
    hybrid_scores = alpha * (collab_scores / 5.0) + (1 - alpha) * content_scores

    candidate_meta = movie_meta.loc[candidate_ids]
    result_df = pd.DataFrame({
        'Movie Title': candidate_meta['title'].to_numpy(),
        'Genre': candidate_meta['genres'].to_numpy(),
        'Estimated Score': np.round(hybrid_scores * 5, 2)
    }).sort_values('Estimated Score', ascending=False, kind='stable')

    # Apply search filter if provided (exact match or partial)
    if search_query and search_query.strip():
        if search_query in movies_db['title'].values:
            # Exact title, e.g. picked from the autocomplete list
            result_df = result_df[result_df['Movie Title'] == search_query]
        else:
            # Literal substring match: titles contain characters such as
            # '(' that would be invalid or misread as a regular expression.
            needle = search_query.strip().lower()
            result_df = result_df[
                result_df['Movie Title'].str.lower().str.contains(
                    needle, regex=False, na=False
                )
            ]

    return result_df.head(n).reset_index(drop=True)

def get_recommendations(search_query=None):
    """
    Collect everything the UI shows for the target user.

    The movie list is first requested with ``search_query``; if that comes
    back empty it is requested again without any filter, so the caller gets
    the unfiltered list whenever the filter matches nothing.

    Returns: user_history_df, similar_users_df, recommended_movies_df
    """
    neighbours = get_similar_users(TARGET_USER_ID, n=5)

    # Try the filtered list first, then fall back to the unfiltered one
    for query in (search_query, None):
        picks = get_top_movies(TARGET_USER_ID, search_query=query, n=15)
        if len(picks) != 0:
            break

    return synthetic_user_history, neighbours, picks

def add_movie_to_history(movie_title, rating=4.0):
    """
    Add a movie to the target user's history and rebuild the models.

    Args:
        movie_title: Exact title as stored in ``movies_db``.
        rating: Rating recorded for the movie (default 4.0).

    Returns:
        True if a new history row was added and the models were rebuilt;
        False if the title is blank, unknown, or the movie is already in the
        history (nothing is changed in those cases).
    """
    global synthetic_user_history

    if not movie_title or not movie_title.strip():
        return False

    # Check if movie exists in database
    movie_match = movies_db[movies_db['title'] == movie_title]
    if len(movie_match) == 0:
        return False

    movie_info = movie_match.iloc[0]

    # The models identify a movie by movieId, so a second row for the same
    # movieId (even under a different title) would be merged into one cell by
    # the pivot in rebuild_models. Reject duplicates by title and by movieId.
    already_rated = (
        movie_title in synthetic_user_history['title'].values
        or movie_info['movieId'] in synthetic_user_history['movieId'].values
    )
    if already_rated:
        return False

    # New history entry; the timestamp is the same fixed placeholder value
    # used by the initial history rows.
    new_entry = pd.DataFrame({
        'userId': [TARGET_USER_ID],
        'movieId': [movie_info['movieId']],
        'rating': [rating],
        'timestamp': [1609459200],
        'title': [movie_info['title']],
        'genres': [movie_info['genres']]
    })

    synthetic_user_history = pd.concat([synthetic_user_history, new_entry], ignore_index=True)

    # The new rating changes the user's similarity to everyone else
    rebuild_models()
    return True

# ============================================================================
# STEP 4: GRADIO INTERFACE
# ============================================================================

def update_recommendations(search_query):
    """
    Handle a value coming from the search dropdown.

    - Blank/None: show the unfiltered recommendations.
    - Exact title from ``all_movie_titles``: add it to the watch history and
      show the refreshed, unfiltered recommendations.
    - Any other text: show recommendations whose title matches it, or the
      unfiltered list (with a warning) when nothing matches.

    Returns: user_history_df, similar_users_df, recommended_movies_df, status_msg
    """
    has_query = isinstance(search_query, str) and bool(search_query.strip())

    if not has_query:
        return (*get_recommendations(None), "📋 Showing all recommendations")

    if search_query in all_movie_titles:
        # add_movie_to_history leaves the history untouched when the movie is
        # already there, so compare sizes to report what really happened.
        size_before = len(synthetic_user_history)
        add_movie_to_history(search_query)

        if len(synthetic_user_history) > size_before:
            status_msg = f"✅ Added '{search_query}' to your watch history! Recommendations updated."
        else:
            status_msg = f"ℹ️ '{search_query}' is already in your watch history."

        return (*get_recommendations(None), status_msg)

    # Custom search term. Query get_top_movies directly: get_recommendations
    # silently falls back to the unfiltered list on a miss, which would make it
    # impossible to tell the user that nothing matched.
    matches = get_top_movies(TARGET_USER_ID, search_query=search_query, n=15)
    if len(matches) == 0:
        status_msg = f"⚠️ No recommendations found for '{search_query}'. Showing all recommendations."
        return (*get_recommendations(None), status_msg)

    status_msg = f"🔍 Showing recommendations matching: {search_query}"
    similar_users = get_similar_users(TARGET_USER_ID, n=5)
    return synthetic_user_history, similar_users, matches, status_msg

def reset_history():
    """
    Restore the initial watch history and refresh everything shown in the UI.

    Replaces the working history with a fresh copy of ``initial_user_history``,
    rebuilds the models and returns the unfiltered recommendations together
    with a status message.
    """
    global synthetic_user_history

    synthetic_user_history = initial_user_history.copy()
    rebuild_models()

    return (*get_recommendations(None), "🔄 History reset to initial state")

# Build the Gradio interface
# Page layout: watch history on top, search/reset controls in the middle and
# the two result tables (similar users, recommended movies) at the bottom.
with gr.Blocks(title="Movie Recommendation System") as demo:

    gr.Markdown(
        f"""
        # 🎬 Movie Recommendation System
        ### Personalized Recommendations for User #{TARGET_USER_ID}

        This system uses **hybrid filtering** (collaborative + content-based) to recommend movies.
        **Select a movie from the dropdown to add it to your watch history and get updated recommendations!**
        """
    )

    # Section 1: User Profile
    with gr.Row():
        gr.Markdown("## 👤 Your Watch History")

    with gr.Row():
        user_history_table = gr.Dataframe(
            value=synthetic_user_history,
            label="Rated Movies",
            interactive=False,
            wrap=True
        )

    # Section 2: Search & Control
    with gr.Row():
        gr.Markdown("## 🔍 Search & Add Movies to History")

    with gr.Row():
        search_box = gr.Dropdown(
            choices=all_movie_titles,
            label="Search Movies (Autocomplete)",
            info="Type to search, then select a movie to add it to your history and update recommendations",
            allow_custom_value=True,
            interactive=True
        )
        reset_btn = gr.Button("↺ Reset History", variant="secondary", scale=0)

    with gr.Row():
        status_text = gr.Markdown("📋 Showing all recommendations")

    # Section 3: Results
    gr.Markdown("## 📊 Recommendation Results")

    with gr.Row():
        # Table A: Similar Users
        with gr.Column(scale=1):
            gr.Markdown("### Similar Users")
            gr.Markdown("*Users with similar taste in movies*")
            similar_users_table = gr.Dataframe(
                label="",
                interactive=False,
                wrap=True
            )

        # Table B: Recommended Movies
        with gr.Column(scale=1):
            gr.Markdown("### Recommended Movies")
            gr.Markdown("*Top picks based on your preferences*")
            recommended_movies_table = gr.Dataframe(
                label="",
                interactive=False,
                wrap=True
            )

    # Fill the three tables when the page is opened
    demo.load(
        fn=get_recommendations,
        inputs=[],
        outputs=[user_history_table, similar_users_table, recommended_movies_table]
    )

    # Reset history functionality
    reset_btn.click(
        fn=reset_history,
        inputs=[],
        outputs=[user_history_table, similar_users_table, recommended_movies_table, status_text]
    )

    # Dropdown select event - the dropdown is the only way to add a movie,
    # there is no separate "add" button.
    search_box.select(
        fn=update_recommendations,
        inputs=[search_box],
        outputs=[user_history_table, similar_users_table, recommended_movies_table, status_text]
    )

    gr.Markdown(
        """
        ---
        **Algorithm Details:**
        - **Collaborative Filtering:** Finds users with similar rating patterns using cosine similarity
        - **Content-Based Filtering:** Matches movies by genre similarity using TF-IDF vectorization
        - **Hybrid Score:** Weighted combination (60% collaborative, 40% content-based)

        **How to Use:**
        1. View your watch history above (starts with 5 movies)
        2. Use the dropdown to search and select movies
        3. Select a movie from the dropdown to add it to your history (rated 4.0 by default)
        4. Watch your recommendations update in real-time based on your new preferences!
        5. Click "Reset History" to return to the original 5 movies

        **Note:** Adding movies to your history will immediately update your recommendations based on your expanded taste profile.
        """
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(debug=True)