File size: 17,946 Bytes
d3e0558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e356b06
d3e0558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e356b06
 
 
d3e0558
 
e356b06
d3e0558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e356b06
 
 
d3e0558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e356b06
 
d3e0558
 
 
e356b06
 
 
 
 
 
 
 
 
d3e0558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c88e583
 
d3e0558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c88e583
 
 
 
 
d3e0558
 
c88e583
 
 
 
 
d3e0558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import gradio as gr

# ============================================================================
# STEP 1: DATA SIMULATION & USER SETUP
# ============================================================================

# Identifier of the synthetic demo user whose history the UI edits.
# NOTE(review): assumed not to occur as a userId in the CSV below - confirm.
TARGET_USER_ID = 9999

# Seed watch history for the demo user: five highly rated movies, one row each.
# NOTE(review): titles/genres are hard-coded here while the models key on
# movieId; confirm that ids 1-5 carry these same titles in the CSV.
initial_user_history = pd.DataFrame(
    [
        (TARGET_USER_ID, 1, 5.0, 1609459200, 'Toy Story (1995)', 'Animation|Children|Comedy'),
        (TARGET_USER_ID, 2, 4.5, 1609459200, 'The Matrix (1999)', 'Action|Sci-Fi|Thriller'),
        (TARGET_USER_ID, 3, 5.0, 1609459200, 'Inception (2010)', 'Action|Sci-Fi|Thriller'),
        (TARGET_USER_ID, 4, 4.0, 1609459200, 'The Lion King (1994)', 'Animation|Children|Drama|Musical'),
        (TARGET_USER_ID, 5, 4.5, 1609459200, 'Interstellar (2014)', 'Adventure|Drama|Sci-Fi'),
    ],
    columns=['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres'],
)

# Mutable working copy; add_movie_to_history() and reset_history() rebind it.
synthetic_user_history = initial_user_history.copy()

# One CSV holds both the ratings and the movie metadata.
combined_sampled_data = pd.read_csv('./sampled_movie_ratings_for_gradio.csv')

# Ratings view of the combined table.
raw_ratings_df = combined_sampled_data.loc[
    :, ['userId', 'movieId', 'rating', 'timestamp']
].copy()

# Metadata view: keep a single row per movieId.
full_movies_metadata_df = (
    combined_sampled_data.loc[:, ['movieId', 'title', 'genres']]
    .drop_duplicates(subset=['movieId'])
    .copy()
)
movies_db = full_movies_metadata_df.loc[:, ['movieId', 'title', 'genres']].copy()

# Base rating table; mock users are appended further down and the synthetic
# user's history is merged in by rebuild_models().
ratings_df = raw_ratings_df.loc[:, ['userId', 'movieId', 'rating']].copy()

# Titles offered by the search dropdown.
all_movie_titles = movies_db['title'].to_list()

# Create a mock ratings dataset (simulate other users rating movies).
# The global NumPy RNG is seeded so the mock ratings are reproducible.
# NOTE(review): mock user ids 1..n_users may coincide with userIds already in
# the CSV; duplicate (userId, movieId) pairs are averaged by the pivot in
# rebuild_models() - confirm this overlap is intended.
np.random.seed(42)
n_users = 100
n_movies = len(movies_db)  # number of unique movies in the metadata table

# The candidate pool is identical for every mock user, so extract it once.
candidate_movie_ids = movies_db['movieId'].values

mock_ratings = []
for user_id in range(1, n_users + 1):
    # Each user rates 5-15 random movies. The count is capped at the catalogue
    # size because sampling without replacement cannot draw more items than
    # exist (np.random.choice raises ValueError otherwise).
    n_ratings = min(np.random.randint(5, 16), n_movies)
    # Sample movieIds from movies_db so every mock rating has metadata.
    movie_ids = np.random.choice(candidate_movie_ids, n_ratings, replace=False)
    ratings = np.random.uniform(2.5, 5.0, n_ratings)

    for movie_id, rating in zip(movie_ids, ratings):
        mock_ratings.append({
            'userId': user_id,
            'movieId': movie_id,
            'rating': round(rating, 1)
        })

# Append mock ratings to the ratings loaded from the CSV.
ratings_df = pd.concat([ratings_df, pd.DataFrame(mock_ratings)], ignore_index=True)

# movieId -> positional row index in movies_db; filled in by rebuild_models().
movie_id_to_idx = {}

# Recompute every model artefact from the current ratings and watch history.
def rebuild_models():
    """
    Rebuild the module-level recommendation artefacts.

    Rebinds four globals:
        user_item_matrix: users x movies rating table, 0 where unrated.
        user_similarity_df: cosine similarity between the rows of that table,
            labelled by userId on both axes.
        content_similarity: cosine similarity between TF-IDF genre vectors,
            in movies_db row order.
        movie_id_to_idx: movieId -> row position in movies_db, used to index
            content_similarity.
    """
    global user_item_matrix, user_similarity_df, content_similarity, movie_id_to_idx

    # Stack the synthetic user's history under the base ratings.
    synthetic_rows = synthetic_user_history[['userId', 'movieId', 'rating']]
    all_ratings = pd.concat([ratings_df, synthetic_rows], ignore_index=True)

    # Left join keeps every rating even when a movie has no metadata row.
    full_data = all_ratings.merge(movies_db, on='movieId', how='left')

    # ============================================================================
    # STEP 2: BUILD RECOMMENDATION MODELS
    # ============================================================================

    # Collaborative filtering input: one row per user, one column per movie.
    user_item_matrix = full_data.pivot_table(
        values='rating',
        index='userId',
        columns='movieId',
    ).fillna(0)

    # User-user cosine similarity, labelled with the matrix's own user ids.
    user_ids = user_item_matrix.index
    user_similarity_df = pd.DataFrame(
        cosine_similarity(user_item_matrix),
        index=user_ids,
        columns=user_ids,
    )

    # Content-based side: genres are '|'-separated, so every run of non-pipe
    # characters is one token.
    genre_vectorizer = TfidfVectorizer(token_pattern=r'[^|]+')
    genre_matrix = genre_vectorizer.fit_transform(movies_db['genres'])
    content_similarity = cosine_similarity(genre_matrix, genre_matrix)

    # Positional lookup into content_similarity (0-based movies_db row order).
    movie_id_to_idx = dict(zip(movies_db['movieId'], range(len(movies_db))))

# Initialize models
# Runs once at import time so that user_item_matrix, user_similarity_df,
# content_similarity and movie_id_to_idx exist before any recommendation
# function below reads them.
rebuild_models()

# ============================================================================
# STEP 3: RECOMMENDATION FUNCTIONS
# ============================================================================

def get_similar_users(user_id, n=5):
    """
    Get the top N most similar users based on collaborative filtering.

    Args:
        user_id: User whose neighbours are wanted.
        n: Maximum number of neighbours to return.

    Returns:
        DataFrame with columns 'Similar User ID' and 'Similarity Score'
        (rounded to 4 decimals), ordered by descending similarity. Empty
        (columns only) when user_id is not in user_similarity_df.
    """
    if user_id not in user_similarity_df.index:
        return pd.DataFrame(columns=['Similar User ID', 'Similarity Score'])

    # Remove the user's own entry by label instead of assuming it sorts first:
    # another user can tie at 1.0, and a user with an all-zero rating row may
    # not rank first against themselves, so positional slicing could return
    # the user as their own neighbour.
    neighbours = (
        user_similarity_df[user_id]
        .drop(index=user_id)
        .sort_values(ascending=False)
        .head(n)
    )

    return pd.DataFrame({
        'Similar User ID': neighbours.index,
        'Similarity Score': [round(score, 4) for score in neighbours.values]
    })

def get_top_movies(user_id, search_query=None, n=15, alpha=0.6):
    """
    Get top N recommended movies using hybrid filtering.

    For every movie the user has not rated, a collaborative score (the
    similarity-weighted ratings of the 10 nearest users, divided by the sum of
    all ten similarities) and a content score (mean genre similarity to the
    movies the user has rated) are blended as
    ``alpha * collab / 5 + (1 - alpha) * content`` and scaled to 0-5.

    Args:
        user_id: Target user ID
        search_query: Optional title filter. An exact title from movies_db
            selects that movie only; anything else is a case-insensitive
            literal substring match.
        n: Number of recommendations
        alpha: Weight for collaborative filtering (1-alpha for content-based)

    Returns:
        DataFrame with 'Movie Title', 'Genre' and 'Estimated Score', sorted by
        descending score. Empty (columns only) when the user is unknown or
        nothing can be scored.
    """
    result_columns = ['Movie Title', 'Genre', 'Estimated Score']

    if user_id not in user_item_matrix.index:
        return pd.DataFrame(columns=result_columns)

    # Movies the user hasn't rated are the recommendation candidates.
    user_ratings = user_item_matrix.loc[user_id]
    unrated_movies = user_ratings[user_ratings == 0].index.tolist()
    if not unrated_movies:
        # Nothing left to recommend; return the empty shape instead of
        # failing later when sorting a frame that has no columns.
        return pd.DataFrame(columns=result_columns)

    # ---- Loop-invariant inputs, computed once rather than per candidate ----
    similar_users = get_similar_users(user_id, n=10)
    neighbours = [
        (sim_user_id, similarity)
        for sim_user_id, similarity in zip(
            similar_users['Similar User ID'], similar_users['Similarity Score']
        )
        if sim_user_id in user_item_matrix.index
    ]
    # Normaliser over all neighbours; the epsilon avoids division by zero.
    similarity_total = similar_users['Similarity Score'].sum() + 1e-10

    user_rated_movies = user_ratings[user_ratings > 0].index.tolist()
    # (movieId, position in content_similarity) for rated movies with metadata.
    rated_positions = [
        (rated_movie_id, movie_id_to_idx[rated_movie_id])
        for rated_movie_id in user_rated_movies
        if rated_movie_id in movie_id_to_idx
    ]
    n_rows, n_cols = content_similarity.shape

    # movieId -> (title, genres); first row wins if an id is duplicated.
    catalogue = movies_db.drop_duplicates(subset='movieId')
    movie_meta = {
        movie_id: (title, genres)
        for movie_id, title, genres in zip(
            catalogue['movieId'], catalogue['title'], catalogue['genres']
        )
    }

    movie_scores = []
    for movie_id in unrated_movies:
        # Movies without metadata cannot be displayed, so skip them early.
        if movie_id not in movie_meta:
            continue

        # Collaborative filtering score
        collab_score = 0
        for sim_user_id, similarity in neighbours:
            rating = user_item_matrix.at[sim_user_id, movie_id]
            if rating > 0:
                collab_score += similarity * rating
        collab_score = collab_score / similarity_total

        # Content-based score
        content_score = 0
        movie_idx = movie_id_to_idx.get(movie_id)
        if user_rated_movies and movie_idx is not None:
            for rated_movie_id, rated_idx in rated_positions:
                if rated_idx < n_cols and movie_idx < n_rows:
                    content_score += content_similarity[movie_idx, rated_idx]
                else:
                    # Only possible if movie_id_to_idx and content_similarity
                    # were built from different snapshots of movies_db.
                    print(f"Warning: Content similarity index out of bounds for movie_id={movie_id} or rated_movie_id={rated_movie_id}")
            content_score = content_score / len(user_rated_movies)

        # Hybrid score (collaborative part brought to 0-1 before blending)
        hybrid_score = alpha * (collab_score / 5.0) + (1 - alpha) * content_score

        title, genres = movie_meta[movie_id]
        movie_scores.append({
            'Movie Title': title,
            'Genre': genres,
            'Estimated Score': round(hybrid_score * 5, 2)  # Scale to 0-5 range
        })

    # Passing the columns explicitly keeps the sort valid even when no
    # candidate was scored.
    result_df = pd.DataFrame(movie_scores, columns=result_columns).sort_values(
        'Estimated Score', ascending=False
    )

    # Apply search filter if provided (exact match or partial)
    if search_query and search_query.strip():
        if search_query in movies_db['title'].values:
            # Exact title, e.g. picked from the autocomplete list.
            result_df = result_df[result_df['Movie Title'] == search_query]
        else:
            # Literal substring match: titles contain regex metacharacters
            # such as '(' and ')', so the query must not be read as a pattern.
            query = search_query.strip().lower()
            result_df = result_df[
                result_df['Movie Title'].str.lower().str.contains(
                    query, na=False, regex=False
                )
            ]

    return result_df.head(n).reset_index(drop=True)

def get_recommendations(search_query=None):
    """
    Collect everything the UI shows for TARGET_USER_ID.

    Args:
        search_query: Optional title filter forwarded to get_top_movies().

    Returns:
        Tuple (user_history_df, similar_users_df, recommended_movies_df). When
        the filtered list comes back empty, the unfiltered top 15 is returned
        in its place.
    """
    neighbours = get_similar_users(TARGET_USER_ID, n=5)
    picks = get_top_movies(TARGET_USER_ID, search_query=search_query, n=15)

    # Never hand the UI an empty table: drop the filter and try again.
    if not len(picks):
        picks = get_top_movies(TARGET_USER_ID, search_query=None, n=15)

    return synthetic_user_history, neighbours, picks

def add_movie_to_history(movie_title):
    """
    Add a movie to the synthetic user's history with a default rating of 4.0
    and rebuild the recommendation models.

    Does nothing when the title is empty, is not an exact title in movies_db,
    or the movie is already in the history (by title or by movieId).

    Args:
        movie_title: Exact movie title as stored in movies_db.

    Returns:
        None. On success the module-level synthetic_user_history is rebound to
        a new frame with one extra row.
    """
    global synthetic_user_history

    if not movie_title or not movie_title.strip():
        return

    # Check if movie exists in database
    movie_match = movies_db[movies_db['title'] == movie_title]
    if movie_match.empty:
        return
    movie_info = movie_match.iloc[0]

    # The models key on movieId, so a second row for an id already in the
    # history would be averaged with the existing rating by the pivot. Guard
    # on the id as well as on the displayed title.
    already_by_title = movie_title in synthetic_user_history['title'].values
    already_by_id = movie_info['movieId'] in synthetic_user_history['movieId'].values
    if already_by_title or already_by_id:
        return

    # New history entry: default rating 4.0 and the same fixed timestamp the
    # seed history uses.
    new_entry = pd.DataFrame({
        'userId': [TARGET_USER_ID],
        'movieId': [movie_info['movieId']],
        'rating': [4.0],
        'timestamp': [1609459200],
        'title': [movie_info['title']],
        'genres': [movie_info['genres']]
    })

    # Add to history
    synthetic_user_history = pd.concat([synthetic_user_history, new_entry], ignore_index=True)

    # Similarities depend on the history, so recompute them.
    rebuild_models()

# ============================================================================
# STEP 4: GRADIO INTERFACE
# ============================================================================

def update_recommendations(search_query):
    """
    Handle a dropdown selection or custom search and refresh all tables.

    - Exact title from all_movie_titles: try to add it to the watch history,
      then show unfiltered recommendations.
    - Any other non-blank text: show recommendations whose title matches it,
      or the unfiltered list when nothing matches.
    - Blank / None: show the unfiltered list.

    Args:
        search_query: Text from the search dropdown (may be None).

    Returns:
        Tuple (user_history, similar_users, recommended_movies, status_msg).
    """
    has_query = bool(search_query and search_query.strip())

    if has_query and search_query in all_movie_titles:
        # add_movie_to_history() returns nothing, so detect whether a row was
        # really added by comparing the history size before and after.
        history_size_before = len(synthetic_user_history)
        add_movie_to_history(search_query)
        if len(synthetic_user_history) > history_size_before:
            status_msg = f"✅ Added '{search_query}' to your watch history! Recommendations updated."
        else:
            status_msg = f"ℹ️ '{search_query}' is already in your watch history."
        user_history, similar_users, recommended_movies = get_recommendations(None)
    elif has_query:
        # Query get_top_movies() directly: get_recommendations() silently
        # falls back to the unfiltered list, which would hide the fact that
        # nothing matched and make the status message wrong.
        matches = get_top_movies(TARGET_USER_ID, search_query=search_query, n=15)
        if len(matches) == 0:
            status_msg = f"⚠️ No recommendations found for '{search_query}'. Showing all recommendations."
            user_history, similar_users, recommended_movies = get_recommendations(None)
        else:
            status_msg = f"🔍 Showing recommendations matching: {search_query}"
            user_history = synthetic_user_history
            similar_users = get_similar_users(TARGET_USER_ID, n=5)
            recommended_movies = matches
    else:
        # No search query - show all recommendations
        user_history, similar_users, recommended_movies = get_recommendations(None)
        status_msg = "📋 Showing all recommendations"

    return user_history, similar_users, recommended_movies, status_msg

def reset_history():
    """
    Reset the user's history to the initial seed and refresh all tables.

    Rebinds the module-level synthetic_user_history to a fresh copy of
    initial_user_history, rebuilds the models, and recomputes the unfiltered
    recommendations.

    Returns:
        Tuple (user_history, similar_users, recommended_movies, status_msg).
    """
    global synthetic_user_history
    synthetic_user_history = initial_user_history.copy()
    rebuild_models()
    user_history, similar_users, recommended_movies = get_recommendations(None)
    # The status emoji was stored as mojibake; restored to the intended glyph.
    return user_history, similar_users, recommended_movies, "🔄 History reset to initial state"

# Build the Gradio interface: watch history on top, a search dropdown with a
# reset button, then the similar-users and recommendations tables side by side.
with gr.Blocks(title="Movie Recommendation System") as demo:

    gr.Markdown(
        f"""
        # 🎬 Movie Recommendation System
        ### Personalized Recommendations for User #{TARGET_USER_ID}

        This system uses **hybrid filtering** (collaborative + content-based) to recommend movies.
        **Select a movie from the dropdown to add it to your watch history and get updated recommendations!**
        """
    )

    # Section 1: User Profile
    with gr.Row():
        gr.Markdown("## 👀 Your Watch History")

    with gr.Row():
        user_history_table = gr.Dataframe(
            value=synthetic_user_history,
            label="Rated Movies",
            interactive=False,
            wrap=True
        )

    # Section 2: Search & Control
    with gr.Row():
        gr.Markdown("## 🔍 Search & Add Movies to History")

    with gr.Row():
        search_box = gr.Dropdown(
            choices=all_movie_titles,
            label="Search Movies (Autocomplete)",
            info="Type to search, then select a movie to add it to your history and update recommendations",
            allow_custom_value=True,
            interactive=True
        )
        reset_btn = gr.Button("↺ Reset History", variant="secondary", scale=0)

    with gr.Row():
        status_text = gr.Markdown("📋 Showing all recommendations")

    # Section 3: Results
    gr.Markdown("## 📊 Recommendation Results")

    with gr.Row():
        # Table A: Similar Users
        with gr.Column(scale=1):
            gr.Markdown("### Similar Users")
            gr.Markdown("*Users with similar taste in movies*")
            similar_users_table = gr.Dataframe(
                label="",
                interactive=False,
                wrap=True
            )

        # Table B: Recommended Movies
        with gr.Column(scale=1):
            gr.Markdown("### Recommended Movies")
            gr.Markdown("*Top picks based on your preferences*")
            recommended_movies_table = gr.Dataframe(
                label="",
                interactive=False,
                wrap=True
            )

    # Load initial data (get_recommendations is called with its default
    # search_query=None, so it fills the three tables unfiltered).
    demo.load(
        fn=get_recommendations,
        inputs=[],
        outputs=[user_history_table, similar_users_table, recommended_movies_table]
    )

    # Reset history functionality
    reset_btn.click(
        fn=reset_history,
        inputs=[],
        outputs=[user_history_table, similar_users_table, recommended_movies_table, status_text]
    )

    # Dropdown select event - fires when an entry is chosen from the dropdown
    search_box.select(
        fn=update_recommendations,
        inputs=[search_box],
        outputs=[user_history_table, similar_users_table, recommended_movies_table, status_text]
    )

    gr.Markdown(
        """
        ---
        **Algorithm Details:**
        - **Collaborative Filtering:** Finds users with similar rating patterns using cosine similarity
        - **Content-Based Filtering:** Matches movies by genre similarity using TF-IDF vectorization
        - **Hybrid Score:** Weighted combination (60% collaborative, 40% content-based)

        **How to Use:**
        1. View your watch history above (starts with 5 movies)
        2. Use the dropdown to search for movies
        3. Select a movie from the dropdown to add it to your history (rated 4.0 by default)
        4. Watch your recommendations update in real-time based on your new preferences!
        5. Click "Reset History" to return to the original 5 movies

        **Note:** Adding movies to your history will immediately update your recommendations based on your expanded taste profile.
        """
    )

# Launch the app
# Guarded so the server starts only when this file is run as a script, not
# when the module is imported.
if __name__ == "__main__":
    demo.launch(debug=True)