shariarshaikat702 committed on
Commit
4431fbe
·
verified ·
1 Parent(s): d11c75d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +431 -0
app.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import html
from collections import Counter

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
9
+
10
# Module-level state shared by the loader, the trainer and the UI callbacks.
# The three dataset tables are assigned by load_datasets(); the three model
# tables and the models_loaded flag are assigned by train_models().
movies = None
ratings = None
users = None
train_user_item_matrix = None
user_similarity_df = None
svd_predicted_ratings = None
# Weight given to the collaborative-filtering ranking when recommend_movies()
# blends the two rankings; the SVD ranking gets 1 - alpha.
alpha = 0.6
models_loaded = False
19
+
20
def _read_table(path, min_columns, encodings, delimiters):
    """Return the first parse of *path* with at least *min_columns* columns, or None."""
    for enc in encodings:
        for delim in delimiters:
            try:
                frame = pd.read_csv(path, encoding=enc, sep=delim,
                                    engine='python', on_bad_lines='skip')
            except Exception:
                # Best effort by design: a wrong encoding/delimiter guess (or a
                # missing file) simply means the next combination is tried.
                continue
            if len(frame.columns) >= min_columns:
                return frame
    # Every combination failed or produced too few columns. Returning None
    # (instead of keeping the last under-sized parse) lets the caller detect it.
    return None


def load_datasets():
    """Load movies.csv, ratings.csv and users.csv into the module globals.

    Each file is tried with several encodings and delimiters; a parse is only
    accepted when it yields enough columns (2 for movies and users, 3 for
    ratings). Column names are stripped and lower-cased, and missing values in
    a ``genres`` column of movies are replaced with ``'Unknown'``.

    Returns:
        A status string: ``"Loaded: ..."`` with the row counts on success,
        ``"Failed to load datasets. Check file formats."`` when any file could
        not be parsed, or ``"Error: ..."`` for an unexpected exception.
    """
    global movies, ratings, users

    encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']
    delimiters = [',', '::', '\t', '|', ';']

    try:
        movies = _read_table('movies.csv', 2, encodings, delimiters)
        ratings = _read_table('ratings.csv', 3, encodings, delimiters)
        users = _read_table('users.csv', 2, encodings, delimiters)

        if movies is None or ratings is None or users is None:
            return "Failed to load datasets. Check file formats."

        # Normalize column names so later lookups ('userid', 'movieid', ...)
        # do not depend on the header's case or stray whitespace.
        for frame in (movies, ratings, users):
            frame.columns = frame.columns.str.strip().str.lower()

        if 'genres' in movies.columns:
            movies['genres'] = movies['genres'].fillna('Unknown')

        return f"Loaded: {len(movies)} movies, {len(ratings)} ratings, {len(users)} users"

    except Exception as e:
        return f"Error: {str(e)}"
80
+
81
def train_models():
    """Train the user-based CF and SVD models from the loaded ratings.

    For every user with at least 5 ratings, the first 80% of their ratings
    (ordered by ``timestamp`` when that column exists) form the training set.
    From it the function builds a user x movie matrix (unrated cells are 0), a
    user-user cosine-similarity table and a truncated-SVD reconstruction of
    the matrix, and publishes all three through the module globals.

    Returns:
        ``"Models trained successfully!"`` on success,
        ``"Please load datasets first!"`` when no data is loaded, or a message
        starting with ``"Error training models:"`` on failure.
    """
    global train_user_item_matrix, user_similarity_df, svd_predicted_ratings, models_loaded

    if movies is None or ratings is None:
        return "Please load datasets first!"

    # Whatever was trained before is stale once retraining starts; the flag is
    # only raised again after all three model tables have been replaced.
    models_loaded = False

    try:
        missing = {'userid', 'movieid', 'rating'} - set(ratings.columns)
        if missing:
            return ("Error training models: ratings data is missing column(s): "
                    + ", ".join(sorted(missing)))

        # Create train split in one pass instead of filtering the whole
        # ratings table once per user.
        if 'timestamp' in ratings.columns:
            ordered = ratings.sort_values('timestamp', kind='stable')
        else:
            ordered = ratings
        by_user = ordered.groupby('userid', sort=False)
        position = by_user.cumcount()
        n_ratings = by_user['rating'].transform('size')
        # floor(n * 0.8) equals int(n * 0.8) for these non-negative counts.
        in_train = (n_ratings >= 5) & (position < np.floor(n_ratings * 0.8))
        train_ratings = ordered[in_train]
        if train_ratings.empty:
            return "Error training models: no user has at least 5 ratings"

        # Create user-item matrix
        user_item = train_ratings.pivot_table(
            index='userid',
            columns='movieid',
            values='rating'
        ).fillna(0)

        # svds needs at least one factor.
        n_factors = min(100, min(user_item.shape) - 1)
        if n_factors < 1:
            return ("Error training models: need at least 2 users and 2 movies "
                    "in the training data")

        # Train User-Based CF
        similarity = pd.DataFrame(
            cosine_similarity(user_item),
            index=user_item.index,
            columns=user_item.index
        )

        # Train SVD on the matrix with each user's row mean removed.
        # NOTE(review): the row mean is taken over all cells, including the
        # zeros that stand for "unrated" - confirm this is intended.
        R = user_item.values
        user_ratings_mean = R.mean(axis=1, keepdims=True)
        U, sigma, Vt = svds(R - user_ratings_mean, k=n_factors)
        predicted = pd.DataFrame(
            (U * sigma) @ Vt + user_ratings_mean,
            index=user_item.index,
            columns=user_item.columns
        )

        # Publish everything together so readers never see a half-updated model.
        train_user_item_matrix = user_item
        user_similarity_df = similarity
        svd_predicted_ratings = predicted
        models_loaded = True
        return "Models trained successfully!"

    except Exception as e:
        return f"Error training models: {str(e)}"
138
+
139
def load_and_train():
    """Load the datasets, train the models and build the overview widgets.

    Returns:
        A ``(status, stats_html, figure)`` tuple. ``status`` joins the messages
        of load_datasets() and train_models(). ``stats_html`` and ``figure``
        (a bar chart of the rating distribution) are ``None`` when loading
        failed or the ratings table has no ``rating`` column.
    """
    msg1 = load_datasets()
    # Only a message that *starts* with the success prefix counts as success;
    # an error text that merely contains "Loaded:" must not.
    if not msg1.startswith("Loaded:"):
        return msg1, None, None

    msg2 = train_models()
    status = f"{msg1}\n{msg2}"

    # The statistics below need the 'rating' column; report its absence as a
    # message instead of raising KeyError into the UI.
    if 'rating' not in ratings.columns:
        return f"{status}\nCannot build statistics: ratings data has no 'rating' column.", None, None

    # Get dataset stats
    cards = [
        (f"{len(movies):,}", "Movies"),
        (f"{len(users):,}", "Users"),
        (f"{len(ratings):,}", "Ratings"),
        (f"{ratings['rating'].mean():.2f}", "Avg Rating"),
    ]
    cards_html = "".join(
        f"""
            <div style='background: white; padding: 15px; border-radius: 8px; text-align: center;'>
                <div style='font-size: 24px; font-weight: bold; color: #FF4B4B;'>{value}</div>
                <div style='color: #666; font-size: 14px;'>{label}</div>
            </div>"""
        for value, label in cards
    )
    stats_html = f"""
    <div style='background: #f0f2f6; padding: 20px; border-radius: 10px; margin: 10px 0;'>
        <h3 style='color: #FF4B4B; margin-bottom: 15px;'>Dataset Statistics</h3>
        <div style='display: grid; grid-template-columns: repeat(4, 1fr); gap: 15px;'>{cards_html}
        </div>
    </div>
    """

    # Create rating distribution chart
    rating_dist = ratings['rating'].value_counts().sort_index()
    fig = px.bar(x=rating_dist.index, y=rating_dist.values,
                 labels={'x': 'Rating', 'y': 'Count'},
                 title='Rating Distribution',
                 color=rating_dist.values,
                 color_continuous_scale='Viridis')

    return status, stats_html, fig
181
+
182
def recommend_movies(user_id, num_recommendations):
    """Generate hybrid movie recommendations and two charts for one user.

    Two candidate lists of ``2 * num_recommendations`` unwatched movies are
    built: one from the ratings of the 50 most similar users, weighted by
    similarity, and one from the SVD-predicted ratings. The lists are blended
    by rank position, with weight ``alpha`` for the first and ``1 - alpha``
    for the second.

    Args:
        user_id: User identifier; converted with ``int()``.
        num_recommendations: Number of movies to return; converted with ``int()``.

    Returns:
        A ``(html, rating_figure, genre_figure)`` tuple. On any failure the
        first element is a plain message and both figures are ``None``.
        ``genre_figure`` is also ``None`` when no genre data is available.
    """
    if not models_loaded:
        return "Please load and train models first!", None, None

    try:
        user_id = int(user_id)
        num_recommendations = int(num_recommendations)

        if user_id not in train_user_item_matrix.index:
            return f"User {user_id} not found in training data", None, None

        user_ratings = train_user_item_matrix.loc[user_id]
        watched_movies = user_ratings[user_ratings > 0].index
        pool_size = num_recommendations * 2

        # CF recommendations. Drop the user explicitly rather than slicing off
        # the first sorted entry: with a similarity tie the user is not
        # guaranteed to sort first.
        similar_users = (user_similarity_df[user_id]
                         .drop(user_id)
                         .sort_values(ascending=False)
                         .head(50))
        neighbour_ratings = (train_user_item_matrix.loc[similar_users.index]
                             .drop(columns=watched_movies))
        # Only positive ratings count as "rated", as in a per-cell check.
        neighbour_ratings = neighbour_ratings.where(neighbour_ratings > 0, 0)
        rated_by_neighbour = (neighbour_ratings > 0).any(axis=0)
        cf_scores = (neighbour_ratings.mul(similar_users, axis=0)
                     .sum(axis=0)[rated_by_neighbour])
        cf_movies = (cf_scores.sort_values(ascending=False, kind='stable')
                     .head(pool_size).index.tolist())

        # SVD recommendations
        svd_movies = (svd_predicted_ratings.loc[user_id]
                      .drop(watched_movies)
                      .sort_values(ascending=False)
                      .head(pool_size).index.tolist())

        # Combine: each list contributes (list length - position), scaled by
        # its weight, so earlier positions score higher.
        combined_scores = {}
        for weight, ranked in ((alpha, cf_movies), (1 - alpha, svd_movies)):
            for i, movie_id in enumerate(ranked):
                combined_scores[movie_id] = (combined_scores.get(movie_id, 0)
                                             + weight * (len(ranked) - i))

        movie_ids = sorted(combined_scores, key=combined_scores.get,
                           reverse=True)[:num_recommendations]

        # Get movie details. Ranks are assigned as entries are added so the
        # displayed numbering has no gaps when a movie is not in the catalogue.
        recommendations = []
        for movie_id in movie_ids:
            movie_info = movies[movies['movieid'] == movie_id]
            if movie_info.empty:
                continue
            row = movie_info.iloc[0]
            recommendations.append({
                'Rank': len(recommendations) + 1,
                'Title': row['title'],
                'Genres': row.get('genres', 'Unknown')
            })

        # Create HTML output; dataset text is escaped so characters such as
        # '&' or '<' in a title cannot break the markup.
        parts = [f"""
        <div style='background: #f8f9fa; padding: 20px; border-radius: 10px;'>
            <h2 style='color: #FF4B4B; margin-bottom: 20px;'>Top {num_recommendations} Recommendations for User {user_id}</h2>
        """]
        for rec in recommendations:
            title = html.escape(str(rec['Title']))
            genres = html.escape(str(rec['Genres']))
            parts.append(f"""
            <div style='background: white; padding: 15px; margin: 10px 0; border-radius: 8px; border-left: 4px solid #FF4B4B;'>
                <h3 style='color: #1f1f1f; margin: 0 0 10px 0;'>{rec['Rank']}. {title}</h3>
                <p style='color: #666; margin: 0;'><strong>Genres:</strong> {genres}</p>
            </div>
            """)
        parts.append("</div>")
        html_output = "".join(parts)

        # Create visualizations
        user_ratings_data = ratings[ratings['userid'] == user_id]

        # Rating distribution
        rating_dist = user_ratings_data['rating'].value_counts().sort_index()
        fig1 = px.bar(x=rating_dist.index, y=rating_dist.values,
                      labels={'x': 'Rating', 'y': 'Count'},
                      title=f'User {user_id} Rating Distribution',
                      color=rating_dist.values,
                      color_continuous_scale='Blues')

        # Genre preferences; skipped (not an error) when movies has no
        # 'genres' column, matching the optional lookup used for the details.
        fig2 = None
        if 'genres' in movies.columns:
            user_movies = user_ratings_data.merge(movies[['movieid', 'genres']], on='movieid')
            genre_counts = Counter(
                genre
                for genres in user_movies['genres']
                if pd.notna(genres) and genres != 'Unknown'
                for genre in genres.split('|')
            )
            if genre_counts:
                top_genres = dict(genre_counts.most_common(8))
                fig2 = px.pie(values=list(top_genres.values()), names=list(top_genres.keys()),
                              title=f'User {user_id} Genre Preferences',
                              color_discrete_sequence=px.colors.qualitative.Set3)

        return html_output, fig1, fig2

    except Exception as e:
        return f"Error: {str(e)}", None, None
287
+
288
def get_dataset_insights():
    """Generate dataset-wide insights: genre popularity and user activity.

    Returns:
        A ``(stats_html, genre_figure, activity_figure)`` tuple. When the
        datasets are not loaded, or an error occurs, the first element is a
        plain message and both figures are ``None``. ``genre_figure`` is also
        ``None`` when the movies table carries no usable genre data.
    """
    if movies is None or ratings is None:
        return "Please load datasets first!", None, None

    try:
        # Genre analysis; a movies table without a 'genres' column simply
        # yields no genre data instead of raising KeyError.
        genre_counts = Counter()
        if 'genres' in movies.columns:
            genre_counts.update(
                genre
                for genres in movies['genres']
                if pd.notna(genres) and genres != 'Unknown'
                for genre in genres.split('|')
            )
        top_genres = dict(genre_counts.most_common(15))

        fig1 = None
        if top_genres:
            fig1 = px.bar(x=list(top_genres.values()), y=list(top_genres.keys()),
                          orientation='h',
                          labels={'x': 'Number of Movies', 'y': 'Genre'},
                          title='Top 15 Genres by Movie Count',
                          color=list(top_genres.values()),
                          color_continuous_scale='Teal')

        # User activity
        user_activity = ratings.groupby('userid').size()
        fig2 = px.histogram(user_activity, nbins=50,
                            labels={'value': 'Number of Ratings', 'count': 'Number of Users'},
                            title='User Activity Distribution',
                            color_discrete_sequence=['coral'])

        # most_common() orders by count, so the first key is the top genre;
        # fall back to a placeholder when there is no genre data at all.
        most_popular = html.escape(str(next(iter(top_genres), "N/A")))

        stats = f"""
        <div style='background: #f0f2f6; padding: 20px; border-radius: 10px;'>
            <h3 style='color: #FF4B4B;'>Insights</h3>
            <p><strong>Most Popular Genre:</strong> {most_popular}</p>
            <p><strong>Average User Activity:</strong> {user_activity.mean():.1f} ratings</p>
            <p><strong>Most Active User:</strong> {user_activity.max()} ratings</p>
            <p><strong>Total Unique Movies Rated:</strong> {ratings['movieid'].nunique()}</p>
        </div>
        """

        return stats, fig1, fig2

    except Exception as e:
        # Same reporting style as the other UI callbacks in this file.
        return f"Error: {str(e)}", None, None
327
+
328
# Create Gradio Interface: four tabs (setup, recommendations, insights, about)
# whose buttons are wired to the callbacks defined above.
with gr.Blocks(title="DataSynthis Movie Recommender", theme=gr.themes.Soft()) as app:

    gr.Markdown("""
    # DataSynthis Movie Recommendation System
    ### Powered by Hybrid Collaborative Filtering & Matrix Factorization
    """)

    with gr.Tabs():

        # Tab 1: Setup
        with gr.Tab("Setup & Load Data"):
            gr.Markdown("### Step 1: Load Datasets and Train Models")
            gr.Markdown("Click the button below to load your CSV files and train the recommendation models.")

            load_btn = gr.Button("Load Datasets & Train Models", variant="primary", size="lg")
            status_output = gr.Textbox(label="Status", lines=2)
            stats_output = gr.HTML(label="Dataset Statistics")
            chart_output = gr.Plot(label="Rating Distribution")

            load_btn.click(
                fn=load_and_train,
                outputs=[status_output, stats_output, chart_output]
            )

        # Tab 2: Recommendations
        with gr.Tab("Get Recommendations"):
            gr.Markdown("### Generate Personalized Movie Recommendations")

            with gr.Row():
                with gr.Column(scale=2):
                    user_id_input = gr.Number(label="Enter User ID", value=1, precision=0)
                with gr.Column(scale=1):
                    num_recs_input = gr.Slider(minimum=5, maximum=20, value=10, step=1,
                                               label="Number of Recommendations")

            recommend_btn = gr.Button("Generate Recommendations", variant="primary", size="lg")

            recommendations_output = gr.HTML(label="Recommendations")

            with gr.Row():
                rating_chart = gr.Plot(label="User Rating Distribution")
                genre_chart = gr.Plot(label="Genre Preferences")

            recommend_btn.click(
                fn=recommend_movies,
                inputs=[user_id_input, num_recs_input],
                outputs=[recommendations_output, rating_chart, genre_chart]
            )

        # Tab 3: Insights
        with gr.Tab("Dataset Insights"):
            gr.Markdown("### Explore Dataset Analytics")

            insights_btn = gr.Button("Generate Insights", variant="primary")
            insights_stats = gr.HTML(label="Statistics")

            with gr.Row():
                genre_plot = gr.Plot(label="Popular Genres")
                activity_plot = gr.Plot(label="User Activity")

            insights_btn.click(
                fn=get_dataset_insights,
                outputs=[insights_stats, genre_plot, activity_plot]
            )

        # Tab 4: About
        with gr.Tab("About"):
            # The ensemble weights are formatted from `alpha` so this text
            # cannot drift from the value the recommender actually uses.
            gr.Markdown(f"""
            ## DataSynthis Movie Recommendation System

            This intelligent recommendation system uses advanced machine learning algorithms to provide
            personalized movie suggestions based on user preferences and viewing history.

            ### Features:
            - **Hybrid Approach**: Combines User-Based Collaborative Filtering and SVD Matrix Factorization
            - **High Accuracy**: Trained on comprehensive movie rating datasets
            - **Real-Time Predictions**: Instant recommendations for any user
            - **Interactive Visualizations**: Understand user behavior and preferences

            ### Algorithms Used:
            1. **User-Based Collaborative Filtering**: Finds similar users and recommends movies they enjoyed
            2. **SVD Matrix Factorization**: Discovers latent patterns in rating data
            3. **Hybrid Ensemble**: Weighted combination ({alpha:.0%} CF, {1 - alpha:.0%} SVD) for optimal results

            ### Technology Stack:
            - Python, Gradio, Scikit-learn, Pandas, NumPy, Plotly

            ---

            **Developed for DataSynthis ML Job Task**
            """)

    gr.Markdown("""
    ---
    <div style='text-align: center; color: #666;'>
        <p>DataSynthis Movie Recommendation System | Deployed on Hugging Face Spaces</p>
        <p>Built with Gradio</p>
    </div>
    """)

# Launch the app
if __name__ == "__main__":
    app.launch()