prasanthr0416 committed on
Commit
e67e2ef
·
verified ·
1 Parent(s): 5c15010

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +364 -0
app.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
8
+
9
# Database connection for the Hugging Face deployment.
# Host, port and schema are fixed; only the credentials are secret.
host = "gateway01.eu-central-1.prod.aws.tidbcloud.com"
port = 4000
database = "grab"

# Credentials are read from Streamlit secrets so they never live in the repo.
user = st.secrets["TIDB_USER"]
password = st.secrets["TIDB_PASSWORD"]

# Build the URL from its parts rather than interpolating into a string:
# URL.create() escapes each component, so a password containing characters
# such as "@", ":" or "/" can no longer corrupt the connection string.
connection_url = URL.create(
    drivername="mysql+pymysql",
    username=user,
    password=password,
    host=host,
    port=port,
    database=database,
)

# No certificate file is shipped with the web deployment, so the driver is
# only handed an "ssl" option dict instead of a CA path.
# NOTE(review): this dict does not name a CA bundle, so server-certificate
# verification is presumably not enforced — confirm against the PyMySQL docs.
engine = create_engine(
    connection_url,
    connect_args={"ssl": {"ssl_mode": "REQUIRED"}},
)
25
+
26
#load_data
@st.cache_data(ttl=3600)  # Cache successful loads for 1 hour
def _fetch_movies():
    """Read every row of the ``movies`` table through the module-level engine.

    Errors are deliberately not handled here: letting the exception propagate
    keeps a failed attempt out of the cache, so the next run retries the query
    instead of reusing a fallback for the whole TTL.
    """
    return pd.read_sql("SELECT * FROM movies", engine)


def load_data():
    """Return the movies table as a DataFrame.

    Returns:
        The (cached) query result. If the query raises, the error is shown
        with ``st.error`` and a two-row sample frame is returned instead so
        the rest of the app can still render.
    """
    try:
        return _fetch_movies()
    except Exception as e:  # UI boundary: any DB/driver failure falls back to sample data
        st.error(f"Database connection error: {e}")
        # Return sample data if connection fails
        return pd.DataFrame({
            'title': ['Sample Movie 1', 'Sample Movie 2'],
            'genre': ['Action', 'Drama'],
            'rating': [8.5, 7.9],
            'votes': [100000, 80000],
            'duration_minutes': [120, 95],
        })
42
+
43
df = load_data()

# Nothing can be shown without rows: report it and end this script run.
if df.empty:
    st.error("No data loaded from database. Check connection settings.")
    st.stop()

# Landing page
st.title("IMDb Movie Analytics Dashboard")

# A fresh session starts on the landing page.
if 'dashboard' not in st.session_state:
    st.session_state['dashboard'] = False

on_landing_page = not st.session_state['dashboard']
if on_landing_page:
    enter_clicked = st.button("Go to Dashboard", type="primary")
    if enter_clicked:
        # Flip the flag and restart the script so the dashboard renders.
        st.session_state['dashboard'] = True
        st.rerun()

    # Headline numbers, one metric per column.
    st.markdown("---")
    metric_columns = st.columns(3)
    headline_metrics = (
        ("Total Movies", len(df)),
        ("Unique Genres", df["genre"].nunique()),
        ("Avg Rating", f"{df['rating'].mean():.2f}"),
    )
    for column, (label, value) in zip(metric_columns, headline_metrics):
        with column:
            st.metric(label, value)

    st.markdown("### Quick Preview")
    preview_rows = df.head(10)
    st.dataframe(preview_rows)

    # The landing page is complete; do not fall through to the dashboard.
    st.stop()
76
+
77
if st.session_state['dashboard']:
    # Back button: clear the flag and restart the script on the landing page.
    back_clicked = st.button("← Back to Home")
    if back_clicked:
        st.session_state['dashboard'] = False
        st.rerun()

    # Sidebar menu; the chosen label is kept in `selected_tab`.
    st.sidebar.title("🎬 Navigation")
    sections = ["Top 10 Movies", "Movie Analysis", "All Movies Data", "Data Analytics"]
    selected_tab = st.sidebar.radio("Select Section", sections)
89
+
90
+ #tab1_top10_movies
91
+ if selected_tab == "Top 10 Movies":
92
+ st.header("🏆 Top 10 Movies")
93
+
94
+ genre_list = list(df["genre"].unique())
95
+ genre_select_mode = st.radio("Genre Filter", ["All Genres", "Custom Selection"], horizontal=True)
96
+
97
+ if genre_select_mode == "All Genres":
98
+ selected_top_genre = genre_list
99
+ st.info("Showing Top 10 Movies In All Genres")
100
+ else:
101
+ selected_top_genre = st.multiselect("Select Genres for Top 10 Movies", genre_list, default=genre_list[:3])
102
+ if not selected_top_genre:
103
+ st.warning("Select at least one genre to show")
104
+ st.stop()
105
+
106
+ top_df = df[df["genre"].isin(selected_top_genre)].copy()
107
+
108
+ name_col = None
109
+ for col in ["title", "name", "movie_name"]:
110
+ if col in top_df.columns:
111
+ name_col = col
112
+ break
113
+ if not name_col:
114
+ st.error("No valid movie name column found")
115
+ st.stop()
116
+
117
+ sort_option = st.radio("Sort Top 10 By", ["Rating", "Votes", "Rating & Votes"], horizontal=True)
118
+
119
+ if sort_option == "Rating":
120
+ sorted_df = top_df.sort_values(by="rating", ascending=False)
121
+ elif sort_option == "Votes":
122
+ sorted_df = top_df.sort_values(by="votes", ascending=False)
123
+ else:
124
+ top_df["score"] = top_df["rating"] * np.log(top_df["votes"] + 1)
125
+ sorted_df = top_df.sort_values(by="score", ascending=False)
126
+
127
+ top_movies = sorted_df.drop_duplicates(subset=name_col).head(10)
128
+
129
+ # Display results
130
+ for i, (_, row) in enumerate(top_movies.iterrows(), 1):
131
+ col1, col2 = st.columns([3, 1])
132
+ with col1:
133
+ st.markdown(f"**#{i} {row[name_col]}**")
134
+ st.markdown(f"Genre: {row['genre']} | Duration: {row.get('duration', 'N/A')} min")
135
+ with col2:
136
+ st.markdown(f"⭐ **{row['rating']:.1f}/10**")
137
+ st.markdown(f"👥 {row['votes']:,} votes")
138
+ st.divider()
139
+
140
+ # Also show as dataframe
141
+ with st.expander("📋 View as Table"):
142
+ display_columns = [name_col, "genre","duration","rating", "votes"]
143
+ st.dataframe(top_movies[display_columns])
144
+
145
+ #tab2_Movie Analysis
146
+ elif selected_tab == "Movie Analysis":
147
+ st.header("📊 Movie Analysis")
148
+
149
+ # Create tabs for different analyses
150
+ analysis_tab1, analysis_tab2, analysis_tab3 = st.tabs(["Genre Analysis", "Ratings & Votes", "Duration Analysis"])
151
+
152
+ with analysis_tab1:
153
+ st.subheader("Genre Distribution")
154
+ genre_counts = df["genre"].value_counts().reset_index()
155
+ genre_counts.columns = ["Genre", "Count"]
156
+
157
+ f1, ax1 = plt.subplots(figsize=(10, 6))
158
+ sns.barplot(data=genre_counts, x="Genre", y="Count", palette="viridis", ax=ax1)
159
+ ax1.set_title("Number of Movies per Genre")
160
+ ax1.set_xlabel("Genre")
161
+ ax1.set_ylabel("Number of Movies")
162
+ ax1.tick_params(axis='x', rotation=45)
163
+ st.pyplot(f1)
164
+
165
+ st.subheader("Most Popular Genres by Voting")
166
+ total_votes_per_genre = df.groupby("genre")["votes"].sum().sort_values(ascending=False)
167
+ f5, ax5 = plt.subplots(figsize=(8, 8))
168
+ ax5.pie(total_votes_per_genre, labels=total_votes_per_genre.index, autopct="%1.1f%%", startangle=140, colors=sns.color_palette("pastel"))
169
+ ax5.set_title("Most Popular Genres by Total Voting Counts")
170
+ ax5.axis("equal")
171
+ st.pyplot(f5)
172
+
173
+ with analysis_tab2:
174
+ #vote_trends
175
+ st.subheader("Voting Trends by Genre")
176
+ avg_votes = df.groupby("genre")["votes"].mean().sort_values(ascending=True).reset_index()
177
+
178
+ f3, ax3 = plt.subplots(figsize=(10, 6))
179
+ sns.barplot(data=avg_votes, x="votes", y="genre", palette="cubehelix", ax=ax3)
180
+ ax3.set_title("Average Voting Count per Genre")
181
+ ax3.set_xlabel("Average Votes")
182
+ ax3.set_ylabel("Genre")
183
+ st.pyplot(f3)
184
+
185
+ #rating_distribution
186
+ st.subheader("Rating Distribution")
187
+ f4, ax4 = plt.subplots(figsize=(10, 6))
188
+ sns.boxplot(data=df, x="rating", color="lightcoral", ax=ax4)
189
+ ax4.set_title("Movie Ratings in Box plot")
190
+ ax4.set_xlabel("Rating")
191
+ st.pyplot(f4)
192
+
193
+ #heatmap
194
+ st.subheader("Ratings by Genre")
195
+ avg_rating_genre = df.groupby("genre")["rating"].mean().reset_index()
196
+ avg_rating_genre_pivot = avg_rating_genre.pivot_table(index="genre", values="rating")
197
+
198
+ f7, ax6 = plt.subplots(figsize=(8, len(avg_rating_genre_pivot) * 0.5 + 2))
199
+ sns.heatmap(avg_rating_genre_pivot, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax6)
200
+ ax6.set_title("Average Rating by Genre")
201
+ ax6.set_ylabel("Genre")
202
+ st.pyplot(f7)
203
+
204
+ #correlation
205
+ st.subheader("Correlation Analysis")
206
+ f8, ax7 = plt.subplots(figsize=(10, 6))
207
+ sns.scatterplot(data=df, x="votes", y="rating", hue="genre", alpha=0.7, palette="husl", ax=ax7)
208
+ ax7.set_title("Relationship Between Votes and Ratings")
209
+ ax7.set_xlabel("Votes")
210
+ ax7.set_ylabel("Rating")
211
+ ax7.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title="Genre")
212
+ st.pyplot(f8)
213
+
214
+ with analysis_tab3:
215
+ #movie_duration
216
+ st.subheader("Average Duration by Genre")
217
+ avg_duration = df.groupby("genre")["duration_minutes"].mean().sort_values(ascending=True).reset_index()
218
+
219
+ f2, ax2 = plt.subplots(figsize=(10, 6))
220
+ sns.barplot(data=avg_duration, x="duration_minutes", y="genre", palette="mako", ax=ax2)
221
+ ax2.set_title("Average Movie Duration per Genre")
222
+ ax2.set_xlabel("Average Duration (In Minutes)")
223
+ ax2.set_ylabel("Genre")
224
+ st.pyplot(f2)
225
+
226
+ #duration_distribution
227
+ st.subheader("Movie Duration Distribution")
228
+ f6, ax8 = plt.subplots(figsize=(10, 6))
229
+ sns.boxplot(data=df, x="duration_minutes", color="skyblue", ax=ax8)
230
+ ax8.set_title("Movie Durations in Box plot")
231
+ ax8.set_xlabel("Duration (In Minutes)")
232
+ st.pyplot(f6)
233
+
234
+ #rating_leaders
235
+ st.subheader("Genre-Based Rating Leaders")
236
+ title_col = None
237
+ for col in ["title", "name", "movie_name"]:
238
+ if col in df.columns:
239
+ title_col = col
240
+ break
241
+
242
+ if title_col:
243
+ top_rated_per_genre = df.sort_values(by="rating", ascending=False).drop_duplicates(subset=["genre"])
244
+ top_rated_per_genre = top_rated_per_genre[["genre", title_col, "rating", "votes"]].sort_values(by="genre")
245
+ top_rated_per_genre.columns = ["Genre", "Top Movie", "Rating", "Votes"]
246
+ st.dataframe(top_rated_per_genre, use_container_width=True)
247
+ else:
248
+ st.warning("No title column found")
249
+
250
+ #duration_extremes
251
+ st.subheader("Duration Extremes")
252
+ if title_col:
253
+ valid_durations = df[df["duration_minutes"] > 0]
254
+
255
+ if not valid_durations.empty:
256
+ shortest = valid_durations.loc[valid_durations["duration_minutes"].idxmin()]
257
+ longest = df.loc[df["duration_minutes"].idxmax()]
258
+
259
+ def minutes_to_text(minutes):
260
+ h = minutes // 60
261
+ m = minutes % 60
262
+ return f"{int(h)}h {int(m)}m"
263
+
264
+ extremes_df = pd.DataFrame([
265
+ {
266
+ "Type": "Shortest",
267
+ "Title": shortest[title_col],
268
+ "Genre": shortest["genre"],
269
+ "Duration": minutes_to_text(shortest["duration_minutes"])
270
+ },
271
+ {
272
+ "Type": "Longest",
273
+ "Title": longest[title_col],
274
+ "Genre": longest["genre"],
275
+ "Duration": minutes_to_text(longest["duration_minutes"])
276
+ }
277
+ ])
278
+
279
+ st.table(extremes_df)
280
+ else:
281
+ st.warning("No movie durations > 0")
282
+ else:
283
+ st.warning("Movie titles not found")
284
+
285
+ #tab3_allmovie_data
286
+ elif selected_tab == "All Movies Data":
287
+ st.header("🎞️ All Movies Data")
288
+
289
+ title_col = None
290
+ for col in ["movie_name", "duration", "rating", "votes"]:
291
+ if col in df.columns:
292
+ title_col = col
293
+ break
294
+
295
+ if not title_col:
296
+ st.error("No movie titles column found")
297
+ else:
298
+ display_cols = [title_col, "duration", "rating", "votes"]
299
+
300
+ selected_all_genre = st.selectbox(
301
+ "Select Genre to View All Movies",
302
+ ["All Genres"] + list(df["genre"].unique())
303
+ )
304
+
305
+ if selected_all_genre == "All Genres":
306
+ total_count = len(df)
307
+ st.markdown(f"**Total Movies:** {total_count}")
308
+ st.dataframe(df[display_cols])
309
+ else:
310
+ filtered_df = df[df["genre"] == selected_all_genre]
311
+ total_count = len(filtered_df)
312
+ st.markdown(f"**Total Movies in {selected_all_genre}:** {total_count}")
313
+ st.dataframe(filtered_df[display_cols])
314
+
315
+ #tab4_data_analytics
316
+ elif selected_tab == "Data Analytics":
317
+ st.header("🔍 Data Analytics")
318
+
319
+ st.sidebar.header("Custom Filters")
320
+ genre_filter_mode = st.sidebar.radio("Genre Filter Mode", ["All Genres", "Custom Selection"])
321
+
322
+ if genre_filter_mode == "All Genres":
323
+ selected_genre = df["genre"].unique().tolist()
324
+ else:
325
+ selected_genre = st.sidebar.multiselect("Select Genre(s)", df["genre"].unique().tolist(), default=df["genre"].unique().tolist()[:3])
326
+ if not selected_genre:
327
+ st.sidebar.warning("Please select at least one genre to apply filters")
328
+
329
+ duration_filter = st.sidebar.slider("Select Movie Duration (Minutes)", 0, 300, (90, 180))
330
+ rating_filter = st.sidebar.slider("Select Minimum Rating", 0.0, 10.0, 7.0)
331
+ votes_filter = st.sidebar.slider("Select Minimum Votes", 0, 500000, 10000)
332
+
333
+ #filters
334
+ filtered_df = df[
335
+ (df["duration_minutes"].between(duration_filter[0], duration_filter[1])) &
336
+ (df["rating"] >= rating_filter) &
337
+ (df["votes"] >= votes_filter) &
338
+ (df["genre"].isin(selected_genre))
339
+ ]
340
+
341
+ #filter_view
342
+ st.subheader("Filtered Movies")
343
+ st.write(f"Showing {len(filtered_df)} movies matching your filters")
344
+
345
+ if not filtered_df.empty:
346
+ col1, col2, col3 = st.columns(3)
347
+ with col1:
348
+ st.metric("Avg Rating", f"{filtered_df['rating'].mean():.2f}")
349
+ with col2:
350
+ st.metric("Avg Duration", f"{filtered_df['duration_minutes'].mean():.1f} min")
351
+ with col3:
352
+ st.metric("Avg Votes", f"{filtered_df['votes'].mean():,.0f}")
353
+
354
+ st.dataframe(filtered_df)
355
+ else:
356
+ st.warning("No movies match the selected filters. Try adjusting your criteria.")
357
+
358
# Footer: a horizontal rule followed by a centred credit line.
st.markdown("---")
footer_html = """
<div style='text-align: center'>
<p>🎬 IMDb Movie Analytics Dashboard | Data from TiDB Cloud | Built with Streamlit</p>
</div>
"""
# Raw HTML is needed for the centring <div>, hence unsafe_allow_html.
st.markdown(footer_html, unsafe_allow_html=True)