File size: 14,363 Bytes
e67e2ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
import streamlit as st
import pandas as pd
from sqlalchemy import create_engine
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

# Database Connection with secrets for Hugging Face
# Using environment variables for credentials
host = "gateway01.eu-central-1.prod.aws.tidbcloud.com"
port = 4000
database = "grab"

# Load credentials from secrets
user = st.secrets["TIDB_USER"]
password = st.secrets["TIDB_PASSWORD"]

# For Hugging Face web deployment, remove SSL certificate path
# Use SSL mode instead of certificate file
engine = create_engine(
    f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}",
    connect_args={"ssl": {"ssl_mode": "REQUIRED"}}
)

#load_data
@st.cache_data(ttl=3600)  # Cache for 1 hour
def load_data():
    query = "SELECT * FROM movies"
    try:
        return pd.read_sql(query, engine)
    except Exception as e:
        st.error(f"Database connection error: {e}")
        # Return sample data if connection fails
        return pd.DataFrame({
            'title': ['Sample Movie 1', 'Sample Movie 2'],
            'genre': ['Action', 'Drama'],
            'rating': [8.5, 7.9],
            'votes': [100000, 80000],
            'duration_minutes': [120, 95]
        })

df = load_data()

# Check if data loaded successfully
if df.empty:
    st.error("No data loaded from database. Check connection settings.")
    st.stop()

#front_page
st.title("IMDb Movie Analytics Dashboard")

# Initialize session state
if 'dashboard' not in st.session_state:
    st.session_state['dashboard'] = False

if not st.session_state['dashboard']:
    if st.button("Go to Dashboard", type="primary"):
        st.session_state['dashboard'] = True
        st.rerun()
    
    # Show some basic stats on front page
    st.markdown("---")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Movies", len(df))
    with col2:
        st.metric("Unique Genres", df["genre"].nunique())
    with col3:
        st.metric("Avg Rating", f"{df['rating'].mean():.2f}")
    
    st.markdown("### Quick Preview")
    st.dataframe(df.head(10))
    
    st.stop()

if st.session_state['dashboard']:
    # Add a back button
    if st.button("← Back to Home"):
        st.session_state['dashboard'] = False
        st.rerun()

    #menu
    st.sidebar.title("🎬 Navigation")
    selected_tab = st.sidebar.radio(
        "Select Section",
        ["Top 10 Movies", "Movie Analysis", "All Movies Data", "Data Analytics"]
    )

    #tab1_top10_movies
    if selected_tab == "Top 10 Movies":
        st.header("πŸ† Top 10 Movies")
        
        genre_list = list(df["genre"].unique())
        genre_select_mode = st.radio("Genre Filter", ["All Genres", "Custom Selection"], horizontal=True)

        if genre_select_mode == "All Genres":
            selected_top_genre = genre_list
            st.info("Showing Top 10 Movies In All Genres")
        else:
            selected_top_genre = st.multiselect("Select Genres for Top 10 Movies", genre_list, default=genre_list[:3])
            if not selected_top_genre:
                st.warning("Select at least one genre to show")
                st.stop()

        top_df = df[df["genre"].isin(selected_top_genre)].copy()

        name_col = None
        for col in ["title", "name", "movie_name"]:
            if col in top_df.columns:
                name_col = col
                break
        if not name_col:
            st.error("No valid movie name column found")
            st.stop()

        sort_option = st.radio("Sort Top 10 By", ["Rating", "Votes", "Rating & Votes"], horizontal=True)

        if sort_option == "Rating":
            sorted_df = top_df.sort_values(by="rating", ascending=False)
        elif sort_option == "Votes":
            sorted_df = top_df.sort_values(by="votes", ascending=False)
        else:
            top_df["score"] = top_df["rating"] * np.log(top_df["votes"] + 1)
            sorted_df = top_df.sort_values(by="score", ascending=False)

        top_movies = sorted_df.drop_duplicates(subset=name_col).head(10)

        # Display results
        for i, (_, row) in enumerate(top_movies.iterrows(), 1):
            col1, col2 = st.columns([3, 1])
            with col1:
                st.markdown(f"**#{i} {row[name_col]}**")
                st.markdown(f"Genre: {row['genre']} | Duration: {row.get('duration', 'N/A')} min")
            with col2:
                st.markdown(f"⭐ **{row['rating']:.1f}/10**")
                st.markdown(f"πŸ‘₯ {row['votes']:,} votes")
            st.divider()

        # Also show as dataframe
        with st.expander("πŸ“‹ View as Table"):
            display_columns = [name_col, "genre","duration","rating", "votes"]
            st.dataframe(top_movies[display_columns])

    #tab2_Movie Analysis
    elif selected_tab == "Movie Analysis":
        st.header("πŸ“Š Movie Analysis")
        
        # Create tabs for different analyses
        analysis_tab1, analysis_tab2, analysis_tab3 = st.tabs(["Genre Analysis", "Ratings & Votes", "Duration Analysis"])

        with analysis_tab1:
            st.subheader("Genre Distribution")
            genre_counts = df["genre"].value_counts().reset_index()
            genre_counts.columns = ["Genre", "Count"]

            f1, ax1 = plt.subplots(figsize=(10, 6))
            sns.barplot(data=genre_counts, x="Genre", y="Count", palette="viridis", ax=ax1)
            ax1.set_title("Number of Movies per Genre")
            ax1.set_xlabel("Genre")
            ax1.set_ylabel("Number of Movies")
            ax1.tick_params(axis='x', rotation=45)
            st.pyplot(f1)

            st.subheader("Most Popular Genres by Voting")
            total_votes_per_genre = df.groupby("genre")["votes"].sum().sort_values(ascending=False)
            f5, ax5 = plt.subplots(figsize=(8, 8))
            ax5.pie(total_votes_per_genre, labels=total_votes_per_genre.index, autopct="%1.1f%%", startangle=140, colors=sns.color_palette("pastel"))
            ax5.set_title("Most Popular Genres by Total Voting Counts")
            ax5.axis("equal")
            st.pyplot(f5)

        with analysis_tab2:
            #vote_trends
            st.subheader("Voting Trends by Genre")
            avg_votes = df.groupby("genre")["votes"].mean().sort_values(ascending=True).reset_index()

            f3, ax3 = plt.subplots(figsize=(10, 6))
            sns.barplot(data=avg_votes, x="votes", y="genre", palette="cubehelix", ax=ax3)
            ax3.set_title("Average Voting Count per Genre")
            ax3.set_xlabel("Average Votes")
            ax3.set_ylabel("Genre")
            st.pyplot(f3)

            #rating_distribution
            st.subheader("Rating Distribution")
            f4, ax4 = plt.subplots(figsize=(10, 6))
            sns.boxplot(data=df, x="rating", color="lightcoral", ax=ax4)
            ax4.set_title("Movie Ratings in Box plot")
            ax4.set_xlabel("Rating")
            st.pyplot(f4)

            #heatmap
            st.subheader("Ratings by Genre")
            avg_rating_genre = df.groupby("genre")["rating"].mean().reset_index()
            avg_rating_genre_pivot = avg_rating_genre.pivot_table(index="genre", values="rating")

            f7, ax6 = plt.subplots(figsize=(8, len(avg_rating_genre_pivot) * 0.5 + 2))
            sns.heatmap(avg_rating_genre_pivot, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax6)
            ax6.set_title("Average Rating by Genre")
            ax6.set_ylabel("Genre")
            st.pyplot(f7)

            #correlation
            st.subheader("Correlation Analysis")
            f8, ax7 = plt.subplots(figsize=(10, 6))
            sns.scatterplot(data=df, x="votes", y="rating", hue="genre", alpha=0.7, palette="husl", ax=ax7)
            ax7.set_title("Relationship Between Votes and Ratings")
            ax7.set_xlabel("Votes")
            ax7.set_ylabel("Rating")
            ax7.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title="Genre")
            st.pyplot(f8)

        with analysis_tab3:
            #movie_duration
            st.subheader("Average Duration by Genre")
            avg_duration = df.groupby("genre")["duration_minutes"].mean().sort_values(ascending=True).reset_index()

            f2, ax2 = plt.subplots(figsize=(10, 6))
            sns.barplot(data=avg_duration, x="duration_minutes", y="genre", palette="mako", ax=ax2)
            ax2.set_title("Average Movie Duration per Genre")
            ax2.set_xlabel("Average Duration (In Minutes)")
            ax2.set_ylabel("Genre")
            st.pyplot(f2)

            #duration_distribution
            st.subheader("Movie Duration Distribution")
            f6, ax8 = plt.subplots(figsize=(10, 6))
            sns.boxplot(data=df, x="duration_minutes", color="skyblue", ax=ax8)
            ax8.set_title("Movie Durations in Box plot")
            ax8.set_xlabel("Duration (In Minutes)")
            st.pyplot(f6)

            #rating_leaders
            st.subheader("Genre-Based Rating Leaders")
            title_col = None
            for col in ["title", "name", "movie_name"]:
                if col in df.columns:
                    title_col = col
                    break

            if title_col:
                top_rated_per_genre = df.sort_values(by="rating", ascending=False).drop_duplicates(subset=["genre"])
                top_rated_per_genre = top_rated_per_genre[["genre", title_col, "rating", "votes"]].sort_values(by="genre")
                top_rated_per_genre.columns = ["Genre", "Top Movie", "Rating", "Votes"]
                st.dataframe(top_rated_per_genre, use_container_width=True)
            else:
                st.warning("No title column found")

            #duration_extremes
            st.subheader("Duration Extremes")
            if title_col:
                valid_durations = df[df["duration_minutes"] > 0]

                if not valid_durations.empty:
                    shortest = valid_durations.loc[valid_durations["duration_minutes"].idxmin()]
                    longest = df.loc[df["duration_minutes"].idxmax()]

                    def minutes_to_text(minutes):
                        h = minutes // 60
                        m = minutes % 60
                        return f"{int(h)}h {int(m)}m"

                    extremes_df = pd.DataFrame([
                        {
                            "Type": "Shortest",
                            "Title": shortest[title_col],
                            "Genre": shortest["genre"],
                            "Duration": minutes_to_text(shortest["duration_minutes"])
                        },
                        {
                            "Type": "Longest",
                            "Title": longest[title_col],
                            "Genre": longest["genre"],
                            "Duration": minutes_to_text(longest["duration_minutes"])
                        }
                    ])

                    st.table(extremes_df)
                else:
                    st.warning("No movie durations > 0")
            else:
                st.warning("Movie titles not found")

    #tab3_allmovie_data
    elif selected_tab == "All Movies Data":
        st.header("🎞️ All Movies Data")

        title_col = None
        for col in ["movie_name", "duration", "rating", "votes"]:
            if col in df.columns:
                title_col = col
                break

        if not title_col:
            st.error("No movie titles column found")
        else:
            display_cols = [title_col, "duration", "rating", "votes"]

            selected_all_genre = st.selectbox(
                "Select Genre to View All Movies",
                ["All Genres"] + list(df["genre"].unique())
            )

            if selected_all_genre == "All Genres":
                total_count = len(df)
                st.markdown(f"**Total Movies:** {total_count}")
                st.dataframe(df[display_cols])
            else:
                filtered_df = df[df["genre"] == selected_all_genre]
                total_count = len(filtered_df)
                st.markdown(f"**Total Movies in {selected_all_genre}:** {total_count}")
                st.dataframe(filtered_df[display_cols])

    #tab4_data_analytics
    elif selected_tab == "Data Analytics":
        st.header("πŸ” Data Analytics")
        
        st.sidebar.header("Custom Filters")
        genre_filter_mode = st.sidebar.radio("Genre Filter Mode", ["All Genres", "Custom Selection"])

        if genre_filter_mode == "All Genres":
            selected_genre = df["genre"].unique().tolist()
        else:
            selected_genre = st.sidebar.multiselect("Select Genre(s)", df["genre"].unique().tolist(), default=df["genre"].unique().tolist()[:3])
            if not selected_genre:
                st.sidebar.warning("Please select at least one genre to apply filters")

        duration_filter = st.sidebar.slider("Select Movie Duration (Minutes)", 0, 300, (90, 180))
        rating_filter = st.sidebar.slider("Select Minimum Rating", 0.0, 10.0, 7.0)
        votes_filter = st.sidebar.slider("Select Minimum Votes", 0, 500000, 10000)

        #filters
        filtered_df = df[
            (df["duration_minutes"].between(duration_filter[0], duration_filter[1])) &
            (df["rating"] >= rating_filter) &
            (df["votes"] >= votes_filter) &
            (df["genre"].isin(selected_genre))
        ]

        #filter_view
        st.subheader("Filtered Movies")
        st.write(f"Showing {len(filtered_df)} movies matching your filters")
        
        if not filtered_df.empty:
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Avg Rating", f"{filtered_df['rating'].mean():.2f}")
            with col2:
                st.metric("Avg Duration", f"{filtered_df['duration_minutes'].mean():.1f} min")
            with col3:
                st.metric("Avg Votes", f"{filtered_df['votes'].mean():,.0f}")
            
            st.dataframe(filtered_df)
        else:
            st.warning("No movies match the selected filters. Try adjusting your criteria.")

    # Footer
    st.markdown("---")
    st.markdown("""
    <div style='text-align: center'>
        <p>🎬 IMDb Movie Analytics Dashboard | Data from TiDB Cloud | Built with Streamlit</p>
    </div>
    """, unsafe_allow_html=True)