import streamlit as st import pandas as pd from sqlalchemy import create_engine import seaborn as sns import matplotlib.pyplot as plt import numpy as np import os # Database Connection with secrets for Hugging Face # Using environment variables for credentials host = "gateway01.eu-central-1.prod.aws.tidbcloud.com" port = 4000 database = "grab" # Load credentials from secrets user = st.secrets["TIDB_USER"] password = st.secrets["TIDB_PASSWORD"] # For Hugging Face web deployment, remove SSL certificate path # Use SSL mode instead of certificate file engine = create_engine( f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}", connect_args={"ssl": {"ssl_mode": "REQUIRED"}} ) #load_data @st.cache_data(ttl=3600) # Cache for 1 hour def load_data(): query = "SELECT * FROM movies" try: return pd.read_sql(query, engine) except Exception as e: st.error(f"Database connection error: {e}") # Return sample data if connection fails return pd.DataFrame({ 'title': ['Sample Movie 1', 'Sample Movie 2'], 'genre': ['Action', 'Drama'], 'rating': [8.5, 7.9], 'votes': [100000, 80000], 'duration_minutes': [120, 95] }) df = load_data() # Check if data loaded successfully if df.empty: st.error("No data loaded from database. Check connection settings.") st.stop() #front_page st.title("IMDb Movie Analytics Dashboard") # Initialize session state if 'dashboard' not in st.session_state: st.session_state['dashboard'] = False if not st.session_state['dashboard']: if st.button("Go to Dashboard", type="primary"): st.session_state['dashboard'] = True st.rerun() # Show some basic stats on front page st.markdown("---") col1, col2, col3 = st.columns(3) with col1: st.metric("Total Movies", len(df)) with col2: st.metric("Unique Genres", df["genre"].nunique()) with col3: st.metric("Avg Rating", f"{df['rating'].mean():.2f}") st.markdown("### Quick Preview") st.dataframe(df.head(10)) st.stop() if st.session_state['dashboard']: # Add a back button if st.button("← Back to Home"): st.session_state['dashboard'] = False st.rerun() #menu st.sidebar.title("🎬 Navigation") selected_tab = st.sidebar.radio( "Select Section", ["Top 10 Movies", "Movie Analysis", "All Movies Data", "Data Analytics"] ) #tab1_top10_movies if selected_tab == "Top 10 Movies": st.header("🏆 Top 10 Movies") genre_list = list(df["genre"].unique()) genre_select_mode = st.radio("Genre Filter", ["All Genres", "Custom Selection"], horizontal=True) if genre_select_mode == "All Genres": selected_top_genre = genre_list st.info("Showing Top 10 Movies In All Genres") else: selected_top_genre = st.multiselect("Select Genres for Top 10 Movies", genre_list, default=genre_list[:3]) if not selected_top_genre: st.warning("Select at least one genre to show") st.stop() top_df = df[df["genre"].isin(selected_top_genre)].copy() name_col = None for col in ["title", "name", "movie_name"]: if col in top_df.columns: name_col = col break if not name_col: st.error("No valid movie name column found") st.stop() sort_option = st.radio("Sort Top 10 By", ["Rating", "Votes", "Rating & Votes"], horizontal=True) if sort_option == "Rating": sorted_df = top_df.sort_values(by="rating", ascending=False) elif sort_option == "Votes": sorted_df = top_df.sort_values(by="votes", ascending=False) else: top_df["score"] = top_df["rating"] * np.log(top_df["votes"] + 1) sorted_df = top_df.sort_values(by="score", ascending=False) top_movies = sorted_df.drop_duplicates(subset=name_col).head(10) # Display results for i, (_, row) in enumerate(top_movies.iterrows(), 1): col1, col2 = st.columns([3, 1]) with col1: st.markdown(f"**#{i} {row[name_col]}**") st.markdown(f"Genre: {row['genre']} | Duration: {row.get('duration', 'N/A')} min") with col2: st.markdown(f"⭐ **{row['rating']:.1f}/10**") st.markdown(f"👥 {row['votes']:,} votes") st.divider() # Also show as dataframe with st.expander("📋 View as Table"): display_columns = [name_col, "genre","duration","rating", "votes"] st.dataframe(top_movies[display_columns]) #tab2_Movie Analysis elif selected_tab == "Movie Analysis": st.header("📊 Movie Analysis") # Create tabs for different analyses analysis_tab1, analysis_tab2, analysis_tab3 = st.tabs(["Genre Analysis", "Ratings & Votes", "Duration Analysis"]) with analysis_tab1: st.subheader("Genre Distribution") genre_counts = df["genre"].value_counts().reset_index() genre_counts.columns = ["Genre", "Count"] f1, ax1 = plt.subplots(figsize=(10, 6)) sns.barplot(data=genre_counts, x="Genre", y="Count", palette="viridis", ax=ax1) ax1.set_title("Number of Movies per Genre") ax1.set_xlabel("Genre") ax1.set_ylabel("Number of Movies") ax1.tick_params(axis='x', rotation=45) st.pyplot(f1) st.subheader("Most Popular Genres by Voting") total_votes_per_genre = df.groupby("genre")["votes"].sum().sort_values(ascending=False) f5, ax5 = plt.subplots(figsize=(8, 8)) ax5.pie(total_votes_per_genre, labels=total_votes_per_genre.index, autopct="%1.1f%%", startangle=140, colors=sns.color_palette("pastel")) ax5.set_title("Most Popular Genres by Total Voting Counts") ax5.axis("equal") st.pyplot(f5) with analysis_tab2: #vote_trends st.subheader("Voting Trends by Genre") avg_votes = df.groupby("genre")["votes"].mean().sort_values(ascending=True).reset_index() f3, ax3 = plt.subplots(figsize=(10, 6)) sns.barplot(data=avg_votes, x="votes", y="genre", palette="cubehelix", ax=ax3) ax3.set_title("Average Voting Count per Genre") ax3.set_xlabel("Average Votes") ax3.set_ylabel("Genre") st.pyplot(f3) #rating_distribution st.subheader("Rating Distribution") f4, ax4 = plt.subplots(figsize=(10, 6)) sns.boxplot(data=df, x="rating", color="lightcoral", ax=ax4) ax4.set_title("Movie Ratings in Box plot") ax4.set_xlabel("Rating") st.pyplot(f4) #heatmap st.subheader("Ratings by Genre") avg_rating_genre = df.groupby("genre")["rating"].mean().reset_index() avg_rating_genre_pivot = avg_rating_genre.pivot_table(index="genre", values="rating") f7, ax6 = plt.subplots(figsize=(8, len(avg_rating_genre_pivot) * 0.5 + 2)) sns.heatmap(avg_rating_genre_pivot, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax6) ax6.set_title("Average Rating by Genre") ax6.set_ylabel("Genre") st.pyplot(f7) #correlation st.subheader("Correlation Analysis") f8, ax7 = plt.subplots(figsize=(10, 6)) sns.scatterplot(data=df, x="votes", y="rating", hue="genre", alpha=0.7, palette="husl", ax=ax7) ax7.set_title("Relationship Between Votes and Ratings") ax7.set_xlabel("Votes") ax7.set_ylabel("Rating") ax7.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title="Genre") st.pyplot(f8) with analysis_tab3: #movie_duration st.subheader("Average Duration by Genre") avg_duration = df.groupby("genre")["duration_minutes"].mean().sort_values(ascending=True).reset_index() f2, ax2 = plt.subplots(figsize=(10, 6)) sns.barplot(data=avg_duration, x="duration_minutes", y="genre", palette="mako", ax=ax2) ax2.set_title("Average Movie Duration per Genre") ax2.set_xlabel("Average Duration (In Minutes)") ax2.set_ylabel("Genre") st.pyplot(f2) #duration_distribution st.subheader("Movie Duration Distribution") f6, ax8 = plt.subplots(figsize=(10, 6)) sns.boxplot(data=df, x="duration_minutes", color="skyblue", ax=ax8) ax8.set_title("Movie Durations in Box plot") ax8.set_xlabel("Duration (In Minutes)") st.pyplot(f6) #rating_leaders st.subheader("Genre-Based Rating Leaders") title_col = None for col in ["title", "name", "movie_name"]: if col in df.columns: title_col = col break if title_col: top_rated_per_genre = df.sort_values(by="rating", ascending=False).drop_duplicates(subset=["genre"]) top_rated_per_genre = top_rated_per_genre[["genre", title_col, "rating", "votes"]].sort_values(by="genre") top_rated_per_genre.columns = ["Genre", "Top Movie", "Rating", "Votes"] st.dataframe(top_rated_per_genre, use_container_width=True) else: st.warning("No title column found") #duration_extremes st.subheader("Duration Extremes") if title_col: valid_durations = df[df["duration_minutes"] > 0] if not valid_durations.empty: shortest = valid_durations.loc[valid_durations["duration_minutes"].idxmin()] longest = df.loc[df["duration_minutes"].idxmax()] def minutes_to_text(minutes): h = minutes // 60 m = minutes % 60 return f"{int(h)}h {int(m)}m" extremes_df = pd.DataFrame([ { "Type": "Shortest", "Title": shortest[title_col], "Genre": shortest["genre"], "Duration": minutes_to_text(shortest["duration_minutes"]) }, { "Type": "Longest", "Title": longest[title_col], "Genre": longest["genre"], "Duration": minutes_to_text(longest["duration_minutes"]) } ]) st.table(extremes_df) else: st.warning("No movie durations > 0") else: st.warning("Movie titles not found") #tab3_allmovie_data elif selected_tab == "All Movies Data": st.header("🎞️ All Movies Data") title_col = None for col in ["movie_name", "duration", "rating", "votes"]: if col in df.columns: title_col = col break if not title_col: st.error("No movie titles column found") else: display_cols = [title_col, "duration", "rating", "votes"] selected_all_genre = st.selectbox( "Select Genre to View All Movies", ["All Genres"] + list(df["genre"].unique()) ) if selected_all_genre == "All Genres": total_count = len(df) st.markdown(f"**Total Movies:** {total_count}") st.dataframe(df[display_cols]) else: filtered_df = df[df["genre"] == selected_all_genre] total_count = len(filtered_df) st.markdown(f"**Total Movies in {selected_all_genre}:** {total_count}") st.dataframe(filtered_df[display_cols]) #tab4_data_analytics elif selected_tab == "Data Analytics": st.header("🔍 Data Analytics") st.sidebar.header("Custom Filters") genre_filter_mode = st.sidebar.radio("Genre Filter Mode", ["All Genres", "Custom Selection"]) if genre_filter_mode == "All Genres": selected_genre = df["genre"].unique().tolist() else: selected_genre = st.sidebar.multiselect("Select Genre(s)", df["genre"].unique().tolist(), default=df["genre"].unique().tolist()[:3]) if not selected_genre: st.sidebar.warning("Please select at least one genre to apply filters") duration_filter = st.sidebar.slider("Select Movie Duration (Minutes)", 0, 300, (90, 180)) rating_filter = st.sidebar.slider("Select Minimum Rating", 0.0, 10.0, 7.0) votes_filter = st.sidebar.slider("Select Minimum Votes", 0, 500000, 10000) #filters filtered_df = df[ (df["duration_minutes"].between(duration_filter[0], duration_filter[1])) & (df["rating"] >= rating_filter) & (df["votes"] >= votes_filter) & (df["genre"].isin(selected_genre)) ] #filter_view st.subheader("Filtered Movies") st.write(f"Showing {len(filtered_df)} movies matching your filters") if not filtered_df.empty: col1, col2, col3 = st.columns(3) with col1: st.metric("Avg Rating", f"{filtered_df['rating'].mean():.2f}") with col2: st.metric("Avg Duration", f"{filtered_df['duration_minutes'].mean():.1f} min") with col3: st.metric("Avg Votes", f"{filtered_df['votes'].mean():,.0f}") st.dataframe(filtered_df) else: st.warning("No movies match the selected filters. Try adjusting your criteria.") # Footer st.markdown("---") st.markdown("""

🎬 IMDb Movie Analytics Dashboard | Data from TiDB Cloud | Built with Streamlit

""", unsafe_allow_html=True)