Spaces:

prasanthr0416
/

IMDB_2024_Data_Analytics

Sleeping

IMDB_2024_Data_Analytics

File size: 14,363 Bytes

e67e2ef

import streamlit as st
import pandas as pd
from sqlalchemy import create_engine
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

# Database Connection with secrets for Hugging Face
# Using environment variables for credentials
host = "gateway01.eu-central-1.prod.aws.tidbcloud.com"
port = 4000
database = "grab"

# Load credentials from secrets
user = st.secrets["TIDB_USER"]
password = st.secrets["TIDB_PASSWORD"]

# For Hugging Face web deployment, remove SSL certificate path
# Use SSL mode instead of certificate file
engine = create_engine(
    f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}",
    connect_args={"ssl": {"ssl_mode": "REQUIRED"}}
)

#load_data
@st.cache_data(ttl=3600)  # Cache for 1 hour
def load_data():
    query = "SELECT * FROM movies"
    try:
        return pd.read_sql(query, engine)
    except Exception as e:
        st.error(f"Database connection error: {e}")
        # Return sample data if connection fails
        return pd.DataFrame({
            'title': ['Sample Movie 1', 'Sample Movie 2'],
            'genre': ['Action', 'Drama'],
            'rating': [8.5, 7.9],
            'votes': [100000, 80000],
            'duration_minutes': [120, 95]
        })

df = load_data()

# Check if data loaded successfully
if df.empty:
    st.error("No data loaded from database. Check connection settings.")
    st.stop()

#front_page
st.title("IMDb Movie Analytics Dashboard")

# Initialize session state
if 'dashboard' not in st.session_state:
    st.session_state['dashboard'] = False

if not st.session_state['dashboard']:
    if st.button("Go to Dashboard", type="primary"):
        st.session_state['dashboard'] = True
        st.rerun()
    
    # Show some basic stats on front page
    st.markdown("---")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Movies", len(df))
    with col2:
        st.metric("Unique Genres", df["genre"].nunique())
    with col3:
        st.metric("Avg Rating", f"{df['rating'].mean():.2f}")
    
    st.markdown("### Quick Preview")
    st.dataframe(df.head(10))
    
    st.stop()

if st.session_state['dashboard']:
    # Add a back button
    if st.button("← Back to Home"):
        st.session_state['dashboard'] = False
        st.rerun()

    #menu
    st.sidebar.title("🎬 Navigation")
    selected_tab = st.sidebar.radio(
        "Select Section",
        ["Top 10 Movies", "Movie Analysis", "All Movies Data", "Data Analytics"]
    )

    #tab1_top10_movies
    if selected_tab == "Top 10 Movies":
        st.header("🏆 Top 10 Movies")
        
        genre_list = list(df["genre"].unique())
        genre_select_mode = st.radio("Genre Filter", ["All Genres", "Custom Selection"], horizontal=True)

        if genre_select_mode == "All Genres":
            selected_top_genre = genre_list
            st.info("Showing Top 10 Movies In All Genres")
        else:
            selected_top_genre = st.multiselect("Select Genres for Top 10 Movies", genre_list, default=genre_list[:3])
            if not selected_top_genre:
                st.warning("Select at least one genre to show")
                st.stop()

        top_df = df[df["genre"].isin(selected_top_genre)].copy()

        name_col = None
        for col in ["title", "name", "movie_name"]:
            if col in top_df.columns:
                name_col = col
                break
        if not name_col:
            st.error("No valid movie name column found")
            st.stop()

        sort_option = st.radio("Sort Top 10 By", ["Rating", "Votes", "Rating & Votes"], horizontal=True)

        if sort_option == "Rating":
            sorted_df = top_df.sort_values(by="rating", ascending=False)
        elif sort_option == "Votes":
            sorted_df = top_df.sort_values(by="votes", ascending=False)
        else:
            top_df["score"] = top_df["rating"] * np.log(top_df["votes"] + 1)
            sorted_df = top_df.sort_values(by="score", ascending=False)

        top_movies = sorted_df.drop_duplicates(subset=name_col).head(10)

        # Display results
        for i, (_, row) in enumerate(top_movies.iterrows(), 1):
            col1, col2 = st.columns([3, 1])
            with col1:
                st.markdown(f"**#{i} {row[name_col]}**")
                st.markdown(f"Genre: {row['genre']} | Duration: {row.get('duration', 'N/A')} min")
            with col2:
                st.markdown(f"⭐ **{row['rating']:.1f}/10**")
                st.markdown(f"👥 {row['votes']:,} votes")
            st.divider()

        # Also show as dataframe
        with st.expander("📋 View as Table"):
            display_columns = [name_col, "genre","duration","rating", "votes"]
            st.dataframe(top_movies[display_columns])

    #tab2_Movie Analysis
    elif selected_tab == "Movie Analysis":
        st.header("📊 Movie Analysis")
        
        # Create tabs for different analyses
        analysis_tab1, analysis_tab2, analysis_tab3 = st.tabs(["Genre Analysis", "Ratings & Votes", "Duration Analysis"])

        with analysis_tab1:
            st.subheader("Genre Distribution")
            genre_counts = df["genre"].value_counts().reset_index()
            genre_counts.columns = ["Genre", "Count"]

            f1, ax1 = plt.subplots(figsize=(10, 6))
            sns.barplot(data=genre_counts, x="Genre", y="Count", palette="viridis", ax=ax1)
            ax1.set_title("Number of Movies per Genre")
            ax1.set_xlabel("Genre")
            ax1.set_ylabel("Number of Movies")
            ax1.tick_params(axis='x', rotation=45)
            st.pyplot(f1)

            st.subheader("Most Popular Genres by Voting")
            total_votes_per_genre = df.groupby("genre")["votes"].sum().sort_values(ascending=False)
            f5, ax5 = plt.subplots(figsize=(8, 8))
            ax5.pie(total_votes_per_genre, labels=total_votes_per_genre.index, autopct="%1.1f%%", startangle=140, colors=sns.color_palette("pastel"))
            ax5.set_title("Most Popular Genres by Total Voting Counts")
            ax5.axis("equal")
            st.pyplot(f5)

        with analysis_tab2:
            #vote_trends
            st.subheader("Voting Trends by Genre")
            avg_votes = df.groupby("genre")["votes"].mean().sort_values(ascending=True).reset_index()

            f3, ax3 = plt.subplots(figsize=(10, 6))
            sns.barplot(data=avg_votes, x="votes", y="genre", palette="cubehelix", ax=ax3)
            ax3.set_title("Average Voting Count per Genre")
            ax3.set_xlabel("Average Votes")
            ax3.set_ylabel("Genre")
            st.pyplot(f3)

            #rating_distribution
            st.subheader("Rating Distribution")
            f4, ax4 = plt.subplots(figsize=(10, 6))
            sns.boxplot(data=df, x="rating", color="lightcoral", ax=ax4)
            ax4.set_title("Movie Ratings in Box plot")
            ax4.set_xlabel("Rating")
            st.pyplot(f4)

            #heatmap
            st.subheader("Ratings by Genre")
            avg_rating_genre = df.groupby("genre")["rating"].mean().reset_index()
            avg_rating_genre_pivot = avg_rating_genre.pivot_table(index="genre", values="rating")

            f7, ax6 = plt.subplots(figsize=(8, len(avg_rating_genre_pivot) * 0.5 + 2))
            sns.heatmap(avg_rating_genre_pivot, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax6)
            ax6.set_title("Average Rating by Genre")
            ax6.set_ylabel("Genre")
            st.pyplot(f7)

            #correlation
            st.subheader("Correlation Analysis")
            f8, ax7 = plt.subplots(figsize=(10, 6))
            sns.scatterplot(data=df, x="votes", y="rating", hue="genre", alpha=0.7, palette="husl", ax=ax7)
            ax7.set_title("Relationship Between Votes and Ratings")
            ax7.set_xlabel("Votes")
            ax7.set_ylabel("Rating")
            ax7.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title="Genre")
            st.pyplot(f8)

        with analysis_tab3:
            #movie_duration
            st.subheader("Average Duration by Genre")
            avg_duration = df.groupby("genre")["duration_minutes"].mean().sort_values(ascending=True).reset_index()

            f2, ax2 = plt.subplots(figsize=(10, 6))
            sns.barplot(data=avg_duration, x="duration_minutes", y="genre", palette="mako", ax=ax2)
            ax2.set_title("Average Movie Duration per Genre")
            ax2.set_xlabel("Average Duration (In Minutes)")
            ax2.set_ylabel("Genre")
            st.pyplot(f2)

            #duration_distribution
            st.subheader("Movie Duration Distribution")
            f6, ax8 = plt.subplots(figsize=(10, 6))
            sns.boxplot(data=df, x="duration_minutes", color="skyblue", ax=ax8)
            ax8.set_title("Movie Durations in Box plot")
            ax8.set_xlabel("Duration (In Minutes)")
            st.pyplot(f6)

            #rating_leaders
            st.subheader("Genre-Based Rating Leaders")
            title_col = None
            for col in ["title", "name", "movie_name"]:
                if col in df.columns:
                    title_col = col
                    break

            if title_col:
                top_rated_per_genre = df.sort_values(by="rating", ascending=False).drop_duplicates(subset=["genre"])
                top_rated_per_genre = top_rated_per_genre[["genre", title_col, "rating", "votes"]].sort_values(by="genre")
                top_rated_per_genre.columns = ["Genre", "Top Movie", "Rating", "Votes"]
                st.dataframe(top_rated_per_genre, use_container_width=True)
            else:
                st.warning("No title column found")

            #duration_extremes
            st.subheader("Duration Extremes")
            if title_col:
                valid_durations = df[df["duration_minutes"] > 0]

                if not valid_durations.empty:
                    shortest = valid_durations.loc[valid_durations["duration_minutes"].idxmin()]
                    longest = df.loc[df["duration_minutes"].idxmax()]

                    def minutes_to_text(minutes):
                        h = minutes // 60
                        m = minutes % 60
                        return f"{int(h)}h {int(m)}m"

                    extremes_df = pd.DataFrame([
                        {
                            "Type": "Shortest",
                            "Title": shortest[title_col],
                            "Genre": shortest["genre"],
                            "Duration": minutes_to_text(shortest["duration_minutes"])
                        },
                        {
                            "Type": "Longest",
                            "Title": longest[title_col],
                            "Genre": longest["genre"],
                            "Duration": minutes_to_text(longest["duration_minutes"])
                        }
                    ])

                    st.table(extremes_df)
                else:
                    st.warning("No movie durations > 0")
            else:
                st.warning("Movie titles not found")

    #tab3_allmovie_data
    elif selected_tab == "All Movies Data":
        st.header("🎞️ All Movies Data")

        title_col = None
        for col in ["movie_name", "duration", "rating", "votes"]:
            if col in df.columns:
                title_col = col
                break

        if not title_col:
            st.error("No movie titles column found")
        else:
            display_cols = [title_col, "duration", "rating", "votes"]

            selected_all_genre = st.selectbox(
                "Select Genre to View All Movies",
                ["All Genres"] + list(df["genre"].unique())
            )

            if selected_all_genre == "All Genres":
                total_count = len(df)
                st.markdown(f"**Total Movies:** {total_count}")
                st.dataframe(df[display_cols])
            else:
                filtered_df = df[df["genre"] == selected_all_genre]
                total_count = len(filtered_df)
                st.markdown(f"**Total Movies in {selected_all_genre}:** {total_count}")
                st.dataframe(filtered_df[display_cols])

    #tab4_data_analytics
    elif selected_tab == "Data Analytics":
        st.header("🔍 Data Analytics")
        
        st.sidebar.header("Custom Filters")
        genre_filter_mode = st.sidebar.radio("Genre Filter Mode", ["All Genres", "Custom Selection"])

        if genre_filter_mode == "All Genres":
            selected_genre = df["genre"].unique().tolist()
        else:
            selected_genre = st.sidebar.multiselect("Select Genre(s)", df["genre"].unique().tolist(), default=df["genre"].unique().tolist()[:3])
            if not selected_genre:
                st.sidebar.warning("Please select at least one genre to apply filters")

        duration_filter = st.sidebar.slider("Select Movie Duration (Minutes)", 0, 300, (90, 180))
        rating_filter = st.sidebar.slider("Select Minimum Rating", 0.0, 10.0, 7.0)
        votes_filter = st.sidebar.slider("Select Minimum Votes", 0, 500000, 10000)

        #filters
        filtered_df = df[
            (df["duration_minutes"].between(duration_filter[0], duration_filter[1])) &
            (df["rating"] >= rating_filter) &
            (df["votes"] >= votes_filter) &
            (df["genre"].isin(selected_genre))
        ]

        #filter_view
        st.subheader("Filtered Movies")
        st.write(f"Showing {len(filtered_df)} movies matching your filters")
        
        if not filtered_df.empty:
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Avg Rating", f"{filtered_df['rating'].mean():.2f}")
            with col2:
                st.metric("Avg Duration", f"{filtered_df['duration_minutes'].mean():.1f} min")
            with col3:
                st.metric("Avg Votes", f"{filtered_df['votes'].mean():,.0f}")
            
            st.dataframe(filtered_df)
        else:
            st.warning("No movies match the selected filters. Try adjusting your criteria.")

    # Footer
    st.markdown("---")
    st.markdown("""
    <div style='text-align: center'>
        <p>🎬 IMDb Movie Analytics Dashboard | Data from TiDB Cloud | Built with Streamlit</p>
    </div>
    """, unsafe_allow_html=True)