prasanthr0416's picture
Create app.py
e67e2ef verified
import streamlit as st
import pandas as pd
from sqlalchemy import create_engine
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
# Database Connection with secrets for Hugging Face
# Using environment variables for credentials
host = "gateway01.eu-central-1.prod.aws.tidbcloud.com"
port = 4000
database = "grab"
# Load credentials from secrets
user = st.secrets["TIDB_USER"]
password = st.secrets["TIDB_PASSWORD"]
# For Hugging Face web deployment, remove SSL certificate path
# Use SSL mode instead of certificate file
engine = create_engine(
f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}",
connect_args={"ssl": {"ssl_mode": "REQUIRED"}}
)
#load_data
@st.cache_data(ttl=3600) # Cache for 1 hour
def load_data():
query = "SELECT * FROM movies"
try:
return pd.read_sql(query, engine)
except Exception as e:
st.error(f"Database connection error: {e}")
# Return sample data if connection fails
return pd.DataFrame({
'title': ['Sample Movie 1', 'Sample Movie 2'],
'genre': ['Action', 'Drama'],
'rating': [8.5, 7.9],
'votes': [100000, 80000],
'duration_minutes': [120, 95]
})
df = load_data()
# Check if data loaded successfully
if df.empty:
st.error("No data loaded from database. Check connection settings.")
st.stop()
#front_page
st.title("IMDb Movie Analytics Dashboard")
# Initialize session state
if 'dashboard' not in st.session_state:
st.session_state['dashboard'] = False
if not st.session_state['dashboard']:
if st.button("Go to Dashboard", type="primary"):
st.session_state['dashboard'] = True
st.rerun()
# Show some basic stats on front page
st.markdown("---")
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Total Movies", len(df))
with col2:
st.metric("Unique Genres", df["genre"].nunique())
with col3:
st.metric("Avg Rating", f"{df['rating'].mean():.2f}")
st.markdown("### Quick Preview")
st.dataframe(df.head(10))
st.stop()
if st.session_state['dashboard']:
# Add a back button
if st.button("← Back to Home"):
st.session_state['dashboard'] = False
st.rerun()
#menu
st.sidebar.title("🎬 Navigation")
selected_tab = st.sidebar.radio(
"Select Section",
["Top 10 Movies", "Movie Analysis", "All Movies Data", "Data Analytics"]
)
#tab1_top10_movies
if selected_tab == "Top 10 Movies":
st.header("πŸ† Top 10 Movies")
genre_list = list(df["genre"].unique())
genre_select_mode = st.radio("Genre Filter", ["All Genres", "Custom Selection"], horizontal=True)
if genre_select_mode == "All Genres":
selected_top_genre = genre_list
st.info("Showing Top 10 Movies In All Genres")
else:
selected_top_genre = st.multiselect("Select Genres for Top 10 Movies", genre_list, default=genre_list[:3])
if not selected_top_genre:
st.warning("Select at least one genre to show")
st.stop()
top_df = df[df["genre"].isin(selected_top_genre)].copy()
name_col = None
for col in ["title", "name", "movie_name"]:
if col in top_df.columns:
name_col = col
break
if not name_col:
st.error("No valid movie name column found")
st.stop()
sort_option = st.radio("Sort Top 10 By", ["Rating", "Votes", "Rating & Votes"], horizontal=True)
if sort_option == "Rating":
sorted_df = top_df.sort_values(by="rating", ascending=False)
elif sort_option == "Votes":
sorted_df = top_df.sort_values(by="votes", ascending=False)
else:
top_df["score"] = top_df["rating"] * np.log(top_df["votes"] + 1)
sorted_df = top_df.sort_values(by="score", ascending=False)
top_movies = sorted_df.drop_duplicates(subset=name_col).head(10)
# Display results
for i, (_, row) in enumerate(top_movies.iterrows(), 1):
col1, col2 = st.columns([3, 1])
with col1:
st.markdown(f"**#{i} {row[name_col]}**")
st.markdown(f"Genre: {row['genre']} | Duration: {row.get('duration', 'N/A')} min")
with col2:
st.markdown(f"⭐ **{row['rating']:.1f}/10**")
st.markdown(f"πŸ‘₯ {row['votes']:,} votes")
st.divider()
# Also show as dataframe
with st.expander("πŸ“‹ View as Table"):
display_columns = [name_col, "genre","duration","rating", "votes"]
st.dataframe(top_movies[display_columns])
#tab2_Movie Analysis
elif selected_tab == "Movie Analysis":
st.header("πŸ“Š Movie Analysis")
# Create tabs for different analyses
analysis_tab1, analysis_tab2, analysis_tab3 = st.tabs(["Genre Analysis", "Ratings & Votes", "Duration Analysis"])
with analysis_tab1:
st.subheader("Genre Distribution")
genre_counts = df["genre"].value_counts().reset_index()
genre_counts.columns = ["Genre", "Count"]
f1, ax1 = plt.subplots(figsize=(10, 6))
sns.barplot(data=genre_counts, x="Genre", y="Count", palette="viridis", ax=ax1)
ax1.set_title("Number of Movies per Genre")
ax1.set_xlabel("Genre")
ax1.set_ylabel("Number of Movies")
ax1.tick_params(axis='x', rotation=45)
st.pyplot(f1)
st.subheader("Most Popular Genres by Voting")
total_votes_per_genre = df.groupby("genre")["votes"].sum().sort_values(ascending=False)
f5, ax5 = plt.subplots(figsize=(8, 8))
ax5.pie(total_votes_per_genre, labels=total_votes_per_genre.index, autopct="%1.1f%%", startangle=140, colors=sns.color_palette("pastel"))
ax5.set_title("Most Popular Genres by Total Voting Counts")
ax5.axis("equal")
st.pyplot(f5)
with analysis_tab2:
#vote_trends
st.subheader("Voting Trends by Genre")
avg_votes = df.groupby("genre")["votes"].mean().sort_values(ascending=True).reset_index()
f3, ax3 = plt.subplots(figsize=(10, 6))
sns.barplot(data=avg_votes, x="votes", y="genre", palette="cubehelix", ax=ax3)
ax3.set_title("Average Voting Count per Genre")
ax3.set_xlabel("Average Votes")
ax3.set_ylabel("Genre")
st.pyplot(f3)
#rating_distribution
st.subheader("Rating Distribution")
f4, ax4 = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x="rating", color="lightcoral", ax=ax4)
ax4.set_title("Movie Ratings in Box plot")
ax4.set_xlabel("Rating")
st.pyplot(f4)
#heatmap
st.subheader("Ratings by Genre")
avg_rating_genre = df.groupby("genre")["rating"].mean().reset_index()
avg_rating_genre_pivot = avg_rating_genre.pivot_table(index="genre", values="rating")
f7, ax6 = plt.subplots(figsize=(8, len(avg_rating_genre_pivot) * 0.5 + 2))
sns.heatmap(avg_rating_genre_pivot, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax6)
ax6.set_title("Average Rating by Genre")
ax6.set_ylabel("Genre")
st.pyplot(f7)
#correlation
st.subheader("Correlation Analysis")
f8, ax7 = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x="votes", y="rating", hue="genre", alpha=0.7, palette="husl", ax=ax7)
ax7.set_title("Relationship Between Votes and Ratings")
ax7.set_xlabel("Votes")
ax7.set_ylabel("Rating")
ax7.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title="Genre")
st.pyplot(f8)
with analysis_tab3:
#movie_duration
st.subheader("Average Duration by Genre")
avg_duration = df.groupby("genre")["duration_minutes"].mean().sort_values(ascending=True).reset_index()
f2, ax2 = plt.subplots(figsize=(10, 6))
sns.barplot(data=avg_duration, x="duration_minutes", y="genre", palette="mako", ax=ax2)
ax2.set_title("Average Movie Duration per Genre")
ax2.set_xlabel("Average Duration (In Minutes)")
ax2.set_ylabel("Genre")
st.pyplot(f2)
#duration_distribution
st.subheader("Movie Duration Distribution")
f6, ax8 = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x="duration_minutes", color="skyblue", ax=ax8)
ax8.set_title("Movie Durations in Box plot")
ax8.set_xlabel("Duration (In Minutes)")
st.pyplot(f6)
#rating_leaders
st.subheader("Genre-Based Rating Leaders")
title_col = None
for col in ["title", "name", "movie_name"]:
if col in df.columns:
title_col = col
break
if title_col:
top_rated_per_genre = df.sort_values(by="rating", ascending=False).drop_duplicates(subset=["genre"])
top_rated_per_genre = top_rated_per_genre[["genre", title_col, "rating", "votes"]].sort_values(by="genre")
top_rated_per_genre.columns = ["Genre", "Top Movie", "Rating", "Votes"]
st.dataframe(top_rated_per_genre, use_container_width=True)
else:
st.warning("No title column found")
#duration_extremes
st.subheader("Duration Extremes")
if title_col:
valid_durations = df[df["duration_minutes"] > 0]
if not valid_durations.empty:
shortest = valid_durations.loc[valid_durations["duration_minutes"].idxmin()]
longest = df.loc[df["duration_minutes"].idxmax()]
def minutes_to_text(minutes):
h = minutes // 60
m = minutes % 60
return f"{int(h)}h {int(m)}m"
extremes_df = pd.DataFrame([
{
"Type": "Shortest",
"Title": shortest[title_col],
"Genre": shortest["genre"],
"Duration": minutes_to_text(shortest["duration_minutes"])
},
{
"Type": "Longest",
"Title": longest[title_col],
"Genre": longest["genre"],
"Duration": minutes_to_text(longest["duration_minutes"])
}
])
st.table(extremes_df)
else:
st.warning("No movie durations > 0")
else:
st.warning("Movie titles not found")
#tab3_allmovie_data
elif selected_tab == "All Movies Data":
st.header("🎞️ All Movies Data")
title_col = None
for col in ["movie_name", "duration", "rating", "votes"]:
if col in df.columns:
title_col = col
break
if not title_col:
st.error("No movie titles column found")
else:
display_cols = [title_col, "duration", "rating", "votes"]
selected_all_genre = st.selectbox(
"Select Genre to View All Movies",
["All Genres"] + list(df["genre"].unique())
)
if selected_all_genre == "All Genres":
total_count = len(df)
st.markdown(f"**Total Movies:** {total_count}")
st.dataframe(df[display_cols])
else:
filtered_df = df[df["genre"] == selected_all_genre]
total_count = len(filtered_df)
st.markdown(f"**Total Movies in {selected_all_genre}:** {total_count}")
st.dataframe(filtered_df[display_cols])
#tab4_data_analytics
elif selected_tab == "Data Analytics":
st.header("πŸ” Data Analytics")
st.sidebar.header("Custom Filters")
genre_filter_mode = st.sidebar.radio("Genre Filter Mode", ["All Genres", "Custom Selection"])
if genre_filter_mode == "All Genres":
selected_genre = df["genre"].unique().tolist()
else:
selected_genre = st.sidebar.multiselect("Select Genre(s)", df["genre"].unique().tolist(), default=df["genre"].unique().tolist()[:3])
if not selected_genre:
st.sidebar.warning("Please select at least one genre to apply filters")
duration_filter = st.sidebar.slider("Select Movie Duration (Minutes)", 0, 300, (90, 180))
rating_filter = st.sidebar.slider("Select Minimum Rating", 0.0, 10.0, 7.0)
votes_filter = st.sidebar.slider("Select Minimum Votes", 0, 500000, 10000)
#filters
filtered_df = df[
(df["duration_minutes"].between(duration_filter[0], duration_filter[1])) &
(df["rating"] >= rating_filter) &
(df["votes"] >= votes_filter) &
(df["genre"].isin(selected_genre))
]
#filter_view
st.subheader("Filtered Movies")
st.write(f"Showing {len(filtered_df)} movies matching your filters")
if not filtered_df.empty:
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Avg Rating", f"{filtered_df['rating'].mean():.2f}")
with col2:
st.metric("Avg Duration", f"{filtered_df['duration_minutes'].mean():.1f} min")
with col3:
st.metric("Avg Votes", f"{filtered_df['votes'].mean():,.0f}")
st.dataframe(filtered_df)
else:
st.warning("No movies match the selected filters. Try adjusting your criteria.")
# Footer
st.markdown("---")
st.markdown("""
<div style='text-align: center'>
<p>🎬 IMDb Movie Analytics Dashboard | Data from TiDB Cloud | Built with Streamlit</p>
</div>
""", unsafe_allow_html=True)