|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
from sqlalchemy import create_engine |
|
|
import seaborn as sns |
|
|
import matplotlib.pyplot as plt |
|
|
import numpy as np |
|
|
import os |
|
|
|
|
|
|
|
|
|
|
|
host = "gateway01.eu-central-1.prod.aws.tidbcloud.com" |
|
|
port = 4000 |
|
|
database = "grab" |
|
|
|
|
|
|
|
|
user = st.secrets["TIDB_USER"] |
|
|
password = st.secrets["TIDB_PASSWORD"] |
|
|
|
|
|
|
|
|
|
|
|
engine = create_engine( |
|
|
f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}", |
|
|
connect_args={"ssl": {"ssl_mode": "REQUIRED"}} |
|
|
) |
|
|
|
|
|
|
|
|
@st.cache_data(ttl=3600) |
|
|
def load_data(): |
|
|
query = "SELECT * FROM movies" |
|
|
try: |
|
|
return pd.read_sql(query, engine) |
|
|
except Exception as e: |
|
|
st.error(f"Database connection error: {e}") |
|
|
|
|
|
return pd.DataFrame({ |
|
|
'title': ['Sample Movie 1', 'Sample Movie 2'], |
|
|
'genre': ['Action', 'Drama'], |
|
|
'rating': [8.5, 7.9], |
|
|
'votes': [100000, 80000], |
|
|
'duration_minutes': [120, 95] |
|
|
}) |
|
|
|
|
|
df = load_data() |
|
|
|
|
|
|
|
|
if df.empty: |
|
|
st.error("No data loaded from database. Check connection settings.") |
|
|
st.stop() |
|
|
|
|
|
|
|
|
st.title("IMDb Movie Analytics Dashboard") |
|
|
|
|
|
|
|
|
if 'dashboard' not in st.session_state: |
|
|
st.session_state['dashboard'] = False |
|
|
|
|
|
if not st.session_state['dashboard']: |
|
|
if st.button("Go to Dashboard", type="primary"): |
|
|
st.session_state['dashboard'] = True |
|
|
st.rerun() |
|
|
|
|
|
|
|
|
st.markdown("---") |
|
|
col1, col2, col3 = st.columns(3) |
|
|
with col1: |
|
|
st.metric("Total Movies", len(df)) |
|
|
with col2: |
|
|
st.metric("Unique Genres", df["genre"].nunique()) |
|
|
with col3: |
|
|
st.metric("Avg Rating", f"{df['rating'].mean():.2f}") |
|
|
|
|
|
st.markdown("### Quick Preview") |
|
|
st.dataframe(df.head(10)) |
|
|
|
|
|
st.stop() |
|
|
|
|
|
if st.session_state['dashboard']: |
|
|
|
|
|
if st.button("β Back to Home"): |
|
|
st.session_state['dashboard'] = False |
|
|
st.rerun() |
|
|
|
|
|
|
|
|
st.sidebar.title("π¬ Navigation") |
|
|
selected_tab = st.sidebar.radio( |
|
|
"Select Section", |
|
|
["Top 10 Movies", "Movie Analysis", "All Movies Data", "Data Analytics"] |
|
|
) |
|
|
|
|
|
|
|
|
if selected_tab == "Top 10 Movies": |
|
|
st.header("π Top 10 Movies") |
|
|
|
|
|
genre_list = list(df["genre"].unique()) |
|
|
genre_select_mode = st.radio("Genre Filter", ["All Genres", "Custom Selection"], horizontal=True) |
|
|
|
|
|
if genre_select_mode == "All Genres": |
|
|
selected_top_genre = genre_list |
|
|
st.info("Showing Top 10 Movies In All Genres") |
|
|
else: |
|
|
selected_top_genre = st.multiselect("Select Genres for Top 10 Movies", genre_list, default=genre_list[:3]) |
|
|
if not selected_top_genre: |
|
|
st.warning("Select at least one genre to show") |
|
|
st.stop() |
|
|
|
|
|
top_df = df[df["genre"].isin(selected_top_genre)].copy() |
|
|
|
|
|
name_col = None |
|
|
for col in ["title", "name", "movie_name"]: |
|
|
if col in top_df.columns: |
|
|
name_col = col |
|
|
break |
|
|
if not name_col: |
|
|
st.error("No valid movie name column found") |
|
|
st.stop() |
|
|
|
|
|
sort_option = st.radio("Sort Top 10 By", ["Rating", "Votes", "Rating & Votes"], horizontal=True) |
|
|
|
|
|
if sort_option == "Rating": |
|
|
sorted_df = top_df.sort_values(by="rating", ascending=False) |
|
|
elif sort_option == "Votes": |
|
|
sorted_df = top_df.sort_values(by="votes", ascending=False) |
|
|
else: |
|
|
top_df["score"] = top_df["rating"] * np.log(top_df["votes"] + 1) |
|
|
sorted_df = top_df.sort_values(by="score", ascending=False) |
|
|
|
|
|
top_movies = sorted_df.drop_duplicates(subset=name_col).head(10) |
|
|
|
|
|
|
|
|
for i, (_, row) in enumerate(top_movies.iterrows(), 1): |
|
|
col1, col2 = st.columns([3, 1]) |
|
|
with col1: |
|
|
st.markdown(f"**#{i} {row[name_col]}**") |
|
|
st.markdown(f"Genre: {row['genre']} | Duration: {row.get('duration', 'N/A')} min") |
|
|
with col2: |
|
|
st.markdown(f"β **{row['rating']:.1f}/10**") |
|
|
st.markdown(f"π₯ {row['votes']:,} votes") |
|
|
st.divider() |
|
|
|
|
|
|
|
|
with st.expander("π View as Table"): |
|
|
display_columns = [name_col, "genre","duration","rating", "votes"] |
|
|
st.dataframe(top_movies[display_columns]) |
|
|
|
|
|
|
|
|
elif selected_tab == "Movie Analysis": |
|
|
st.header("π Movie Analysis") |
|
|
|
|
|
|
|
|
analysis_tab1, analysis_tab2, analysis_tab3 = st.tabs(["Genre Analysis", "Ratings & Votes", "Duration Analysis"]) |
|
|
|
|
|
with analysis_tab1: |
|
|
st.subheader("Genre Distribution") |
|
|
genre_counts = df["genre"].value_counts().reset_index() |
|
|
genre_counts.columns = ["Genre", "Count"] |
|
|
|
|
|
f1, ax1 = plt.subplots(figsize=(10, 6)) |
|
|
sns.barplot(data=genre_counts, x="Genre", y="Count", palette="viridis", ax=ax1) |
|
|
ax1.set_title("Number of Movies per Genre") |
|
|
ax1.set_xlabel("Genre") |
|
|
ax1.set_ylabel("Number of Movies") |
|
|
ax1.tick_params(axis='x', rotation=45) |
|
|
st.pyplot(f1) |
|
|
|
|
|
st.subheader("Most Popular Genres by Voting") |
|
|
total_votes_per_genre = df.groupby("genre")["votes"].sum().sort_values(ascending=False) |
|
|
f5, ax5 = plt.subplots(figsize=(8, 8)) |
|
|
ax5.pie(total_votes_per_genre, labels=total_votes_per_genre.index, autopct="%1.1f%%", startangle=140, colors=sns.color_palette("pastel")) |
|
|
ax5.set_title("Most Popular Genres by Total Voting Counts") |
|
|
ax5.axis("equal") |
|
|
st.pyplot(f5) |
|
|
|
|
|
with analysis_tab2: |
|
|
|
|
|
st.subheader("Voting Trends by Genre") |
|
|
avg_votes = df.groupby("genre")["votes"].mean().sort_values(ascending=True).reset_index() |
|
|
|
|
|
f3, ax3 = plt.subplots(figsize=(10, 6)) |
|
|
sns.barplot(data=avg_votes, x="votes", y="genre", palette="cubehelix", ax=ax3) |
|
|
ax3.set_title("Average Voting Count per Genre") |
|
|
ax3.set_xlabel("Average Votes") |
|
|
ax3.set_ylabel("Genre") |
|
|
st.pyplot(f3) |
|
|
|
|
|
|
|
|
st.subheader("Rating Distribution") |
|
|
f4, ax4 = plt.subplots(figsize=(10, 6)) |
|
|
sns.boxplot(data=df, x="rating", color="lightcoral", ax=ax4) |
|
|
ax4.set_title("Movie Ratings in Box plot") |
|
|
ax4.set_xlabel("Rating") |
|
|
st.pyplot(f4) |
|
|
|
|
|
|
|
|
st.subheader("Ratings by Genre") |
|
|
avg_rating_genre = df.groupby("genre")["rating"].mean().reset_index() |
|
|
avg_rating_genre_pivot = avg_rating_genre.pivot_table(index="genre", values="rating") |
|
|
|
|
|
f7, ax6 = plt.subplots(figsize=(8, len(avg_rating_genre_pivot) * 0.5 + 2)) |
|
|
sns.heatmap(avg_rating_genre_pivot, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax6) |
|
|
ax6.set_title("Average Rating by Genre") |
|
|
ax6.set_ylabel("Genre") |
|
|
st.pyplot(f7) |
|
|
|
|
|
|
|
|
st.subheader("Correlation Analysis") |
|
|
f8, ax7 = plt.subplots(figsize=(10, 6)) |
|
|
sns.scatterplot(data=df, x="votes", y="rating", hue="genre", alpha=0.7, palette="husl", ax=ax7) |
|
|
ax7.set_title("Relationship Between Votes and Ratings") |
|
|
ax7.set_xlabel("Votes") |
|
|
ax7.set_ylabel("Rating") |
|
|
ax7.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title="Genre") |
|
|
st.pyplot(f8) |
|
|
|
|
|
with analysis_tab3: |
|
|
|
|
|
st.subheader("Average Duration by Genre") |
|
|
avg_duration = df.groupby("genre")["duration_minutes"].mean().sort_values(ascending=True).reset_index() |
|
|
|
|
|
f2, ax2 = plt.subplots(figsize=(10, 6)) |
|
|
sns.barplot(data=avg_duration, x="duration_minutes", y="genre", palette="mako", ax=ax2) |
|
|
ax2.set_title("Average Movie Duration per Genre") |
|
|
ax2.set_xlabel("Average Duration (In Minutes)") |
|
|
ax2.set_ylabel("Genre") |
|
|
st.pyplot(f2) |
|
|
|
|
|
|
|
|
st.subheader("Movie Duration Distribution") |
|
|
f6, ax8 = plt.subplots(figsize=(10, 6)) |
|
|
sns.boxplot(data=df, x="duration_minutes", color="skyblue", ax=ax8) |
|
|
ax8.set_title("Movie Durations in Box plot") |
|
|
ax8.set_xlabel("Duration (In Minutes)") |
|
|
st.pyplot(f6) |
|
|
|
|
|
|
|
|
st.subheader("Genre-Based Rating Leaders") |
|
|
title_col = None |
|
|
for col in ["title", "name", "movie_name"]: |
|
|
if col in df.columns: |
|
|
title_col = col |
|
|
break |
|
|
|
|
|
if title_col: |
|
|
top_rated_per_genre = df.sort_values(by="rating", ascending=False).drop_duplicates(subset=["genre"]) |
|
|
top_rated_per_genre = top_rated_per_genre[["genre", title_col, "rating", "votes"]].sort_values(by="genre") |
|
|
top_rated_per_genre.columns = ["Genre", "Top Movie", "Rating", "Votes"] |
|
|
st.dataframe(top_rated_per_genre, use_container_width=True) |
|
|
else: |
|
|
st.warning("No title column found") |
|
|
|
|
|
|
|
|
st.subheader("Duration Extremes") |
|
|
if title_col: |
|
|
valid_durations = df[df["duration_minutes"] > 0] |
|
|
|
|
|
if not valid_durations.empty: |
|
|
shortest = valid_durations.loc[valid_durations["duration_minutes"].idxmin()] |
|
|
longest = df.loc[df["duration_minutes"].idxmax()] |
|
|
|
|
|
def minutes_to_text(minutes): |
|
|
h = minutes // 60 |
|
|
m = minutes % 60 |
|
|
return f"{int(h)}h {int(m)}m" |
|
|
|
|
|
extremes_df = pd.DataFrame([ |
|
|
{ |
|
|
"Type": "Shortest", |
|
|
"Title": shortest[title_col], |
|
|
"Genre": shortest["genre"], |
|
|
"Duration": minutes_to_text(shortest["duration_minutes"]) |
|
|
}, |
|
|
{ |
|
|
"Type": "Longest", |
|
|
"Title": longest[title_col], |
|
|
"Genre": longest["genre"], |
|
|
"Duration": minutes_to_text(longest["duration_minutes"]) |
|
|
} |
|
|
]) |
|
|
|
|
|
st.table(extremes_df) |
|
|
else: |
|
|
st.warning("No movie durations > 0") |
|
|
else: |
|
|
st.warning("Movie titles not found") |
|
|
|
|
|
|
|
|
elif selected_tab == "All Movies Data": |
|
|
st.header("ποΈ All Movies Data") |
|
|
|
|
|
title_col = None |
|
|
for col in ["movie_name", "duration", "rating", "votes"]: |
|
|
if col in df.columns: |
|
|
title_col = col |
|
|
break |
|
|
|
|
|
if not title_col: |
|
|
st.error("No movie titles column found") |
|
|
else: |
|
|
display_cols = [title_col, "duration", "rating", "votes"] |
|
|
|
|
|
selected_all_genre = st.selectbox( |
|
|
"Select Genre to View All Movies", |
|
|
["All Genres"] + list(df["genre"].unique()) |
|
|
) |
|
|
|
|
|
if selected_all_genre == "All Genres": |
|
|
total_count = len(df) |
|
|
st.markdown(f"**Total Movies:** {total_count}") |
|
|
st.dataframe(df[display_cols]) |
|
|
else: |
|
|
filtered_df = df[df["genre"] == selected_all_genre] |
|
|
total_count = len(filtered_df) |
|
|
st.markdown(f"**Total Movies in {selected_all_genre}:** {total_count}") |
|
|
st.dataframe(filtered_df[display_cols]) |
|
|
|
|
|
|
|
|
elif selected_tab == "Data Analytics": |
|
|
st.header("π Data Analytics") |
|
|
|
|
|
st.sidebar.header("Custom Filters") |
|
|
genre_filter_mode = st.sidebar.radio("Genre Filter Mode", ["All Genres", "Custom Selection"]) |
|
|
|
|
|
if genre_filter_mode == "All Genres": |
|
|
selected_genre = df["genre"].unique().tolist() |
|
|
else: |
|
|
selected_genre = st.sidebar.multiselect("Select Genre(s)", df["genre"].unique().tolist(), default=df["genre"].unique().tolist()[:3]) |
|
|
if not selected_genre: |
|
|
st.sidebar.warning("Please select at least one genre to apply filters") |
|
|
|
|
|
duration_filter = st.sidebar.slider("Select Movie Duration (Minutes)", 0, 300, (90, 180)) |
|
|
rating_filter = st.sidebar.slider("Select Minimum Rating", 0.0, 10.0, 7.0) |
|
|
votes_filter = st.sidebar.slider("Select Minimum Votes", 0, 500000, 10000) |
|
|
|
|
|
|
|
|
filtered_df = df[ |
|
|
(df["duration_minutes"].between(duration_filter[0], duration_filter[1])) & |
|
|
(df["rating"] >= rating_filter) & |
|
|
(df["votes"] >= votes_filter) & |
|
|
(df["genre"].isin(selected_genre)) |
|
|
] |
|
|
|
|
|
|
|
|
st.subheader("Filtered Movies") |
|
|
st.write(f"Showing {len(filtered_df)} movies matching your filters") |
|
|
|
|
|
if not filtered_df.empty: |
|
|
col1, col2, col3 = st.columns(3) |
|
|
with col1: |
|
|
st.metric("Avg Rating", f"{filtered_df['rating'].mean():.2f}") |
|
|
with col2: |
|
|
st.metric("Avg Duration", f"{filtered_df['duration_minutes'].mean():.1f} min") |
|
|
with col3: |
|
|
st.metric("Avg Votes", f"{filtered_df['votes'].mean():,.0f}") |
|
|
|
|
|
st.dataframe(filtered_df) |
|
|
else: |
|
|
st.warning("No movies match the selected filters. Try adjusting your criteria.") |
|
|
|
|
|
|
|
|
st.markdown("---") |
|
|
st.markdown(""" |
|
|
<div style='text-align: center'> |
|
|
<p>π¬ IMDb Movie Analytics Dashboard | Data from TiDB Cloud | Built with Streamlit</p> |
|
|
</div> |
|
|
""", unsafe_allow_html=True) |