Spaces:

prasanthr0416
/

IMDB_2024_Data_Analytics

Sleeping

App Files Files Community

IMDB_2024_Data_Analytics / app.py

prasanthr0416

Create app.py

e67e2ef verified about 1 month ago

raw

history blame contribute delete

14.4 kB

	import streamlit as st
	import pandas as pd
	from sqlalchemy import create_engine
	import seaborn as sns
	import matplotlib.pyplot as plt
	import numpy as np
	import os

	# Database Connection with secrets for Hugging Face
	# Using environment variables for credentials
	host = "gateway01.eu-central-1.prod.aws.tidbcloud.com"
	port = 4000
	database = "grab"

	# Load credentials from secrets
	user = st.secrets["TIDB_USER"]
	password = st.secrets["TIDB_PASSWORD"]

	# For Hugging Face web deployment, remove SSL certificate path
	# Use SSL mode instead of certificate file
	engine = create_engine(
	f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}",
	connect_args={"ssl": {"ssl_mode": "REQUIRED"}}
	)

	#load_data
	@st.cache_data(ttl=3600) # Cache for 1 hour
	def load_data():
	query = "SELECT * FROM movies"
	try:
	return pd.read_sql(query, engine)
	except Exception as e:
	st.error(f"Database connection error: {e}")
	# Return sample data if connection fails
	return pd.DataFrame({
	'title': ['Sample Movie 1', 'Sample Movie 2'],
	'genre': ['Action', 'Drama'],
	'rating': [8.5, 7.9],
	'votes': [100000, 80000],
	'duration_minutes': [120, 95]
	})

	df = load_data()

	# Check if data loaded successfully
	if df.empty:
	st.error("No data loaded from database. Check connection settings.")
	st.stop()

	#front_page
	st.title("IMDb Movie Analytics Dashboard")

	# Initialize session state
	if 'dashboard' not in st.session_state:
	st.session_state['dashboard'] = False

	if not st.session_state['dashboard']:
	if st.button("Go to Dashboard", type="primary"):
	st.session_state['dashboard'] = True
	st.rerun()

	# Show some basic stats on front page
	st.markdown("---")
	col1, col2, col3 = st.columns(3)
	with col1:
	st.metric("Total Movies", len(df))
	with col2:
	st.metric("Unique Genres", df["genre"].nunique())
	with col3:
	st.metric("Avg Rating", f"{df['rating'].mean():.2f}")

	st.markdown("### Quick Preview")
	st.dataframe(df.head(10))

	st.stop()

	if st.session_state['dashboard']:
	# Add a back button
	if st.button("← Back to Home"):
	st.session_state['dashboard'] = False
	st.rerun()

	#menu
	st.sidebar.title("🎬 Navigation")
	selected_tab = st.sidebar.radio(
	"Select Section",
	["Top 10 Movies", "Movie Analysis", "All Movies Data", "Data Analytics"]
	)

	#tab1_top10_movies
	if selected_tab == "Top 10 Movies":
	st.header("🏆 Top 10 Movies")

	genre_list = list(df["genre"].unique())
	genre_select_mode = st.radio("Genre Filter", ["All Genres", "Custom Selection"], horizontal=True)

	if genre_select_mode == "All Genres":
	selected_top_genre = genre_list
	st.info("Showing Top 10 Movies In All Genres")
	else:
	selected_top_genre = st.multiselect("Select Genres for Top 10 Movies", genre_list, default=genre_list[:3])
	if not selected_top_genre:
	st.warning("Select at least one genre to show")
	st.stop()

	top_df = df[df["genre"].isin(selected_top_genre)].copy()

	name_col = None
	for col in ["title", "name", "movie_name"]:
	if col in top_df.columns:
	name_col = col
	break
	if not name_col:
	st.error("No valid movie name column found")
	st.stop()

	sort_option = st.radio("Sort Top 10 By", ["Rating", "Votes", "Rating & Votes"], horizontal=True)

	if sort_option == "Rating":
	sorted_df = top_df.sort_values(by="rating", ascending=False)
	elif sort_option == "Votes":
	sorted_df = top_df.sort_values(by="votes", ascending=False)
	else:
	top_df["score"] = top_df["rating"] * np.log(top_df["votes"] + 1)
	sorted_df = top_df.sort_values(by="score", ascending=False)

	top_movies = sorted_df.drop_duplicates(subset=name_col).head(10)

	# Display results
	for i, (_, row) in enumerate(top_movies.iterrows(), 1):
	col1, col2 = st.columns([3, 1])
	with col1:
	st.markdown(f"#{i} {row[name_col]}")
	st.markdown(f"Genre: {row['genre']} \| Duration: {row.get('duration', 'N/A')} min")
	with col2:
	st.markdown(f"⭐ {row['rating']:.1f}/10")
	st.markdown(f"👥 {row['votes']:,} votes")
	st.divider()

	# Also show as dataframe
	with st.expander("📋 View as Table"):
	display_columns = [name_col, "genre","duration","rating", "votes"]
	st.dataframe(top_movies[display_columns])

	#tab2_Movie Analysis
	elif selected_tab == "Movie Analysis":
	st.header("📊 Movie Analysis")

	# Create tabs for different analyses
	analysis_tab1, analysis_tab2, analysis_tab3 = st.tabs(["Genre Analysis", "Ratings & Votes", "Duration Analysis"])

	with analysis_tab1:
	st.subheader("Genre Distribution")
	genre_counts = df["genre"].value_counts().reset_index()
	genre_counts.columns = ["Genre", "Count"]

	f1, ax1 = plt.subplots(figsize=(10, 6))
	sns.barplot(data=genre_counts, x="Genre", y="Count", palette="viridis", ax=ax1)
	ax1.set_title("Number of Movies per Genre")
	ax1.set_xlabel("Genre")
	ax1.set_ylabel("Number of Movies")
	ax1.tick_params(axis='x', rotation=45)
	st.pyplot(f1)

	st.subheader("Most Popular Genres by Voting")
	total_votes_per_genre = df.groupby("genre")["votes"].sum().sort_values(ascending=False)
	f5, ax5 = plt.subplots(figsize=(8, 8))
	ax5.pie(total_votes_per_genre, labels=total_votes_per_genre.index, autopct="%1.1f%%", startangle=140, colors=sns.color_palette("pastel"))
	ax5.set_title("Most Popular Genres by Total Voting Counts")
	ax5.axis("equal")
	st.pyplot(f5)

	with analysis_tab2:
	#vote_trends
	st.subheader("Voting Trends by Genre")
	avg_votes = df.groupby("genre")["votes"].mean().sort_values(ascending=True).reset_index()

	f3, ax3 = plt.subplots(figsize=(10, 6))
	sns.barplot(data=avg_votes, x="votes", y="genre", palette="cubehelix", ax=ax3)
	ax3.set_title("Average Voting Count per Genre")
	ax3.set_xlabel("Average Votes")
	ax3.set_ylabel("Genre")
	st.pyplot(f3)

	#rating_distribution
	st.subheader("Rating Distribution")
	f4, ax4 = plt.subplots(figsize=(10, 6))
	sns.boxplot(data=df, x="rating", color="lightcoral", ax=ax4)
	ax4.set_title("Movie Ratings in Box plot")
	ax4.set_xlabel("Rating")
	st.pyplot(f4)

	#heatmap
	st.subheader("Ratings by Genre")
	avg_rating_genre = df.groupby("genre")["rating"].mean().reset_index()
	avg_rating_genre_pivot = avg_rating_genre.pivot_table(index="genre", values="rating")

	f7, ax6 = plt.subplots(figsize=(8, len(avg_rating_genre_pivot) * 0.5 + 2))
	sns.heatmap(avg_rating_genre_pivot, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax6)
	ax6.set_title("Average Rating by Genre")
	ax6.set_ylabel("Genre")
	st.pyplot(f7)

	#correlation
	st.subheader("Correlation Analysis")
	f8, ax7 = plt.subplots(figsize=(10, 6))
	sns.scatterplot(data=df, x="votes", y="rating", hue="genre", alpha=0.7, palette="husl", ax=ax7)
	ax7.set_title("Relationship Between Votes and Ratings")
	ax7.set_xlabel("Votes")
	ax7.set_ylabel("Rating")
	ax7.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title="Genre")
	st.pyplot(f8)

	with analysis_tab3:
	#movie_duration
	st.subheader("Average Duration by Genre")
	avg_duration = df.groupby("genre")["duration_minutes"].mean().sort_values(ascending=True).reset_index()

	f2, ax2 = plt.subplots(figsize=(10, 6))
	sns.barplot(data=avg_duration, x="duration_minutes", y="genre", palette="mako", ax=ax2)
	ax2.set_title("Average Movie Duration per Genre")
	ax2.set_xlabel("Average Duration (In Minutes)")
	ax2.set_ylabel("Genre")
	st.pyplot(f2)

	#duration_distribution
	st.subheader("Movie Duration Distribution")
	f6, ax8 = plt.subplots(figsize=(10, 6))
	sns.boxplot(data=df, x="duration_minutes", color="skyblue", ax=ax8)
	ax8.set_title("Movie Durations in Box plot")
	ax8.set_xlabel("Duration (In Minutes)")
	st.pyplot(f6)

	#rating_leaders
	st.subheader("Genre-Based Rating Leaders")
	title_col = None
	for col in ["title", "name", "movie_name"]:
	if col in df.columns:
	title_col = col
	break

	if title_col:
	top_rated_per_genre = df.sort_values(by="rating", ascending=False).drop_duplicates(subset=["genre"])
	top_rated_per_genre = top_rated_per_genre[["genre", title_col, "rating", "votes"]].sort_values(by="genre")
	top_rated_per_genre.columns = ["Genre", "Top Movie", "Rating", "Votes"]
	st.dataframe(top_rated_per_genre, use_container_width=True)
	else:
	st.warning("No title column found")

	#duration_extremes
	st.subheader("Duration Extremes")
	if title_col:
	valid_durations = df[df["duration_minutes"] > 0]

	if not valid_durations.empty:
	shortest = valid_durations.loc[valid_durations["duration_minutes"].idxmin()]
	longest = df.loc[df["duration_minutes"].idxmax()]

	def minutes_to_text(minutes):
	h = minutes // 60
	m = minutes % 60
	return f"{int(h)}h {int(m)}m"

	extremes_df = pd.DataFrame([
	{
	"Type": "Shortest",
	"Title": shortest[title_col],
	"Genre": shortest["genre"],
	"Duration": minutes_to_text(shortest["duration_minutes"])
	},
	{
	"Type": "Longest",
	"Title": longest[title_col],
	"Genre": longest["genre"],
	"Duration": minutes_to_text(longest["duration_minutes"])
	}
	])

	st.table(extremes_df)
	else:
	st.warning("No movie durations > 0")
	else:
	st.warning("Movie titles not found")

	#tab3_allmovie_data
	elif selected_tab == "All Movies Data":
	st.header("🎞️ All Movies Data")

	title_col = None
	for col in ["movie_name", "duration", "rating", "votes"]:
	if col in df.columns:
	title_col = col
	break

	if not title_col:
	st.error("No movie titles column found")
	else:
	display_cols = [title_col, "duration", "rating", "votes"]

	selected_all_genre = st.selectbox(
	"Select Genre to View All Movies",
	["All Genres"] + list(df["genre"].unique())
	)

	if selected_all_genre == "All Genres":
	total_count = len(df)
	st.markdown(f"Total Movies: {total_count}")
	st.dataframe(df[display_cols])
	else:
	filtered_df = df[df["genre"] == selected_all_genre]
	total_count = len(filtered_df)
	st.markdown(f"Total Movies in {selected_all_genre}: {total_count}")
	st.dataframe(filtered_df[display_cols])

	#tab4_data_analytics
	elif selected_tab == "Data Analytics":
	st.header("🔍 Data Analytics")

	st.sidebar.header("Custom Filters")
	genre_filter_mode = st.sidebar.radio("Genre Filter Mode", ["All Genres", "Custom Selection"])

	if genre_filter_mode == "All Genres":
	selected_genre = df["genre"].unique().tolist()
	else:
	selected_genre = st.sidebar.multiselect("Select Genre(s)", df["genre"].unique().tolist(), default=df["genre"].unique().tolist()[:3])
	if not selected_genre:
	st.sidebar.warning("Please select at least one genre to apply filters")

	duration_filter = st.sidebar.slider("Select Movie Duration (Minutes)", 0, 300, (90, 180))
	rating_filter = st.sidebar.slider("Select Minimum Rating", 0.0, 10.0, 7.0)
	votes_filter = st.sidebar.slider("Select Minimum Votes", 0, 500000, 10000)

	#filters
	filtered_df = df[
	(df["duration_minutes"].between(duration_filter[0], duration_filter[1])) &
	(df["rating"] >= rating_filter) &
	(df["votes"] >= votes_filter) &
	(df["genre"].isin(selected_genre))
	]

	#filter_view
	st.subheader("Filtered Movies")
	st.write(f"Showing {len(filtered_df)} movies matching your filters")

	if not filtered_df.empty:
	col1, col2, col3 = st.columns(3)
	with col1:
	st.metric("Avg Rating", f"{filtered_df['rating'].mean():.2f}")
	with col2:
	st.metric("Avg Duration", f"{filtered_df['duration_minutes'].mean():.1f} min")
	with col3:
	st.metric("Avg Votes", f"{filtered_df['votes'].mean():,.0f}")

	st.dataframe(filtered_df)
	else:
	st.warning("No movies match the selected filters. Try adjusting your criteria.")

	# Footer
	st.markdown("---")
	st.markdown("""
	<div style='text-align: center'>
	<p>🎬 IMDb Movie Analytics Dashboard \| Data from TiDB Cloud \| Built with Streamlit</p>
	</div>
	""", unsafe_allow_html=True)