import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
# ─────────────────────────────────────────────────────────────────────────────
# PAGE CONFIG
# ─────────────────────────────────────────────────────────────────────────────
# Configure the Streamlit page (title, icon, wide layout, sidebar open on load).
st.set_page_config(
    page_title="Netflix Analytics 2.0",
    page_icon="🎬",
    layout="wide",
    initial_sidebar_state="expanded",
)
# ─────────────────────────────────────────────────────────────────────────────
# THEME CONSTANTS
# ─────────────────────────────────────────────────────────────────────────────
# Base surface colours (hex strings).
NF_RED = "#E50914"
NF_DARK = "#0a0a0a"
NF_CARD = "#181818"
NF_GRAY = "#222222"
NF_BORDER = "#2f2f2f"
# Accent colours used for chart series and KPI accents.
PURPLE = "#6C5CE7"
TEAL = "#00B4D8"
GOLD = "#F39C12"
GREEN = "#00B894"
PINK = "#E84393"
# Text colours.
TEXT_PRI = "#FFFFFF"
TEXT_MUT = "#9e9e9e"
TEXT_DIM = "#555555"
# Ordered palette; passed to Plotly as the layout colorway and as a discrete colour sequence.
CHART_COLORS = [NF_RED, PURPLE, TEAL, GOLD, GREEN, PINK, "#A29BFE", "#55EFC4", "#FD79A8"]
def hex_rgba(h, a=0.15):
    """Convert a hex colour string to a CSS ``rgba(r,g,b,a)`` string.

    Args:
        h: Hex colour with or without a leading ``#``. Six-digit values
            (``"#E50914"``) and three-digit shorthand (``"#fff"``) are
            accepted; any characters after the sixth digit are ignored.
        a: Alpha value inserted verbatim as the fourth component.

    Returns:
        A string of the form ``"rgba(229,9,20,0.15)"``.

    Raises:
        ValueError: If fewer than six hex digits remain after shorthand
            expansion, or if a digit is not valid hexadecimal.
    """
    digits = h.lstrip("#")
    if len(digits) == 3:
        # CSS shorthand: each digit is doubled ("0af" -> "00aaff").
        digits = "".join(ch * 2 for ch in digits)
    if len(digits) < 6:
        # Without this check a 5-digit value would silently parse a
        # one-character blue channel instead of failing.
        raise ValueError(f"invalid hex colour: {h!r}")
    r, g, b = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
    return f"rgba({r},{g},{b},{a})"
# Shared Plotly layout settings; apply_theme() passes these to fig.update_layout().
PLOTLY_BASE = dict(
    # Fully transparent paper/plot backgrounds.
    paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0,0,0,0)",
    font=dict(family="DM Sans, sans-serif", color=TEXT_PRI, size=12),
    xaxis=dict(gridcolor=NF_BORDER, linecolor=NF_BORDER, tickcolor=TEXT_MUT),
    yaxis=dict(gridcolor=NF_BORDER, linecolor=NF_BORDER, tickcolor=TEXT_MUT),
    colorway=CHART_COLORS,
    legend=dict(bgcolor="rgba(0,0,0,0)", font=dict(color=TEXT_PRI)),
    margin=dict(l=10, r=10, t=40, b=10),
    title=dict(font=dict(size=14, color=TEXT_PRI)),
)
# ─────────────────────────────────────────────────────────────────────────────
# CSS
# ─────────────────────────────────────────────────────────────────────────────
# NOTE(review): the string injected here contains only a newline, so no CSS is
# actually applied — presumably a <style> block was meant to live in this
# literal; confirm against the intended stylesheet.
st.markdown(f"""
""", unsafe_allow_html=True)
# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────
def apply_theme(fig, h=380):
    """Apply the shared ``PLOTLY_BASE`` layout to ``fig`` and set its height.

    Args:
        fig: Figure object exposing ``update_layout``.
        h: Height passed to the layout as ``height``.

    Returns:
        The same ``fig`` object, to allow call chaining.
    """
    layout_args = {**PLOTLY_BASE, "height": h}
    fig.update_layout(**layout_args)
    return fig
def sec(label):
    """Render a section heading.

    Args:
        label: Heading text, interpolated as-is into the markdown string.
    """
    # The original single-quoted f-string was split across three physical
    # lines, which is an unterminated literal; the same text is now expressed
    # with explicit "\n" escapes.
    # NOTE(review): any wrapper markup that once surrounded {label} is not
    # present in this file — confirm whether a styled container is expected.
    st.markdown(f"\n{label}\n", unsafe_allow_html=True)
def insight(text, style=""):
    """Render a short insight call-out.

    Args:
        text: Message to show. Newline characters are rendered as line
            breaks. The text is inserted as raw HTML (not escaped).
        style: Optional variant name appended to the container's CSS class
            (callers in this file pass "red", "teal", "gold", "green" or "").
    """
    # The previous body emitted an empty string, so neither `text` nor `style`
    # ever reached the page.
    # NOTE(review): the "insight" class name is a placeholder — the original
    # markup is not present in this file; align it with the stylesheet.
    body = str(text).replace("\n", "<br>")
    st.markdown(f'<div class="insight {style}">{body}</div>',
                unsafe_allow_html=True)
def kpi(icon, val, lbl, delta=None, dt="neu", accent=None):
    """Render one KPI card.

    Args:
        icon: Icon text (callers pass an emoji).
        val: Pre-formatted headline value.
        lbl: Caption shown under the value.
        delta: Optional secondary text; omitted when falsy.
        dt: Variant name for the delta element (callers pass "pos", "neg"
            or leave the default "neu").
        accent: Optional colour string used to build an ``--accent`` CSS
            custom property on the card.
    """
    # `accent` + "66" appends a two-digit alpha suffix to the colour string.
    style = f"--accent: linear-gradient(90deg,{accent},{accent}66);" if accent else ""
    # The original `dhtml` literal was split across two physical lines
    # (unterminated string), and neither `style` nor `dt` was ever emitted.
    # NOTE(review): the class names below are placeholders — the original
    # markup is not present in this file; align them with the stylesheet.
    dhtml = f'<div class="kpi-delta {dt}">{delta}</div>\n' if delta else ""
    st.markdown(f"""
<div class="kpi-card" style="{style}">
<div class="kpi-icon">{icon}</div>
<div class="kpi-val">{val}</div>
<div class="kpi-lbl">{lbl}</div>
{dhtml}</div>
""", unsafe_allow_html=True)
def score_pills(tmdb=None, imdb=None, rt=None, mc=None):
    """Render the available scores as one markdown string.

    Each argument is skipped when it is ``None`` or NaN. TMDB and IMDb
    scores are shown with one decimal; ``rt`` and ``mc`` are shown as given.
    """
    def _present(score):
        return score is not None and not pd.isna(score)

    parts = []
    if _present(tmdb):
        parts.append(f'⭐ {tmdb:.1f}')
    if _present(imdb):
        parts.append(f'🎬 IMDb {imdb:.1f}')
    if _present(rt):
        parts.append(f'🍅 {rt}%')
    if _present(mc):
        parts.append(f'🎯 MC {mc}')
    # Parts are concatenated with no separator, matching the existing output.
    st.markdown("".join(parts), unsafe_allow_html=True)
def poster_gallery(df, title_col="title", year_col="release_year", rating_col="vote_average",
                   poster_col="poster_url", badge_col=None, badge_label="", max_cards=20,
                   extra_col=None, extra_label=""):
    """Render a horizontal poster card gallery.

    Args:
        df: DataFrame with one row per title; only the first ``max_cards``
            rows are rendered.
        title_col: Column holding the title (truncated to 40 characters).
        year_col: Column holding the year shown under the title.
        rating_col: Column holding a numeric rating, shown with one decimal.
        poster_col: Column holding the poster image URL.
        badge_col: Optional column rendered as ``"{badge_label} #{value}"``.
        badge_label: Text prefix for the badge.
        max_cards: Maximum number of cards to render.
        extra_col: Optional column rendered as ``"{extra_label} {value}"``.
        extra_label: Text prefix for the extra line.
    """
    # Local import keeps the fix self-contained (file-level imports untouched).
    from html import escape

    placeholder = "https://via.placeholder.com/150x220/181818/555555?text=No+Image"
    cards = []
    for row in df.head(max_cards).to_dict("records"):
        poster = row.get(poster_col)
        # A missing URL can be None, NaN, "" or the literal string "None".
        # NaN is truthy, so the previous `or ""` check let it through.
        if not isinstance(poster, str) or not poster or poster == "None":
            poster = placeholder
        # Titles come from the dataset and are inserted into raw HTML.
        title = escape(str(row.get(title_col, ""))[:40])
        year = row.get(year_col, "")
        rating = row.get(rating_col, "")
        rating_str = f"⭐ {rating:.1f}" if isinstance(rating, (int, float)) and not pd.isna(rating) else ""
        badge_html = ""
        if badge_col and row.get(badge_col) is not None:
            bval = row[badge_col]
            if isinstance(bval, float):
                bval = f"{bval:.1f}"
            badge_html = f"\n{badge_label} #{bval}"
        extra_html = ""
        if extra_col and row.get(extra_col):
            extra_html = f"\n{extra_label} {row[extra_col]}"
        # NOTE(review): the original card markup is not present in this file
        # (the computed poster URL was never emitted). The <img> tag and the
        # class names below are a minimal reconstruction — align them with
        # the stylesheet.
        cards.append(f"""
<div class="poster-card">
<img class="poster-img" src="{escape(poster, quote=True)}" alt="{title}" width="150">
<div class="poster-title">{title}</div>
<div class="poster-meta">{year} {rating_str}</div>
{badge_html}{extra_html}
</div>
""")
    cards_html = '<div class="poster-row">' + "".join(cards) + "\n</div>"
    st.markdown(cards_html, unsafe_allow_html=True)
# ─────────────────────────────────────────────────────────────────────────────
# HELPER: safe column selector
# ─────────────────────────────────────────────────────────────────────────────
def safe_cols(df, cols):
    """Filter ``cols`` down to the names present in ``df.columns``.

    Args:
        df: Object exposing a ``columns`` container.
        cols: Iterable of candidate column names.

    Returns:
        A list of the names from ``cols`` that exist in ``df``, in the order
        they were given.
    """
    available = df.columns
    kept = []
    for name in cols:
        if name in available:
            kept.append(name)
    return kept
# ─────────────────────────────────────────────────────────────────────────────
# DATA LOADING
# ─────────────────────────────────────────────────────────────────────────────
# Base location of the parquet files (file names are appended to it).
BASE = "hf://datasets/ihhereanth/netflix_dataset/"


@st.cache_data(ttl=3600)
def load_data():
    """Load every dashboard table from ``BASE`` and normalise column types.

    Results are cached by ``st.cache_data`` with a ttl of 3600.

    Returns:
        An 11-tuple of DataFrames in this order: ``movies``, ``tv``,
        ``credits``, ``keywords``, ``content_perf``, ``genre_perf``,
        ``lang_summary``, ``top_talent``, ``yoy_trend``, ``franchises``,
        ``tv_seasons``. A table that could not be read is an empty
        DataFrame (a warning is shown for it).
    """

    def safe_load(filename):
        """Read one parquet file; return an empty DataFrame if both readers fail."""
        path = BASE + filename
        try:
            return pd.read_parquet(path)
        except Exception:
            # Fallback: read through pyarrow directly and convert to pandas.
            try:
                import pyarrow.parquet as pq
                return pq.read_table(path).to_pandas(strings_to_categorical=False)
            except Exception as e2:
                st.warning(f"⚠️ ไม่พบ {filename}: {e2}")
                return pd.DataFrame()

    def to_num(df, cols):
        """Coerce the listed columns (when present) to numeric; bad values become NaN."""
        for c in cols:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")
        return df

    def decade_label(years):
        """Map a year column to labels such as "1990s"; missing years stay missing."""
        decade = (years // 10 * 10).astype("Int64")
        # Without the mask a missing year would be rendered as the literal "<NA>s".
        return (decade.astype(str) + "s").where(decade.notna())

    movies = safe_load("movies.parquet")
    tv = safe_load("tv_shows.parquet")
    credits = safe_load("credits.parquet")
    keywords = safe_load("keywords.parquet")

    m_num = ["vote_count", "vote_average", "runtime_min", "budget_usd", "revenue_usd", "popularity",
             "release_year", "release_month", "roi", "imdb_rating", "imdb_votes", "rt_score",
             "metacritic_score", "audience_engagement_score", "profit_usd", "omdb_box_office",
             "best_weekly_rank", "critic_audience_gap", "release_decade"]
    tv_num = ["vote_count", "vote_average", "popularity", "number_of_seasons", "number_of_episodes",
              "first_air_year", "last_air_year", "imdb_rating", "imdb_votes", "rt_score",
              "metacritic_score", "audience_engagement_score", "votes_per_episode",
              "total_content_hours", "years_on_air", "best_weekly_rank", "first_air_decade"]
    movies = to_num(movies, m_num)
    tv = to_num(tv, tv_num)

    # Derived for old pipeline compatibility
    if "release_year" in movies.columns and "decade" not in movies.columns:
        movies["decade"] = decade_label(movies["release_year"])
    if "first_air_year" in tv.columns and "decade" not in tv.columns:
        tv["decade"] = decade_label(tv["first_air_year"])
    if "gender" in credits.columns:
        # Codes outside 0/1/2 (and missing values) are labelled "Unknown".
        credits["gender"] = credits["gender"].map({0: "Unknown", 1: "Female", 2: "Male"}).fillna("Unknown")

    # New analytics tables
    content_perf = safe_load("content_performance.parquet")
    genre_perf = safe_load("genre_performance.parquet")
    lang_summary = safe_load("language_summary.parquet")
    top_talent = safe_load("top_talent.parquet")
    yoy_trend = safe_load("yoy_trend.parquet")
    franchises = safe_load("franchises.parquet")
    tv_seasons = safe_load("tv_seasons.parquet")

    num_cols_cp = ["vote_average", "vote_count", "popularity", "imdb_rating", "imdb_votes",
                   "rt_score", "metacritic_score", "audience_engagement_score",
                   "budget_usd", "revenue_usd", "roi", "best_weekly_rank"]
    content_perf = to_num(content_perf, num_cols_cp)

    return (movies, tv, credits, keywords,
            content_perf, genre_perf, lang_summary, top_talent, yoy_trend, franchises, tv_seasons)
# Load all tables behind a spinner; on any failure show the error and halt the script.
with st.spinner("กำลังโหลดข้อมูล Netflix..."):
    try:
        loaded = load_data()
        (movies, tv, credits, keywords,
         content_perf, genre_perf, lang_summary,
         top_talent, yoy_trend, franchises, tv_seasons) = loaded
    except Exception as e:
        st.error(f"โหลดข้อมูลไม่สำเร็จ: {e}")
        st.stop()
# ─────────────────────────────────────────────────────────────────────────────
# SIDEBAR FILTERS
# ─────────────────────────────────────────────────────────────────────────────
# Sidebar: collects every filter value read later by apply_movie_filters /
# apply_tv_filters and by the tab sections.
with st.sidebar:
    st.markdown("""
NETFLIX
Analytics Dashboard 2.0
""", unsafe_allow_html=True)
    st.markdown("### 🎛️ Filters")

    # Media type
    media_type_sel = st.radio("📽️ Media Type", ["All", "Movies Only", "TV Only"], horizontal=True)

    # Genres from movies
    # List-valued cells may arrive as numpy arrays rather than Python lists
    # (presumably the case for parquet list columns — confirm); accepting
    # only `list` would leave the genre picker empty in that case.
    all_genres = sorted({
        str(g)
        for genres in movies["genres"].dropna()
        if isinstance(genres, (list, tuple, np.ndarray))
        for g in genres
    }) if "genres" in movies.columns else []
    selected_genres = st.multiselect("🎭 Genre", all_genres, default=[])

    # Year range
    # Require at least one non-missing year: int(NaN) raises ValueError.
    has_years = "release_year" in movies.columns and movies["release_year"].notna().any()
    y_min = int(movies["release_year"].min()) if has_years else 1990
    y_max = int(movies["release_year"].max()) if has_years else 2024
    # Clamp the default start (2010) into [y_min, y_max]; a default outside
    # the slider bounds is rejected by Streamlit.
    year_start = min(max(2010, y_min), y_max)
    year_range = st.slider("📅 Release Year (Movies)", y_min, y_max, (year_start, y_max))

    # Language
    all_langs = []
    if "original_language" in movies.columns:
        all_langs = sorted(movies["original_language"].dropna().unique().tolist())
    selected_langs = st.multiselect("🌍 Language", all_langs, default=[])

    # Rating range
    rating_min, rating_max = st.slider("⭐ Rating Range", 0.0, 10.0, (5.0, 10.0), step=0.5)

    # Min votes
    min_votes = st.slider("🗳️ Min Vote Count", 0, 5000, 100, step=50)

    st.markdown("---")
    st.markdown("### 🔬 Advanced Filters")
    trending_only = st.checkbox("🔥 Trending Top 50 Only")
    awards_only = st.checkbox("🏆 Awards Titles Only")
    has_poster = st.checkbox("🖼️ With Poster Image", value=False)
    if "budget_tier" in movies.columns:
        budget_tiers = ["All"] + sorted(movies["budget_tier"].dropna().unique().tolist())
        sel_budget = st.selectbox("💰 Budget Tier", budget_tiers)
    else:
        sel_budget = "All"

    st.markdown("---")
    st.markdown(f"""
TMDB → Airflow → PySpark → HuggingFace
Pipeline updated weekly
""", unsafe_allow_html=True)
# ─────────────────────────────────────────────────────────────────────────────
# APPLY FILTERS
# ─────────────────────────────────────────────────────────────────────────────
def apply_movie_filters(df):
    """Apply the sidebar filters to a movies DataFrame.

    Reads the module-level sidebar selections (``selected_genres``,
    ``year_range``, ``selected_langs``, ``rating_min``/``rating_max``,
    ``min_votes``, ``trending_only``, ``awards_only``, ``has_poster``,
    ``sel_budget``). A filter is skipped when its column is absent.

    Args:
        df: Movies DataFrame.

    Returns:
        The filtered DataFrame (``df`` itself when it is empty).
    """
    if df.empty:
        return df
    if selected_genres and "genres" in df.columns:
        wanted = set(selected_genres)  # built once, not once per row

        def has_wanted_genre(g):
            # Cells may be lists or numpy arrays; `g or []` raises on a
            # multi-element array because its truth value is ambiguous.
            if isinstance(g, (list, tuple, set, np.ndarray)):
                return not wanted.isdisjoint(g)
            return False

        df = df[df["genres"].apply(has_wanted_genre).astype(bool)]
    if "release_year" in df.columns:
        df = df[df["release_year"].between(*year_range)]
    if selected_langs and "original_language" in df.columns:
        df = df[df["original_language"].isin(selected_langs)]
    if "vote_average" in df.columns:
        df = df[df["vote_average"].between(rating_min, rating_max)]
    if "vote_count" in df.columns:
        df = df[df["vote_count"] >= min_votes]
    if trending_only and "is_trending_top50" in df.columns:
        df = df[df["is_trending_top50"] == True]
    if awards_only and "has_awards" in df.columns:
        df = df[df["has_awards"] == True]
    if has_poster and "poster_url" in df.columns:
        # Missing posters appear either as NA or as the literal string "None".
        df = df[df["poster_url"].notna() & (df["poster_url"] != "None")]
    if sel_budget != "All" and "budget_tier" in df.columns:
        df = df[df["budget_tier"] == sel_budget]
    return df
def apply_tv_filters(df):
    """Apply the sidebar filters that are relevant to TV shows.

    Uses the module-level selections for language, rating range, minimum
    vote count, trending-only, awards-only and poster-only. Each filter is
    skipped when its column is missing from ``df``.

    Args:
        df: TV shows DataFrame.

    Returns:
        The filtered DataFrame (``df`` itself when it is empty).
    """
    if df.empty:
        return df

    cols = df.columns
    if selected_langs and "original_language" in cols:
        df = df.loc[df["original_language"].isin(selected_langs)]
    if "vote_average" in cols:
        in_range = df["vote_average"].between(rating_min, rating_max)
        df = df.loc[in_range]
    if "vote_count" in cols:
        df = df.loc[df["vote_count"].ge(min_votes)]
    if trending_only and "is_trending_top50" in cols:
        df = df.loc[df["is_trending_top50"].eq(True)]
    if awards_only and "has_awards" in cols:
        df = df.loc[df["has_awards"].eq(True)]
    if has_poster and "poster_url" in cols:
        usable = df["poster_url"].notna() & df["poster_url"].ne("None")
        df = df.loc[usable]
    return df
# Filter copies so the loaded (cached) frames are left untouched.
movies_f = apply_movie_filters(movies.copy())
tv_f = apply_tv_filters(tv.copy())
# Visibility flags derived from the "Media Type" radio selection.
show_movies = media_type_sel != "TV Only"
show_tv = media_type_sel != "Movies Only"
# ─────────────────────────────────────────────────────────────────────────────
# HERO HEADER
# ─────────────────────────────────────────────────────────────────────────────
# Page header text followed by a horizontal rule.
st.markdown("""
NETFLIX ANALYTICS 2.0
Content Intelligence Dashboard · TMDB + OMDB + Trending Pipeline
""", unsafe_allow_html=True)
st.markdown("---")
# ─────────────────────────────────────────────────────────────────────────────
# MAIN TABS
# ─────────────────────────────────────────────────────────────────────────────
# One container per top-level tab; each is filled by a `with tab_...:` section below.
tab_overview, tab_explorer, tab_engage, tab_genre, tab_movies, tab_tv, tab_talent, tab_trends = st.tabs([
    "🏠 Overview",
    "🔍 Explorer",
    "📊 Engagement",
    "🎭 Genres",
    "🎬 Movies",
    "📺 TV Shows",
    "🌟 Talent",
    "📈 Trends",
])
# ══════════════════════════════════════════════════════════════════════════════
# TAB 1: OVERVIEW
# ══════════════════════════════════════════════════════════════════════════════
# Overview tab: KPI row, trending galleries, highlight call-outs and two summary charts.
with tab_overview:
    sec("📊 KPI OVERVIEW")
    # Headline aggregates over the filtered frames; each falls back to 0 when
    # its column is absent.
    avg_m = movies_f["vote_average"].mean() if "vote_average" in movies_f.columns and len(movies_f) > 0 else 0
    avg_tv = tv_f["vote_average"].mean() if "vote_average" in tv_f.columns and len(tv_f) > 0 else 0
    avg_eng = movies_f["audience_engagement_score"].mean() if "audience_engagement_score" in movies_f.columns and len(movies_f) > 0 else 0
    total_r = movies_f["revenue_usd"].sum() if "revenue_usd" in movies_f.columns else 0
    total_b = movies_f["budget_usd"].sum() if "budget_usd" in movies_f.columns else 0
    n_trend = int(movies_f["is_trending_top50"].sum()) if "is_trending_top50" in movies_f.columns else 0
    n_award = int(movies_f["has_awards"].sum()) if "has_awards" in movies_f.columns else 0
    n_oscar = int(movies_f["won_oscar"].sum()) if "won_oscar" in movies_f.columns else 0

    c1, c2, c3, c4, c5, c6, c7, c8 = st.columns(8)
    with c1: kpi("🎬", f"{len(movies_f):,}", "Movies", accent=NF_RED)
    with c2: kpi("📺", f"{len(tv_f):,}", "TV Shows", accent=PURPLE)
    with c3: kpi("⭐", f"{avg_m:.2f}", "Avg Movie Rating",
                 delta=f"TV: {avg_tv:.2f}",
                 dt="pos" if avg_m >= avg_tv else "neg", accent=GOLD)
    with c4: kpi("🔥", f"{avg_eng:.1f}", "Avg Engagement", accent=NF_RED)
    with c5: kpi("💰", f"${total_r/1e9:.1f}B", "Total Revenue",
                 delta=f"ROI {total_r/max(total_b,1):.1f}x" if total_b > 0 else None,
                 dt="pos", accent=GREEN)
    with c6: kpi("📈", f"{n_trend:,}", "Trending Movies", accent=TEAL)
    with c7: kpi("🏆", f"{n_award:,}", "Award Winners", accent=GOLD)
    with c8: kpi("🎭", f"{n_oscar:,}", "Oscar Winners", accent=GOLD)
    st.markdown("---")

    # Trending gallery with posters
    sec("🔥 TRENDING NOW — TOP PICKS WITH POSTERS")
    col_tl, col_tr = st.columns([3, 1], gap="large")
    with col_tl:
        if "best_weekly_rank" in movies_f.columns:
            trending_movies = (movies_f[movies_f["best_weekly_rank"].notna()]
                               .sort_values("best_weekly_rank")
                               .head(20))
            if not trending_movies.empty:
                st.markdown("**🎬 Trending Movies (Weekly Rank)**")
                poster_gallery(trending_movies, title_col="title", year_col="release_year",
                               rating_col="vote_average", poster_col="poster_url",
                               badge_col="best_weekly_rank", badge_label="Rank", max_cards=12)
            else:
                # No ranked titles survive the filters: fall back to top rated.
                st.markdown("**🎬 Top Rated Movies**")
                top_m = (movies_f[movies_f["vote_count"] >= 200]
                         .nlargest(12, "vote_average")
                         .reset_index(drop=True))
                poster_gallery(top_m, max_cards=12)
        else:
            st.markdown("**🎬 Top Rated Movies**")
            top_m = (movies_f.nlargest(12, "vote_average") if "vote_average" in movies_f.columns
                     else movies_f.head(12))
            poster_gallery(top_m, max_cards=12)

    with col_tr:
        sec("📌 HIGHLIGHTS")
        # The insight() messages below were double-quoted literals broken
        # across physical lines (unterminated strings); the line breaks are
        # now written as "\n".
        if not movies_f.empty and "vote_average" in movies_f.columns:
            # Only consider titles with at least 200 votes when vote counts exist.
            rated = movies_f[movies_f["vote_count"] >= 200] if "vote_count" in movies_f.columns else movies_f
            best = rated.nlargest(1, "vote_average")
            if not best.empty:
                best = best.iloc[0]
                insight(f"🥇 Best Rated: {best.get('title','')}\n⭐ {best.get('vote_average',0):.1f}/10", "red")
        if "audience_engagement_score" in movies_f.columns and not movies_f.empty:
            top_eng = movies_f.nlargest(1, "audience_engagement_score")
            if not top_eng.empty:
                te = top_eng.iloc[0]
                insight(f"🔥 Highest Engagement: {te.get('title','')}\nScore: {te.get('audience_engagement_score',0):.1f}", "teal")
        if "won_oscar" in movies_f.columns and "vote_average" in movies_f.columns:
            oscars = movies_f[movies_f["won_oscar"] == True]
            top_oscar = oscars.nlargest(1, "vote_average")
            if not top_oscar.empty:
                o = top_oscar.iloc[0]
                insight(f"🏆 Oscar Winner: {o.get('title','')}\n⭐ {o.get('vote_average',0):.1f}", "gold")
        if "imdb_votes" in movies_f.columns and not movies_f.empty:
            most_voted = movies_f.nlargest(1, "imdb_votes")
            if not most_voted.empty:
                mv = most_voted.iloc[0]
                votes = mv.get("imdb_votes", 0)
                if pd.notna(votes):  # avoid rendering "nanM votes"
                    insight(f"👥 Most Watched (IMDb Votes):\n{mv.get('title','')}\n{votes/1e6:.1f}M votes", "green")
    st.markdown("---")

    # Trending TV gallery
    if show_tv and "best_weekly_rank" in tv_f.columns:
        trending_tv = (tv_f[tv_f["best_weekly_rank"].notna()]
                       .sort_values("best_weekly_rank").head(12))
        if not trending_tv.empty:
            st.markdown("**📺 Trending TV Shows**")
            poster_gallery(trending_tv, title_col="name", year_col="first_air_year",
                           rating_col="vote_average", poster_col="poster_url",
                           badge_col="best_weekly_rank", badge_label="Rank", max_cards=12)
            # NOTE(review): the nesting of this separator is ambiguous in the
            # source; it is kept with the gallery so two rules never stack.
            st.markdown("---")

    # Overview charts
    col_ov1, col_ov2 = st.columns(2, gap="large")
    with col_ov1:
        sec("📅 CONTENT OVER TIME")
        # Require every column the chart reads, not only "year"; otherwise
        # fall through to the chart computed from movies_f.
        yoy_ready = (not yoy_trend.empty
                     and all(c in yoy_trend.columns for c in ("year", "media_type", "title_count")))
        if yoy_ready:
            yoy_m = yoy_trend[yoy_trend["media_type"] == "movie"].sort_values("year")
            yoy_tv = yoy_trend[yoy_trend["media_type"] == "tv"].sort_values("year")
            fig = go.Figure()
            if not yoy_m.empty:
                fig.add_trace(go.Scatter(x=yoy_m["year"], y=yoy_m["title_count"],
                                         name="Movies", mode="lines+markers", fill="tozeroy",
                                         line=dict(color=NF_RED, width=2),
                                         fillcolor=hex_rgba(NF_RED, 0.12)))
            if not yoy_tv.empty:
                fig.add_trace(go.Scatter(x=yoy_tv["year"], y=yoy_tv["title_count"],
                                         name="TV Shows", mode="lines+markers", fill="tozeroy",
                                         line=dict(color=PURPLE, width=2),
                                         fillcolor=hex_rgba(PURPLE, 0.12)))
            fig.update_layout(xaxis_title="Year", yaxis_title="Titles Added")
            apply_theme(fig)
            st.plotly_chart(fig, use_container_width=True)
        elif "release_year" in movies_f.columns:
            by_year = movies_f.groupby("release_year").size().reset_index(name="count")
            fig = px.area(by_year, x="release_year", y="count",
                          color_discrete_sequence=[NF_RED])
            apply_theme(fig)
            st.plotly_chart(fig, use_container_width=True)

    with col_ov2:
        sec("📊 RATING DISTRIBUTION")
        fig2 = go.Figure()
        if show_movies and "vote_average" in movies_f.columns and not movies_f.empty:
            fig2.add_trace(go.Histogram(x=movies_f["vote_average"].dropna(),
                                        name="Movies", nbinsx=25, marker_color=NF_RED,
                                        opacity=0.7, histnorm="percent"))
        if show_tv and "vote_average" in tv_f.columns and not tv_f.empty:
            fig2.add_trace(go.Histogram(x=tv_f["vote_average"].dropna(),
                                        name="TV Shows", nbinsx=25, marker_color=PURPLE,
                                        opacity=0.7, histnorm="percent"))
        fig2.update_layout(barmode="overlay", xaxis_title="Rating", yaxis_title="% of Titles")
        apply_theme(fig2)
        st.plotly_chart(fig2, use_container_width=True)
# ══════════════════════════════════════════════════════════════════════════════
# TAB 2: CONTENT EXPLORER
# ══════════════════════════════════════════════════════════════════════════════
# Explorer tab: search/sort controls plus three views (poster grid, table, detail card).
with tab_explorer:

    def _has_text(v):
        """True for a non-empty string other than the literal "None"."""
        return isinstance(v, str) and v != "" and v != "None"

    def _is_set(v):
        """True for a truthy scalar that is not None, NaN or pd.NA."""
        if v is None or v is pd.NA:
            return False
        if isinstance(v, float) and np.isnan(v):
            return False  # NaN is truthy, so a plain `if v:` would accept it
        return bool(v)

    sec("🔍 CONTENT EXPLORER")
    st.markdown("ค้นหาและสำรวจ Content พร้อมรูปปก และคะแนนจากทุกแหล่ง")
    col_ex1, col_ex2, col_ex3, col_ex4 = st.columns([2, 1, 1, 1])
    with col_ex1:
        search_q = st.text_input("🔎 ค้นหาชื่อ", placeholder="พิมพ์ชื่อหนัง / ซีรีส์...")
    with col_ex2:
        ex_type = st.selectbox("Type", ["Movies", "TV Shows"])
    with col_ex3:
        ex_sort = st.selectbox("Sort by", [
            "vote_average", "audience_engagement_score", "imdb_votes",
            "popularity", "release_year", "revenue_usd", "rt_score"
        ])
    with col_ex4:
        ex_limit = st.selectbox("Show", [24, 48, 96], index=0)

    # Select working df
    ex_df = movies_f.copy() if ex_type == "Movies" else tv_f.copy()
    name_col = "title" if ex_type == "Movies" else "name"
    year_col = "release_year" if ex_type == "Movies" else "first_air_year"
    if search_q and name_col in ex_df.columns:
        # regex=False: the query is user-typed text; treating it as a pattern
        # makes input such as "(" raise a regex error.
        ex_df = ex_df[ex_df[name_col].fillna("").str.contains(search_q, case=False, na=False, regex=False)]
    # Only sort by columns that exist
    if ex_sort in ex_df.columns:
        ex_df = ex_df.sort_values(ex_sort, ascending=False)
    st.caption(f"พบ {len(ex_df):,} รายการ")

    # Grid view
    view_mode = st.radio("📐 View", ["🖼️ Poster Grid", "📋 List View", "🗂️ Detail Card"], horizontal=True)
    if view_mode == "🖼️ Poster Grid":
        poster_gallery(ex_df.reset_index(drop=True), title_col=name_col, year_col=year_col,
                       rating_col="vote_average", poster_col="poster_url", max_cards=ex_limit)
    elif view_mode == "📋 List View":
        show_cols = safe_cols(ex_df, [name_col, year_col,
                                      "vote_average", "imdb_rating", "rt_score", "metacritic_score",
                                      "audience_engagement_score", "imdb_votes", "popularity",
                                      "original_language", "rating_bucket", "imdb_votes_tier"])
        st.dataframe(ex_df[show_cols].head(ex_limit).reset_index(drop=True),
                     use_container_width=True, height=500)
    else:  # Detail Card
        detail_sel = st.selectbox(
            "เลือก Title",
            ex_df[name_col].dropna().head(200).tolist() if name_col in ex_df.columns else [])
        if detail_sel and name_col in ex_df.columns:
            row = ex_df[ex_df[name_col] == detail_sel].iloc[0]
            col_dc1, col_dc2 = st.columns([1, 3], gap="large")
            with col_dc1:
                poster = row.get("poster_url", "")
                if _has_text(poster):
                    st.image(poster, width=220)
                else:
                    # Was an unterminated single-quoted literal.
                    st.markdown("No Image\n", unsafe_allow_html=True)
                backdrop = row.get("backdrop_url")
                if _has_text(backdrop):
                    with st.expander("🖼️ Backdrop"):
                        st.image(backdrop, use_container_width=True)
            with col_dc2:
                title_display = row.get(name_col, "")
                yr = row.get(year_col, "")
                lang = row.get("original_language", "")
                # A missing language is NaN (a float), which has no .upper().
                lang_label = lang.upper() if isinstance(lang, str) else ""
                st.markdown(f"{title_display}\n", unsafe_allow_html=True)
                st.markdown(f"📅 {yr} | 🌍 {lang_label}\n", unsafe_allow_html=True)
                st.markdown("**Multi-Source Scores:**")
                score_pills(
                    tmdb=row.get("vote_average"),
                    imdb=row.get("imdb_rating"),
                    rt=row.get("rt_score"),
                    mc=row.get("metacritic_score")
                )

                # Tags
                genres = row.get("genres")
                # Genre cells may be lists or numpy arrays; `x or []` raises
                # on a multi-element array.
                genre_list = list(genres)[:5] if isinstance(genres, (list, tuple, np.ndarray)) else []
                tags = [f"{g}" for g in genre_list]
                for status_field in ["status", "renewal_signal", "critic_audience_verdict"]:
                    if _is_set(row.get(status_field)):
                        tags.append(f"{row[status_field]}")
                if _is_set(row.get("has_awards")):
                    tags.append("🏆 Award Winner")
                if _is_set(row.get("won_oscar")):
                    tags.append("🎭 Oscar Winner")
                # Tags are concatenated with no separator, matching the existing output.
                tag_html = "".join(tags)
                if tag_html:
                    st.markdown(tag_html, unsafe_allow_html=True)

                # Overview
                overview = row.get("overview", "")
                if _has_text(overview):
                    st.markdown("---")
                    st.markdown(f"**📝 Overview**")
                    st.markdown(f"{overview}\n", unsafe_allow_html=True)

                # Stats
                st.markdown("---")
                col_s1, col_s2, col_s3, col_s4 = st.columns(4)
                with col_s1:
                    votes = row.get("vote_count", 0) or 0
                    st.metric("TMDB Votes", f"{int(votes):,}" if pd.notna(votes) else "N/A")
                with col_s2:
                    iv = row.get("imdb_votes", 0) or 0
                    st.metric("IMDb Votes", f"{int(iv):,}" if pd.notna(iv) else "N/A")
                with col_s3:
                    eng = row.get("audience_engagement_score")
                    st.metric("Engagement", f"{eng:.1f}" if pd.notna(eng) else "N/A")
                with col_s4:
                    pop = row.get("popularity")
                    st.metric("Popularity", f"{pop:.0f}" if pd.notna(pop) else "N/A")

                # Movie-specific
                if ex_type == "Movies":
                    col_m1, col_m2, col_m3 = st.columns(3)
                    with col_m1:
                        rev = row.get("revenue_usd", 0) or 0
                        st.metric("Revenue", f"${rev/1e6:.0f}M" if rev > 0 else "N/A")
                    with col_m2:
                        roi = row.get("roi")
                        st.metric("ROI", f"{roi:.1f}x" if pd.notna(roi) else "N/A")
                    with col_m3:
                        rt = row.get("runtime_min")
                        st.metric("Runtime", f"{int(rt)} min" if pd.notna(rt) else "N/A")
                else:
                    col_m1, col_m2, col_m3 = st.columns(3)
                    with col_m1:
                        s = row.get("number_of_seasons")
                        st.metric("Seasons", f"{int(s)}" if pd.notna(s) else "N/A")
                    with col_m2:
                        e = row.get("number_of_episodes")
                        st.metric("Episodes", f"{int(e)}" if pd.notna(e) else "N/A")
                    with col_m3:
                        h = row.get("total_content_hours")
                        st.metric("Total Hours", f"{h:.0f}h" if pd.notna(h) else "N/A")

                # Trailer
                yk = row.get("trailer_youtube_key")
                if _has_text(yk):
                    st.markdown("---")
                    st.markdown(f"**🎬 Trailer**")
                    st.video(f"https://www.youtube.com/watch?v={yk}")
# ══════════════════════════════════════════════════════════════════════════════
# TAB 3: ENGAGEMENT & VIEWERSHIP
# ══════════════════════════════════════════════════════════════════════════════
# Engagement tab: score distributions, votes-tier bars, engagement scatter and
# critic-vs-audience comparison.
with tab_engage:
    sec("📊 ENGAGEMENT & VIEWERSHIP ANALYTICS")
    col_e1, col_e2 = st.columns(2, gap="large")
    with col_e1:
        st.markdown("**Audience Engagement Score Distribution**")
        if "audience_engagement_score" in movies_f.columns and not movies_f.empty:
            fig_eng = go.Figure()
            if show_movies:
                fig_eng.add_trace(go.Histogram(
                    x=movies_f["audience_engagement_score"].dropna(),
                    name="Movies", nbinsx=30, marker_color=NF_RED, opacity=0.75, histnorm="percent"))
            if show_tv and "audience_engagement_score" in tv_f.columns:
                fig_eng.add_trace(go.Histogram(
                    x=tv_f["audience_engagement_score"].dropna(),
                    name="TV Shows", nbinsx=30, marker_color=PURPLE, opacity=0.75, histnorm="percent"))
            fig_eng.update_layout(barmode="overlay", xaxis_title="Engagement Score", yaxis_title="%")
            apply_theme(fig_eng, 340)
            st.plotly_chart(fig_eng, use_container_width=True)

    with col_e2:
        st.markdown("**IMDb Votes Tier (Viewership Proxy)**")
        if "imdb_votes_tier" in movies_f.columns and not movies_f.empty:
            tier_order = ["Mega (1M+)", "Hit (500K+)", "Popular (100K+)", "Moderate (10K+)", "Niche (<10K)"]
            # reindex keeps every tier in a fixed order, with 0 for absent tiers.
            tier_counts = (movies_f["imdb_votes_tier"].value_counts()
                           .reindex(tier_order, fill_value=0).reset_index())
            tier_counts.columns = ["tier", "count"]
            fig_tier = px.bar(tier_counts, x="count", y="tier", orientation="h",
                              color="count",
                              color_continuous_scale=["#1a0505", NF_RED],
                              text="count", labels={"count": "Movies", "tier": ""})
            fig_tier.update_traces(texttemplate="%{text:,}", textposition="outside")
            fig_tier.update_layout(yaxis={"categoryorder": "array", "categoryarray": tier_order[::-1]},
                                   coloraxis_showscale=False)
            apply_theme(fig_tier, 340)
            st.plotly_chart(fig_tier, use_container_width=True)
    st.markdown("---")

    # Scatter: Engagement vs IMDb Votes
    col_e3, col_e4 = st.columns([2, 1], gap="large")
    with col_e3:
        st.markdown("**Engagement Score vs IMDb Votes (Viewership)**")
        if all(c in movies_f.columns for c in ["audience_engagement_score", "imdb_votes", "title"]):
            # The x axis is logarithmic, so rows with non-positive votes
            # cannot be placed and are dropped here.
            sc_df = movies_f[
                movies_f["imdb_votes"].notna()
                & (movies_f["imdb_votes"] > 0)
                & movies_f["audience_engagement_score"].notna()
            ].copy()
            if not sc_df.empty:
                sc_df["size_norm"] = sc_df["vote_count"].fillna(100).clip(100, 50000) if "vote_count" in sc_df.columns else 100
                fig_sc = px.scatter(
                    sc_df, x="imdb_votes", y="audience_engagement_score",
                    color="vote_average" if "vote_average" in sc_df.columns else "imdb_votes",
                    hover_name="title",
                    size="size_norm", size_max=25,
                    log_x=True,
                    color_continuous_scale=["#6C1F1F", NF_RED, GOLD, GREEN],
                    labels={"imdb_votes": "IMDb Votes (log)", "audience_engagement_score": "Engagement Score"},
                )
                apply_theme(fig_sc, 400)
                st.plotly_chart(fig_sc, use_container_width=True)

    with col_e4:
        st.markdown("**📌 Engagement Insights**")
        # The two-line insight() messages below were literals broken across
        # physical lines (unterminated strings); the breaks are now "\n".
        if "audience_engagement_score" in movies_f.columns and not movies_f.empty:
            avg_eng = movies_f["audience_engagement_score"].mean()
            top10 = movies_f.nlargest(10, "audience_engagement_score")["audience_engagement_score"].mean()
            insight(f"Avg Engagement Score: {avg_eng:.1f}/10\nTop 10 avg: {top10:.1f}", "red")
        if "imdb_votes" in movies_f.columns and not movies_f.empty:
            mega = (movies_f["imdb_votes"] >= 1_000_000).sum()
            insight(f"Movies ระดับ Mega (1M+ IMDb Votes): {mega} เรื่อง\n= ฐานผู้ชมขนาดใหญ่มาก", "teal")
        if "critic_audience_verdict" in movies_f.columns and not movies_f.empty:
            verdict_vc = movies_f["critic_audience_verdict"].value_counts()
            for v, cnt in verdict_vc.items():
                insight(f"{v}: {cnt:,} เรื่อง")

    # Critic vs Audience Divergence
    st.markdown("---")
    sec("🎯 CRITIC VS AUDIENCE DIVERGENCE")
    col_div1, col_div2 = st.columns([3, 1], gap="large")
    with col_div1:
        if all(c in movies_f.columns for c in ["rt_score", "vote_average", "title"]):
            div_df = movies_f[
                movies_f["rt_score"].notna() & movies_f["vote_average"].notna()
            ].copy()
            # rt_score is divided by 10 to put it on the same scale as vote_average.
            div_df["critic_gap"] = (div_df["rt_score"] / 10) - div_df["vote_average"]
            fig_div = px.scatter(
                # Fixed seed: an unseeded sample would reshuffle the plotted
                # points on every script rerun.
                div_df.sample(min(500, len(div_df)), random_state=42),
                x="vote_average", y="rt_score",
                color="critic_audience_verdict" if "critic_audience_verdict" in div_df.columns else "vote_average",
                hover_name="title",
                color_discrete_map={"Critics Favorite": PURPLE, "Audience Favorite": NF_RED, "Consensus": TEAL},
                labels={"vote_average": "TMDB Rating", "rt_score": "Rotten Tomatoes (%)"},
            )
            # Diagonal reference line from (0, 0) to (10, 100).
            fig_div.add_shape(type="line", x0=0, y0=0, x1=10, y1=100,
                              line=dict(color=NF_BORDER, dash="dash", width=1))
            apply_theme(fig_div, 380)
            st.plotly_chart(fig_div, use_container_width=True)

    with col_div2:
        if "critic_audience_verdict" in movies_f.columns and not movies_f.empty:
            vc = movies_f["critic_audience_verdict"].value_counts()
            for v, cnt in vc.items():
                style = "teal" if "Critics" in str(v) else ("red" if "Audience" in str(v) else "")
                icon = "🎬" if "Critics" in str(v) else ("🍿" if "Audience" in str(v) else "✅")
                insight(f"{icon} {v}: {cnt:,} movies", style)
# ══════════════════════════════════════════════════════════════════════════════
# TAB 4: GENRE INTELLIGENCE
# ══════════════════════════════════════════════════════════════════════════════
with tab_genre:
sec("🎭 GENRE INTELLIGENCE")
if not genre_perf.empty:
col_gp1, col_gp2 = st.columns(2, gap="large")
gp_movies = genre_perf[genre_perf["media_type"]=="movie"].copy() if "media_type" in genre_perf.columns else genre_perf.copy()
gp_tv = genre_perf[genre_perf["media_type"]=="tv"].copy() if "media_type" in genre_perf.columns else pd.DataFrame()
with col_gp1:
st.markdown("**🎬 Movie Genres — Viewership (Total IMDb Votes)**")
if not gp_movies.empty and "total_imdb_votes" in gp_movies.columns:
top_gm = gp_movies.nlargest(15,"total_imdb_votes")
fig_gv = px.bar(top_gm, x="total_imdb_votes", y="genre", orientation="h",
color="avg_vote_average",
color_continuous_scale=["#3D0000",NF_RED,GOLD,GREEN],
text="total_imdb_votes",
labels={"total_imdb_votes":"Total IMDb Votes","genre":""})
fig_gv.update_traces(texttemplate="%{text:,.0f}", textposition="outside")
fig_gv.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=True,
coloraxis_colorbar=dict(title="Avg Rating", len=0.6))
apply_theme(fig_gv, 450)
st.plotly_chart(fig_gv, use_container_width=True)
with col_gp2:
st.markdown("**📺 TV Genres — Engagement Score**")
if not gp_tv.empty and "avg_engagement" in gp_tv.columns:
top_gt = gp_tv.nlargest(15,"avg_engagement")
fig_ge = px.bar(top_gt, x="avg_engagement", y="genre", orientation="h",
color="avg_engagement",
color_continuous_scale=["#1a0040",PURPLE,"#A29BFE"],
text="avg_engagement",
labels={"avg_engagement":"Avg Engagement Score","genre":""})
fig_ge.update_traces(texttemplate="%{text:.2f}", textposition="outside")
fig_ge.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
apply_theme(fig_ge, 450)
st.plotly_chart(fig_ge, use_container_width=True)
st.markdown("---")
# Bubble chart: Volume vs Rating vs Viewership
st.markdown("**Genre Bubble: Volume × Rating × Viewership**")
# Genre overview. The pre-aggregated genre table (gp_movies) is preferred;
# when it is empty or lacks a required column, both charts are derived
# directly from the filtered movies frame instead.
_bubble_needed = ["genre", "title_count", "avg_vote_average", "avg_imdb_votes"]
_can_bubble = not gp_movies.empty and all(
    name in gp_movies.columns for name in _bubble_needed
)
if _can_bubble:
    # Bubble size is driven by avg_imdb_votes, so rows without it are removed
    # before the first 20 rows are taken.
    bubble_rows = gp_movies.dropna(subset=["avg_imdb_votes"]).head(20)
    bubble_chart = px.scatter(
        bubble_rows,
        x="title_count",
        y="avg_vote_average",
        size="avg_imdb_votes",
        color="genre",
        hover_name="genre",
        text="genre",
        size_max=60,
        color_discrete_sequence=CHART_COLORS,
        labels={"title_count": "Number of Titles", "avg_vote_average": "Avg Rating"},
    )
    bubble_chart.update_traces(textposition="top center", textfont_size=10)
    apply_theme(bubble_chart, 420)
    st.plotly_chart(bubble_chart, use_container_width=True)
else:
    # Fallback to computed genres
    volume_slot, quality_slot = st.columns(2, gap="large")
    _movies_present = not movies_f.empty

    # Left: number of titles per genre (top 15).
    with volume_slot:
        if _movies_present and "genres" in movies_f.columns:
            titles_per_genre = (
                movies_f.explode("genres").groupby("genres")["title"].count()
            )
            genre_volume = titles_per_genre.reset_index()
            genre_volume = genre_volume.rename(
                columns={"title": "count", "genres": "genre"}
            )
            genre_volume = genre_volume.sort_values("count", ascending=False).head(15)
            volume_fig = px.bar(
                genre_volume,
                x="count",
                y="genre",
                orientation="h",
                color="count",
                color_continuous_scale=["#3D0000", NF_RED],
                text="count",
                title="Volume by Genre",
            )
            volume_fig.update_traces(texttemplate="%{text:,}", textposition="outside")
            volume_fig.update_layout(
                yaxis={"categoryorder": "total ascending"},
                coloraxis_showscale=False,
            )
            apply_theme(volume_fig, 420)
            st.plotly_chart(volume_fig, use_container_width=True)

    # Right: mean rating per genre, limited to genres with at least 10 rated
    # titles (top 15 by mean rating).
    with quality_slot:
        if (
            _movies_present
            and "genres" in movies_f.columns
            and "vote_average" in movies_f.columns
        ):
            rating_stats = (
                movies_f.explode("genres")
                .groupby("genres")["vote_average"]
                .agg(["mean", "count"])
                .reset_index()
                .rename(columns={"genres": "genre", "mean": "avg_rating"})
            )
            well_sampled = rating_stats[rating_stats["count"] >= 10]
            genre_quality = well_sampled.sort_values(
                "avg_rating", ascending=False
            ).head(15)
            quality_fig = px.bar(
                genre_quality,
                x="avg_rating",
                y="genre",
                orientation="h",
                color="avg_rating",
                color_continuous_scale=[NF_RED, GOLD, GREEN],
                text="avg_rating",
                title="Quality by Genre",
            )
            quality_fig.update_traces(texttemplate="%{text:.2f}", textposition="outside")
            quality_fig.update_layout(
                yaxis={"categoryorder": "total ascending"},
                coloraxis_showscale=False,
            )
            apply_theme(quality_fig, 420)
            st.plotly_chart(quality_fig, use_container_width=True)
# Genre × Decade Heatmap
# Counts titles per (decade, genre) and shows the 12 most frequent genres as a
# genre-by-decade heatmap.
st.markdown("---")
sec("🗓️ GENRE POPULARITY BY DECADE")
if "genres" in movies_f.columns and "release_year" in movies_f.columns and not movies_f.empty:
    # Titles without a release year cannot be placed in a decade. Left in,
    # their missing value is stringified to "<NA>" and shows up as a bogus
    # "<NA>s" column in the heatmap, so they are dropped up front.
    movies_f_dec = movies_f.dropna(subset=["release_year"]).copy()
    # 1994 -> 1990 -> "1990s"; the nullable Int64 cast strips the ".0" that a
    # float year column would otherwise leave in the label.
    movies_f_dec["decade_str"] = (
        (movies_f_dec["release_year"] // 10 * 10).astype("Int64").astype(str) + "s"
    )
    # One row per (title, genre), then a count per (decade, genre).
    hmap = (movies_f_dec.explode("genres")
            .groupby(["decade_str", "genres"]).size().reset_index(name="count"))
    if not hmap.empty:
        # Keep only the 12 genres with the most titles overall.
        top_genres_hmap = hmap.groupby("genres")["count"].sum().nlargest(12).index.tolist()
        hmap_top = hmap[hmap["genres"].isin(top_genres_hmap)]
        # Genre/decade combinations with no titles become 0 rather than NaN.
        pivot = hmap_top.pivot(index="genres", columns="decade_str", values="count").fillna(0)
        fig_hm = px.imshow(pivot, color_continuous_scale=["#0a0a0a", hex_rgba(NF_RED, 0.4), NF_RED],
                           labels={"color": "Titles"}, aspect="auto")
        fig_hm.update_layout(xaxis_title="Decade", yaxis_title="")
        apply_theme(fig_hm, 380)
        st.plotly_chart(fig_hm, use_container_width=True)
# ══════════════════════════════════════════════════════════════════════════════
# TAB 5: MOVIES ANALYSIS
# ══════════════════════════════════════════════════════════════════════════════
with tab_movies:
    # Movies deep-dive: four sub-tabs (top rated, box office, multi-score
    # comparison, franchises), all driven by the filtered movies frame except
    # the franchise tab, which reads the franchises table.
    sec("🎬 MOVIES DEEP-DIVE")
    tab_m1, tab_m2, tab_m3, tab_m4 = st.tabs(["🏆 Top Rated","💰 Box Office","🎭 Multi-Score","🏢 Franchises"])

    # ── Top rated: ranking chart (left) and poster strip (right) ────────────
    with tab_m1:
        col_m1, col_m2 = st.columns([3,1], gap="large")
        # Vote counts used for the minimum-votes filter in both columns. When
        # the frame has no vote_count column every title gets a placeholder
        # of 999 so that nothing is filtered out.
        vc_series = (movies_f["vote_count"] if "vote_count" in movies_f.columns
                     else pd.Series([999]*len(movies_f), index=movies_f.index))
        with col_m1:
            if "vote_average" in movies_f.columns and not movies_f.empty:
                # ── FIX 2: guard optional OMDB columns ───────────────────────
                # NOTE(review): safe_cols presumably returns the requested
                # columns that exist in the frame — confirm against its definition.
                _m1_want = ["title","vote_average","vote_count","release_year","imdb_rating","has_awards"]
                _m1_cols = safe_cols(movies_f, _m1_want)
                # 12 best-rated titles among those with at least 200 votes.
                top_r = (movies_f[vc_series >= 200]
                         .nlargest(12, "vote_average")
                         [_m1_cols]
                         .reset_index(drop=True))
                fig_tr = px.bar(top_r, x="vote_average", y="title", orientation="h",
                                color="vote_average",
                                color_continuous_scale=["#6C1F1F",NF_RED,"#FF8C8C"],
                                text="vote_average",
                                custom_data=safe_cols(top_r, ["vote_count","release_year","imdb_rating"]))
                # The hover text was a string literal broken across two source
                # lines (a syntax error); Plotly hover line breaks are written
                # as <br>.
                fig_tr.update_traces(
                    texttemplate="%{text:.2f}", textposition="outside",
                    hovertemplate="%{y}<br>Rating: %{x:.2f}")
                fig_tr.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                apply_theme(fig_tr, 450)
                st.plotly_chart(fig_tr, use_container_width=True)
        with col_m2:
            st.markdown("#### 🖼️ Top Picks")
            top_imgs = (movies_f[vc_series >= 200].nlargest(4,"vote_average")
                        if "vote_average" in movies_f.columns else movies_f.head(4))
            for _, row in top_imgs.iterrows():
                p = row.get("poster_url","")
                # A missing poster can arrive as NaN, which is truthy; only
                # real, non-placeholder strings are handed to st.image.
                if isinstance(p, str) and p and p != "None":
                    st.image(p, width=150, caption=str(row.get("title",""))[:30])

    # ── Box office: budget vs revenue scatter, top earners, budget tiers ────
    with tab_m2:
        if all(c in movies_f.columns for c in ["budget_usd","revenue_usd","title"]):
            # Only titles with both figures above 1M USD are plotted.
            sc_bo = movies_f[(movies_f["budget_usd"]>1e6)&(movies_f["revenue_usd"]>1e6)].copy()
            if not sc_bo.empty:
                col_b1, col_b2 = st.columns([3,1], gap="large")
                with col_b1:
                    # Colour by ROI when available, otherwise by rating; fall
                    # back to no colour dimension instead of naming a column
                    # that does not exist.
                    if "roi" in sc_bo.columns:
                        bo_color = "roi"
                    elif "vote_average" in sc_bo.columns:
                        bo_color = "vote_average"
                    else:
                        bo_color = None
                    fig_bo = px.scatter(
                        sc_bo, x="budget_usd", y="revenue_usd",
                        color=bo_color,
                        size="vote_count" if "vote_count" in sc_bo.columns else None,
                        hover_name="title",
                        color_continuous_scale=["#6C1F1F",NF_RED,GOLD,GREEN],
                        log_x=True, log_y=True,
                        labels={"budget_usd":"Budget (USD)","revenue_usd":"Revenue (USD)"},
                    )
                    # Dashed break-even diagonal (revenue == budget).
                    mx = max(sc_bo["budget_usd"].max(), sc_bo["revenue_usd"].max())
                    fig_bo.add_shape(type="line",x0=1e6,y0=1e6,x1=mx,y1=mx,
                                     line=dict(color="#444",dash="dash",width=1))
                    apply_theme(fig_bo, 450)
                    st.plotly_chart(fig_bo, use_container_width=True)
                with col_b2:
                    st.markdown("#### 💰 Box Office")
                    top_rev = sc_bo.nlargest(5,"revenue_usd")
                    for _, r in top_rev.iterrows():
                        p = r.get("poster_url","")
                        col_pi, col_ti = st.columns([1,2])
                        with col_pi:
                            if isinstance(p, str) and p and p != "None":
                                st.image(p, width=60)
                        with col_ti:
                            st.markdown(f"**{str(r.get('title',''))[:20]}**")
                            st.caption(f"${r.get('revenue_usd',0)/1e9:.1f}B")
        # Budget tier breakdown
        # NOTE(review): the source indentation was lost; this section is
        # placed at sub-tab level (independent of the scatter above) — confirm.
        if "budget_tier" in movies_f.columns:
            st.markdown("---")
            bt = movies_f["budget_tier"].value_counts().reset_index()
            bt.columns = ["tier","count"]
            col_bt1, col_bt2 = st.columns(2)
            with col_bt1:
                fig_bt = px.pie(bt, names="tier", values="count", hole=0.5,
                                color_discrete_sequence=CHART_COLORS, title="Movies by Budget Tier")
                fig_bt.update_traces(textinfo="percent+label")
                apply_theme(fig_bt, 300)
                st.plotly_chart(fig_bt, use_container_width=True)
            with col_bt2:
                if "roi" in movies_f.columns:
                    roi_by_tier = (movies_f.groupby("budget_tier")["roi"]
                                   .mean().reset_index().rename(columns={"roi":"avg_roi"}))
                    fig_rt = px.bar(roi_by_tier, x="budget_tier", y="avg_roi",
                                    color="avg_roi",
                                    color_continuous_scale=["#3D0000",NF_RED,GREEN],
                                    text="avg_roi", title="Avg ROI by Budget Tier")
                    fig_rt.update_traces(texttemplate="%{text:.1f}x", textposition="outside")
                    fig_rt.update_layout(coloraxis_showscale=False)
                    apply_theme(fig_rt, 300)
                    st.plotly_chart(fig_rt, use_container_width=True)

    # ── Multi-score: rating sources side by side on a 0-100 scale ───────────
    with tab_m3:
        st.markdown("**Multi-Score Comparison: TMDB vs IMDb vs RT vs Metacritic**")
        _score_cols = ["vote_average","imdb_rating"]
        # Rows need both scores. If either column is absent, or no row has
        # both values, the info message is shown instead of an empty tab.
        if all(c in movies_f.columns for c in _score_cols):
            multi_df = movies_f.dropna(subset=_score_cols).copy()
        else:
            multi_df = pd.DataFrame()
        if not multi_df.empty:
            # 50 most-voted titles, or simply the first 50 without vote counts.
            multi_sample = multi_df.nlargest(50,"vote_count") if "vote_count" in multi_df.columns else multi_df.head(50)
            fig_ms = go.Figure()
            x_titles = multi_sample["title"].str[:25].tolist()
            # These two scores are multiplied by 10 to share an axis with the
            # RT score, which is plotted as-is.
            for col_name, color, label in [
                ("vote_average", TEAL, "TMDB (×10)"),
                ("imdb_rating", GOLD, "IMDb (×10)"),
            ]:
                if col_name in multi_sample.columns:
                    fig_ms.add_trace(go.Bar(
                        name=label, x=x_titles,
                        y=multi_sample[col_name]*10,
                        marker_color=color, opacity=0.8
                    ))
            if "rt_score" in multi_sample.columns:
                fig_ms.add_trace(go.Bar(
                    name="RT Score", x=x_titles,
                    y=multi_sample["rt_score"].fillna(0),
                    marker_color=NF_RED, opacity=0.8
                ))
            fig_ms.update_layout(barmode="group", xaxis_tickangle=-45,
                                 xaxis_title="", yaxis_title="Score (normalized to 100)")
            apply_theme(fig_ms, 450)
            st.plotly_chart(fig_ms, use_container_width=True)
        else:
            st.info("Multi-score comparison ต้องการข้อมูล OMDB — รัน fetch_omdb_enrichment ใน pipeline ก่อน")

    # ── Franchises: revenue ranking and best-ROI collections ────────────────
    with tab_m4:
        if not franchises.empty:
            st.markdown("**🏢 Top Franchises by Total Revenue**")
            col_f1, col_f2 = st.columns([2,1], gap="large")
            with col_f1:
                # The chart plots these two columns directly, so it is only
                # drawn when both exist (previously a missing column passed
                # the nlargest guard and then failed inside px.bar).
                if all(c in franchises.columns for c in ["total_revenue_usd","collection_name"]):
                    top_fr = franchises.nlargest(15,"total_revenue_usd")
                    fig_fr = px.bar(top_fr, x="total_revenue_usd", y="collection_name",
                                    orientation="h",
                                    color="movie_count" if "movie_count" in top_fr.columns else "total_revenue_usd",
                                    color_continuous_scale=["#1a0010",PINK],
                                    text="total_revenue_usd",
                                    labels={"total_revenue_usd":"Total Revenue (USD)","collection_name":""})
                    fig_fr.update_traces(texttemplate="$%{text:,.0f}", textposition="outside")
                    fig_fr.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                    apply_theme(fig_fr, 450)
                    st.plotly_chart(fig_fr, use_container_width=True)
            with col_f2:
                if all(c in franchises.columns for c in ["franchise_roi","collection_name"]):
                    top_roi_fr = franchises.dropna(subset=["franchise_roi"]).nlargest(8,"franchise_roi")
                    fig_froi = px.bar(top_roi_fr, x="franchise_roi", y="collection_name",
                                      orientation="h", color="franchise_roi",
                                      color_continuous_scale=[NF_RED,GOLD,GREEN],
                                      text="franchise_roi", title="Best ROI Franchises")
                    fig_froi.update_traces(texttemplate="%{text:.1f}x", textposition="outside")
                    fig_froi.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                    apply_theme(fig_froi, 450)
                    st.plotly_chart(fig_froi, use_container_width=True)
        else:
            st.info("ไม่มีข้อมูล Franchise — รัน Pipeline ใหม่เพื่อสร้าง franchises.parquet")
# ══════════════════════════════════════════════════════════════════════════════
# TAB 6: TV SHOWS
# ══════════════════════════════════════════════════════════════════════════════
with tab_tv:
    # TV deep-dive: three sub-tabs (top rated, structure, renewal signal)
    # driven by the filtered TV frame, plus per-season detail from tv_seasons.
    sec("📺 TV SHOWS DEEP-DIVE")
    tab_tv1, tab_tv2, tab_tv3 = st.tabs(["🏆 Top Rated","📊 Structure","🌱 Renewal Signal"])

    # ── Top rated: ranking chart (left) and poster strip (right) ────────────
    with tab_tv1:
        col_tv1, col_tv2 = st.columns([3,1], gap="large")
        # Vote counts used for the minimum-votes filter in both columns. When
        # the frame has no vote_count column every show gets a placeholder of
        # 999 so that nothing is filtered out.
        vc_tv = (tv_f["vote_count"] if "vote_count" in tv_f.columns
                 else pd.Series([999]*len(tv_f), index=tv_f.index))
        with col_tv1:
            if "vote_average" in tv_f.columns and "name" in tv_f.columns and not tv_f.empty:
                # ── FIX 3: guard optional OMDB columns in TV ─────────────────
                # NOTE(review): safe_cols presumably returns the requested
                # columns that exist in the frame — confirm against its definition.
                _tv1_want = ["name","vote_average","vote_count","number_of_seasons","imdb_rating"]
                _tv1_cols = safe_cols(tv_f, _tv1_want)
                # 12 best-rated shows among those with at least 100 votes.
                top_tv_r = (tv_f[vc_tv >= 100]
                            .nlargest(12, "vote_average")
                            [_tv1_cols]
                            .reset_index(drop=True))
                fig_tvr = px.bar(top_tv_r, x="vote_average", y="name", orientation="h",
                                 color="vote_average",
                                 color_continuous_scale=["#1a0040",PURPLE,"#A29BFE"],
                                 text="vote_average")
                # The hover text was a string literal broken across two source
                # lines (a syntax error); Plotly hover line breaks are written
                # as <br>.
                fig_tvr.update_traces(
                    texttemplate="%{text:.2f}", textposition="outside",
                    hovertemplate="%{y}<br>Rating: %{x:.2f}")
                fig_tvr.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                apply_theme(fig_tvr, 450)
                st.plotly_chart(fig_tvr, use_container_width=True)
        with col_tv2:
            st.markdown("#### 🖼️ Top Picks")
            tv_top_imgs = (tv_f[vc_tv >= 100].nlargest(4,"vote_average")
                           if "vote_average" in tv_f.columns else tv_f.head(4))
            for _, row in tv_top_imgs.iterrows():
                p = row.get("poster_url","")
                # A missing poster can arrive as NaN, which is truthy; only
                # real, non-placeholder strings are handed to st.image.
                if isinstance(p, str) and p and p != "None":
                    st.image(p, width=150, caption=str(row.get("name",""))[:25])

    # ── Structure: status, season counts, episode format, content hours ─────
    with tab_tv2:
        col_ts1, col_ts2, col_ts3 = st.columns(3, gap="large")
        with col_ts1:
            if "status" in tv_f.columns and not tv_f.empty:
                sc = tv_f["status"].value_counts().reset_index()
                sc.columns = ["status","count"]
                colors_map = {"Returning Series":GREEN,"Ended":NF_RED,"Canceled":"#E17055",
                              "In Production":TEAL,"Planned":PURPLE}
                fig_st = px.pie(sc, names="status", values="count", hole=0.55,
                                color="status", color_discrete_map=colors_map, title="TV Status")
                fig_st.update_traces(textinfo="percent+label", textfont_size=10)
                apply_theme(fig_st, 320)
                st.plotly_chart(fig_st, use_container_width=True)
        with col_ts2:
            if "number_of_seasons" in tv_f.columns and not tv_f.empty:
                sd = (tv_f["number_of_seasons"].dropna().astype(int)
                      .value_counts().sort_index().reset_index())
                sd.columns = ["seasons","count"]
                # Shows with more than 20 seasons are left off the chart.
                sd = sd[sd["seasons"] <= 20]
                fig_sd = px.bar(sd, x="seasons", y="count",
                                color="count", color_continuous_scale=["#1a0040",PURPLE],
                                text="count", title="Seasons Distribution")
                fig_sd.update_traces(texttemplate="%{text}", textposition="outside")
                fig_sd.update_layout(coloraxis_showscale=False, bargap=0.3)
                apply_theme(fig_sd, 320)
                st.plotly_chart(fig_sd, use_container_width=True)
        with col_ts3:
            if "episode_format" in tv_f.columns and not tv_f.empty:
                ef = tv_f["episode_format"].value_counts().reset_index()
                ef.columns = ["format","count"]
                fig_ef = px.pie(ef, names="format", values="count", hole=0.5,
                                color_discrete_sequence=CHART_COLORS, title="Episode Format")
                fig_ef.update_traces(textinfo="percent+label", textfont_size=10)
                apply_theme(fig_ef, 320)
                st.plotly_chart(fig_ef, use_container_width=True)
        # Total content hours
        if "total_content_hours" in tv_f.columns and not tv_f.empty:
            st.markdown("---")
            st.markdown("**Total Content Hours (Top 20 shows)**")
            top_hours = tv_f.nlargest(20,"total_content_hours")[safe_cols(tv_f,["name","total_content_hours","number_of_seasons"])].dropna()
            fig_hrs = px.bar(top_hours, x="total_content_hours", y="name", orientation="h",
                             color="number_of_seasons" if "number_of_seasons" in top_hours.columns else "total_content_hours",
                             color_continuous_scale=["#1a0040",PURPLE],
                             text="total_content_hours",
                             labels={"total_content_hours":"Total Watch Hours","name":""})
            fig_hrs.update_traces(texttemplate="%{text:.0f}h", textposition="outside")
            fig_hrs.update_layout(yaxis={"categoryorder":"total ascending"})
            apply_theme(fig_hrs, 400)
            st.plotly_chart(fig_hrs, use_container_width=True)
        # TV Seasons detail
        if not tv_seasons.empty and "season_number" in tv_seasons.columns:
            st.markdown("---")
            sec("🗓️ SEASON-LEVEL ANALYSIS")
            if "name" in tv_seasons.columns:
                # The picker offers at most the first 100 distinct show names.
                sel_show = st.selectbox("เลือก TV Show", tv_seasons["name"].dropna().unique().tolist()[:100])
                show_seasons = tv_seasons[tv_seasons["name"]==sel_show].sort_values("season_number")
                if not show_seasons.empty:
                    col_ss1, col_ss2 = st.columns(2)
                    with col_ss1:
                        if "episode_count" in show_seasons.columns:
                            fig_ssn = px.bar(show_seasons, x="season_number", y="episode_count",
                                             color="vote_average" if "vote_average" in show_seasons.columns else "season_number",
                                             color_continuous_scale=[PURPLE,"#A29BFE"],
                                             text="episode_count",
                                             labels={"season_number":"Season","episode_count":"Episodes"})
                            fig_ssn.update_traces(texttemplate="%{text}", textposition="outside")
                            fig_ssn.update_layout(coloraxis_showscale=False)
                            apply_theme(fig_ssn, 280)
                            st.plotly_chart(fig_ssn, use_container_width=True)
                    with col_ss2:
                        disp_cols = safe_cols(show_seasons, ["season_number","season_name","air_date","episode_count","vote_average","season_position"])
                        st.dataframe(show_seasons[disp_cols].reset_index(drop=True),
                                     use_container_width=True, height=280)

    # ── Renewal signal: distribution, strong candidates, per-signal counts ──
    with tab_tv3:
        if "renewal_signal" in tv_f.columns and not tv_f.empty:
            rs = tv_f["renewal_signal"].value_counts().reset_index()
            rs.columns = ["signal","count"]
            colors_rs = {
                "Renewed/Ongoing":GREEN,"Strong Candidate":TEAL,
                "Possible":GOLD,"Unlikely / Ended":NF_RED
            }
            col_rs1, col_rs2 = st.columns([2,1], gap="large")
            with col_rs1:
                fig_rs = px.bar(rs, x="count", y="signal", orientation="h",
                                color="signal", color_discrete_map=colors_rs,
                                text="count", title="TV Shows Renewal Likelihood")
                fig_rs.update_traces(texttemplate="%{text:,}", textposition="outside")
                fig_rs.update_layout(yaxis={"categoryorder":"total ascending"}, showlegend=False)
                apply_theme(fig_rs, 320)
                st.plotly_chart(fig_rs, use_container_width=True)
                # NOTE(review): the source indentation was lost; the gallery is
                # placed inside the left column because it precedes the
                # right-column block — confirm.
                st.markdown("**📺 Strong Candidates — Shows Likely to Return**")
                strong = tv_f[tv_f["renewal_signal"]=="Strong Candidate"].nlargest(12,"vote_average") if "vote_average" in tv_f.columns else tv_f[tv_f["renewal_signal"]=="Strong Candidate"].head(12)
                if not strong.empty:
                    poster_gallery(strong, title_col="name", year_col="first_air_year",
                                   rating_col="vote_average", poster_col="poster_url", max_cards=10)
            with col_rs2:
                # One insight line per signal, each with its own style name.
                for signal, style in [("Renewed/Ongoing","green"),("Strong Candidate","teal"),
                                      ("Possible","gold"),("Unlikely / Ended","red")]:
                    cnt = tv_f[tv_f["renewal_signal"]==signal].shape[0]
                    insight(f"{signal}: {cnt:,} shows", style)
        else:
            st.info("ไม่มีข้อมูล renewal_signal — ต้องรัน PySpark pipeline ใหม่")
# ══════════════════════════════════════════════════════════════════════════════
# TAB 7: TALENT
# ══════════════════════════════════════════════════════════════════════════════
with tab_talent:
    # Talent tab: engagement rankings from the top_talent table (with a
    # credits-based fallback when it is empty), followed by a keyword treemap.
    import html  # local import: escapes API-sourced text embedded in raw HTML below

    sec("🌟 TALENT & CREDITS")
    if not top_talent.empty:
        col_tt1, col_tt2 = st.columns([3,1], gap="large")
        # Without a role column every row is treated as cast and there is no
        # crew section.
        cast_talent = top_talent[top_talent["role"]=="cast"].copy() if "role" in top_talent.columns else top_talent.copy()
        crew_talent = top_talent[top_talent["role"]=="crew"].copy() if "role" in top_talent.columns else pd.DataFrame()
        with col_tt1:
            st.markdown("**🎭 Most Impactful Cast — Avg Engagement Score**")
            if not cast_talent.empty and "avg_content_engagement" in cast_talent.columns:
                top_cast_t = cast_talent.nlargest(15,"avg_content_engagement")
                color_col = "total_imdb_votes_across_titles" if "total_imdb_votes_across_titles" in top_cast_t.columns else "avg_content_engagement"
                fig_ct = px.bar(top_cast_t, x="avg_content_engagement", y="name",
                                orientation="h", color=color_col,
                                color_continuous_scale=["#001433",TEAL],
                                text="avg_content_engagement",
                                labels={"avg_content_engagement":"Avg Engagement","name":""})
                fig_ct.update_traces(texttemplate="%{text:.2f}", textposition="outside")
                fig_ct.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                apply_theme(fig_ct, 460)
                st.plotly_chart(fig_ct, use_container_width=True)
        with col_tt2:
            st.markdown("**🖼️ Top Talent Profiles**")
            top5_cast = cast_talent.nlargest(8,"avg_content_engagement") if not cast_talent.empty and "avg_content_engagement" in cast_talent.columns else cast_talent.head(8)
            # The card markup in this section had been stripped, leaving an
            # unterminated string and an unused profile URL. Cards are rebuilt
            # here with inline styles. Each card is a single unindented line
            # so the Markdown renderer does not mistake it for a code block.
            card_parts = []
            for _, row in top5_cast.iterrows():
                profile = row.get("profile_url","")
                # Missing URLs may arrive as None, NaN or the string "None".
                if not isinstance(profile, str) or not profile or profile == "None":
                    profile = "https://via.placeholder.com/72x72/181818/555?text=👤"
                # Names come from external data and are rendered with
                # unsafe_allow_html, so they are escaped first.
                name = html.escape(str(row.get("name",""))[:20])
                eng = row.get("avg_content_engagement",0)
                tc = row.get("title_count",0)
                card_parts.append(
                    '<div style="display:flex;align-items:center;gap:10px;margin-bottom:10px;">'
                    f'<img src="{html.escape(profile, quote=True)}" width="48" height="48" '
                    'style="border-radius:50%;object-fit:cover;">'
                    f'<div><div style="color:{TEXT_PRI};font-weight:600;">{name}</div>'
                    f'<div style="color:{TEXT_MUT};font-size:12px;">Score: {eng:.1f} · {tc} titles</div></div>'
                    '</div>'
                )
            cards_html = "<div>" + "".join(card_parts) + "</div>"
            st.markdown(cards_html, unsafe_allow_html=True)
        st.markdown("---")
        if not crew_talent.empty:
            st.markdown("**🎬 Top Directors & Producers**")
            col_cr1, col_cr2 = st.columns([2,2], gap="large")
            with col_cr1:
                # The chart plots avg_content_engagement directly, so it is
                # only drawn when that column exists (previously a missing
                # column passed the nlargest guard and then failed in px.bar).
                if "avg_content_engagement" in crew_talent.columns:
                    top_crew = crew_talent.nlargest(12,"avg_content_engagement")
                    fig_crew = px.bar(top_crew, x="avg_content_engagement", y="name",
                                      orientation="h", color="title_count" if "title_count" in top_crew.columns else "avg_content_engagement",
                                      color_continuous_scale=["#1a0a20",PURPLE],
                                      text="avg_content_engagement",
                                      labels={"avg_content_engagement":"Avg Engagement","name":""})
                    fig_crew.update_traces(texttemplate="%{text:.2f}", textposition="outside")
                    fig_crew.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                    apply_theme(fig_crew, 400)
                    st.plotly_chart(fig_crew, use_container_width=True)
            with col_cr2:
                if "gender" in cast_talent.columns and not cast_talent.empty:
                    gen_dist = cast_talent["gender"].value_counts().reset_index()
                    gen_dist.columns = ["gender","count"]
                    fig_gen = px.pie(gen_dist, names="gender", values="count",
                                     hole=0.55, title="Cast Gender Distribution",
                                     color="gender",
                                     color_discrete_map={"Female":TEAL,"Male":PURPLE,"Unknown":NF_BORDER})
                    fig_gen.update_traces(textinfo="percent+label", textfont_size=12)
                    apply_theme(fig_gen, 380)
                    st.plotly_chart(fig_gen, use_container_width=True)
    else:
        # Fallback to credits
        col_c1, col_c2 = st.columns([3,1], gap="large")
        with col_c1:
            if not credits.empty and "role" in credits.columns:
                # Rank cast members by number of credit rows.
                top_cast = (credits[credits["role"]=="cast"]
                            .groupby("name").size().reset_index(name="appearances")
                            .nlargest(15,"appearances"))
                fig_cast = px.bar(top_cast, x="appearances", y="name", orientation="h",
                                  color="appearances", color_continuous_scale=["#001433",TEAL],
                                  text="appearances",
                                  labels={"appearances":"Appearances","name":""})
                fig_cast.update_traces(texttemplate="%{text}", textposition="outside")
                fig_cast.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                apply_theme(fig_cast, 450)
                st.plotly_chart(fig_cast, use_container_width=True)
        with col_c2:
            # The pie filters on role, so that column must exist as well.
            if all(c in credits.columns for c in ["gender","role"]) and not credits.empty:
                gd = credits[credits["role"]=="cast"]["gender"].value_counts().reset_index()
                gd.columns = ["gender","count"]
                fig_g = px.pie(gd, names="gender", values="count", hole=0.55,
                               color="gender",
                               color_discrete_map={"Female":TEAL,"Male":PURPLE,"Unknown":"#333"},
                               title="Gender Distribution")
                fig_g.update_traces(textinfo="percent+label")
                apply_theme(fig_g, 300)
                st.plotly_chart(fig_g, use_container_width=True)
    # Keywords treemap
    st.markdown("---")
    sec("🔑 TRENDING THEMES & KEYWORDS")
    if not keywords.empty and "keyword" in keywords.columns:
        col_kw1, col_kw2 = st.columns([3,1], gap="large")
        with col_kw1:
            # 30 most frequent keywords, counted by rows in the keywords table.
            top_kw = keywords.groupby("keyword").size().reset_index(name="count").nlargest(30,"count")
            fig_kw = px.treemap(top_kw, path=["keyword"], values="count",
                                color="count", color_continuous_scale=["#200000","#6C1F1F",NF_RED],
                                title="Top 30 Content Themes")
            fig_kw.update_traces(textfont=dict(size=12,family="DM Sans"))
            apply_theme(fig_kw, 400)
            st.plotly_chart(fig_kw, use_container_width=True)
        with col_kw2:
            top5_kw = top_kw.head(5)
            for _, r in top5_kw.iterrows():
                insight(f"🔑 '{r['keyword']}' — {r['count']:,} titles", "red")
# ══════════════════════════════════════════════════════════════════════════════
# TAB 8: YEAR-OVER-YEAR TRENDS
# ══════════════════════════════════════════════════════════════════════════════
with tab_trends:
    # Trends tab: year-over-year line and bar charts from the yoy_trend table
    # (or a per-release-year fallback computed from the filtered movies),
    # followed by the language analytics section.
    sec("📈 YEAR-OVER-YEAR TRENDS")
    if not yoy_trend.empty and "year" in yoy_trend.columns:
        # Split the table per media type. Without a media_type column all
        # rows are treated as movies and the TV frame stays empty.
        if "media_type" in yoy_trend.columns:
            yoy_m = yoy_trend[yoy_trend["media_type"]=="movie"].sort_values("year")
            yoy_tv = yoy_trend[yoy_trend["media_type"]=="tv"].sort_values("year")
        else:
            yoy_m = yoy_trend.sort_values("year")
            yoy_tv = pd.DataFrame()
        # (frame, legend name, colour) for the two line charts below.
        _trend_series = ((yoy_m, "Movies", NF_RED), (yoy_tv, "TV Shows", PURPLE))

        rating_slot, engagement_slot = st.columns(2, gap="large")
        with rating_slot:
            st.markdown("**Avg Rating Over Time**")
            rating_fig = go.Figure()
            for _frame, _legend, _tone in _trend_series:
                if "avg_rating" in _frame.columns and not _frame.empty:
                    rating_fig.add_trace(go.Scatter(
                        x=_frame["year"], y=_frame["avg_rating"],
                        name=_legend, mode="lines+markers",
                        line=dict(color=_tone, width=2.5), marker_size=5,
                    ))
            rating_fig.update_layout(xaxis_title="Year", yaxis_title="Avg Rating")
            apply_theme(rating_fig, 320)
            st.plotly_chart(rating_fig, use_container_width=True)
        with engagement_slot:
            st.markdown("**Avg Engagement Over Time**")
            engagement_fig = go.Figure()
            for _frame, _legend, _tone in _trend_series:
                if "avg_engagement" in _frame.columns and not _frame.empty:
                    engagement_fig.add_trace(go.Scatter(
                        x=_frame["year"], y=_frame["avg_engagement"],
                        name=_legend, mode="lines", fill="tozeroy",
                        line=dict(color=_tone, width=2),
                        fillcolor=hex_rgba(_tone, 0.12),
                    ))
            engagement_fig.update_layout(xaxis_title="Year", yaxis_title="Avg Engagement Score")
            apply_theme(engagement_fig, 320)
            st.plotly_chart(engagement_fig, use_container_width=True)

        st.markdown("---")
        # Two per-year bar charts over the movie rows, one per column:
        # (metric column, heading, axis label, colour).
        _yearly_bars = (
            ("trending_titles", "**Trending Titles by Year**", "Trending Titles", TEAL),
            ("awarded_titles", "**Award Winners by Year**", "Award Winners", GOLD),
        )
        for _slot, (_metric, _heading, _axis_name, _tone) in zip(
            st.columns(2, gap="large"), _yearly_bars
        ):
            with _slot:
                if not yoy_m.empty and _metric in yoy_m.columns:
                    st.markdown(_heading)
                    _bar_fig = px.bar(
                        yoy_m, x="year", y=_metric,
                        color=_metric,
                        color_continuous_scale=[hex_rgba(_tone, 0.3), _tone],
                        labels={_metric: _axis_name, "year": "Year"},
                    )
                    _bar_fig.update_layout(coloraxis_showscale=False)
                    apply_theme(_bar_fig, 280)
                    st.plotly_chart(_bar_fig, use_container_width=True)

        if not yoy_m.empty and "avg_imdb_votes" in yoy_m.columns:
            st.markdown("---")
            st.markdown("**Avg IMDb Votes (Viewership Proxy) Over Time**")
            votes_fig = go.Figure()
            # The movie line is always drawn here and is area-filled; the TV
            # line is added only when TV rows carry the column, without fill.
            movie_votes = go.Scatter(
                x=yoy_m["year"], y=yoy_m["avg_imdb_votes"],
                name="Movies", mode="lines+markers",
                line=dict(color=GOLD, width=2.5), marker_size=5,
                fill="tozeroy", fillcolor=hex_rgba(GOLD, 0.1),
            )
            votes_fig.add_trace(movie_votes)
            if "avg_imdb_votes" in yoy_tv.columns and not yoy_tv.empty:
                tv_votes = go.Scatter(
                    x=yoy_tv["year"], y=yoy_tv["avg_imdb_votes"],
                    name="TV Shows", mode="lines+markers",
                    line=dict(color=PURPLE, width=2.5), marker_size=5,
                )
                votes_fig.add_trace(tv_votes)
            votes_fig.update_layout(xaxis_title="Year", yaxis_title="Avg IMDb Votes")
            apply_theme(votes_fig, 320)
            st.plotly_chart(votes_fig, use_container_width=True)
    else:
        st.info("ไม่มีข้อมูล yoy_trend.parquet — แสดงจากข้อมูล filtered แทน")
        if not movies_f.empty and "release_year" in movies_f.columns:
            # Per release year: number of titles (bars, left axis) and mean
            # rating (line, right axis).
            per_year = movies_f.groupby("release_year")
            yr_data = per_year.agg(
                count=("title", "count"),
                avg_rating=("vote_average", "mean"),
            ).reset_index()
            combo_fig = make_subplots(specs=[[{"secondary_y": True}]])
            count_bars = go.Bar(
                x=yr_data["release_year"], y=yr_data["count"],
                name="Count", marker_color=hex_rgba(NF_RED, 0.5),
            )
            rating_line = go.Scatter(
                x=yr_data["release_year"], y=yr_data["avg_rating"],
                name="Avg Rating", line=dict(color=GOLD, width=2.5),
            )
            combo_fig.add_trace(count_bars, secondary_y=False)
            combo_fig.add_trace(rating_line, secondary_y=True)
            combo_fig.update_layout(**PLOTLY_BASE, height=380)
            st.plotly_chart(combo_fig, use_container_width=True)

    # Language Summary
    st.markdown("---")
    sec("🌍 LANGUAGE & REGION ANALYTICS")
    if lang_summary.empty:
        # No language table: count original languages in the filtered movies.
        if not movies_f.empty and "original_language" in movies_f.columns:
            lang_cnt = movies_f["original_language"].value_counts().head(12).reset_index()
            lang_cnt.columns = ["language", "count"]
            fallback_lang_fig = px.bar(
                lang_cnt, x="count", y="language", orientation="h",
                color="count", color_continuous_scale=["#001a33", TEAL],
                text="count",
            )
            fallback_lang_fig.update_traces(texttemplate="%{text:,}", textposition="outside")
            fallback_lang_fig.update_layout(
                yaxis={"categoryorder": "total ascending"}, coloraxis_showscale=False
            )
            apply_theme(fallback_lang_fig, 380)
            st.plotly_chart(fallback_lang_fig, use_container_width=True)
    else:
        by_language_slot, by_group_slot = st.columns(2, gap="large")
        with by_language_slot:
            if all(c in lang_summary.columns for c in ["original_language", "title_count"]):
                top_lang = lang_summary.nlargest(15, "title_count").copy()
                # Display names for common language codes; codes not listed
                # here are shown unchanged.
                lang_map = {"en":"English","ja":"Japanese","ko":"Korean","fr":"French","es":"Spanish",
                            "de":"German","it":"Italian","pt":"Portuguese","zh":"Chinese","hi":"Hindi",
                            "ru":"Russian","th":"Thai","ar":"Arabic","nl":"Dutch","sv":"Swedish"}
                readable = top_lang["original_language"].map(lang_map)
                top_lang["lang_name"] = readable.fillna(top_lang["original_language"])
                if "avg_popularity" in top_lang.columns:
                    shade_by = "avg_popularity"
                else:
                    shade_by = "title_count"
                language_fig = px.bar(
                    top_lang, x="title_count", y="lang_name", orientation="h",
                    color=shade_by,
                    color_continuous_scale=["#001a33", TEAL],
                    text="title_count",
                    labels={"title_count": "Titles", "lang_name": ""},
                )
                language_fig.update_traces(texttemplate="%{text:,}", textposition="outside")
                language_fig.update_layout(
                    yaxis={"categoryorder": "total ascending"}, coloraxis_showscale=False
                )
                apply_theme(language_fig, 380)
                st.plotly_chart(language_fig, use_container_width=True)
        with by_group_slot:
            if "language_group" in lang_summary.columns:
                lg_grp = lang_summary.groupby("language_group")["title_count"].sum().reset_index()
                group_fig = px.pie(
                    lg_grp, names="language_group", values="title_count",
                    hole=0.55, color_discrete_sequence=CHART_COLORS,
                    title="Content by Language Group",
                )
                group_fig.update_traces(textinfo="percent+label", textfont_size=11)
                apply_theme(group_fig, 380)
                st.plotly_chart(group_fig, use_container_width=True)
# ─────────────────────────────────────────────────────────────────────────────
# RAW DATA EXPLORER
# ─────────────────────────────────────────────────────────────────────────────
st.markdown("---")
with st.expander("🗃️ Raw Data Explorer", expanded=False):
    # One tab per table; the frames are listed in the same order as the labels.
    raw_tabs = st.tabs(["🎬 Movies","📺 TV","🎭 Credits","🔑 Keywords",
                        "📊 Content Perf","🎭 Genre Perf","🌟 Talent","📈 YoY"])
    raw_frames = (
        movies_f, tv_f, credits, keywords,
        content_perf, genre_perf, top_talent, yoy_trend,
    )
    for raw_tab, raw_frame in zip(raw_tabs, raw_frames):
        with raw_tab:
            # The caption reports the full row count; the preview below it
            # shows at most the first 200 rows and is skipped for empty frames.
            st.caption(f"{len(raw_frame):,} records")
            if raw_frame.empty:
                continue
            st.dataframe(raw_frame.head(200), use_container_width=True, height=320)
# ─────────────────────────────────────────────────────────────────────────────
# FOOTER
# ─────────────────────────────────────────────────────────────────────────────
# Static footer: product line and the list of new tables. The text contains
# no placeholders, so it is assembled from plain strings; the empty first and
# last entries reproduce the leading and trailing newline of the original.
_footer_lines = (
    "",
    "NETFLIX ANALYTICS 2.0 · TMDB API + OMDB API · AIRFLOW → PYSPARK → HUGGINGFACE",
    "New Tables: content_performance · genre_performance · top_talent · yoy_trend · tv_seasons · language_summary",
    "",
)
st.markdown("\n".join(_footer_lines), unsafe_allow_html=True)