import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# ─────────────────────────────────────────────────────────────────────────────
# PAGE CONFIG
# ─────────────────────────────────────────────────────────────────────────────
st.set_page_config(
    page_title="Netflix Analytics 2.0",
    page_icon="🎬",
    layout="wide",
    initial_sidebar_state="expanded",
)

# ─────────────────────────────────────────────────────────────────────────────
# THEME CONSTANTS
# ─────────────────────────────────────────────────────────────────────────────
NF_RED = "#E50914"
NF_DARK = "#0a0a0a"
NF_CARD = "#181818"
NF_GRAY = "#222222"
NF_BORDER = "#2f2f2f"
PURPLE = "#6C5CE7"
TEAL = "#00B4D8"
GOLD = "#F39C12"
GREEN = "#00B894"
PINK = "#E84393"
TEXT_PRI = "#FFFFFF"
TEXT_MUT = "#9e9e9e"
TEXT_DIM = "#555555"
CHART_COLORS = [NF_RED, PURPLE, TEAL, GOLD, GREEN, PINK, "#A29BFE", "#55EFC4", "#FD79A8"]


def hex_rgba(h, a=0.15):
    """Convert a 6-digit hex colour (with or without ``#``) to a CSS ``rgba()`` string.

    Args:
        h: Hex colour such as ``"#E50914"``.
        a: Alpha value inserted verbatim as the fourth component.

    Returns:
        A string of the form ``"rgba(r,g,b,a)"``.
    """
    h = h.lstrip("#")
    r, g, b = int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16)
    return f"rgba({r},{g},{b},{a})"


# Layout keyword arguments applied to every Plotly figure by apply_theme().
PLOTLY_BASE = dict(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(family="DM Sans, sans-serif", color=TEXT_PRI, size=12),
    xaxis=dict(gridcolor=NF_BORDER, linecolor=NF_BORDER, tickcolor=TEXT_MUT),
    yaxis=dict(gridcolor=NF_BORDER, linecolor=NF_BORDER, tickcolor=TEXT_MUT),
    colorway=CHART_COLORS,
    legend=dict(bgcolor="rgba(0,0,0,0)", font=dict(color=TEXT_PRI)),
    margin=dict(l=10, r=10, t=40, b=10),
    title=dict(font=dict(size=14, color=TEXT_PRI)),
)

# ─────────────────────────────────────────────────────────────────────────────
# CSS
# ─────────────────────────────────────────────────────────────────────────────
# NOTE(review): the visible payload is a single space — presumably a <style>
# block belongs here; confirm against the deployed file.
st.markdown(" ", unsafe_allow_html=True)


# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────
def _is_missing(value):
    """Return True for None / NaN / NA scalars; strings and containers are never missing."""
    if value is None:
        return True
    if isinstance(value, (str, bytes, list, tuple, dict, set, np.ndarray)):
        return False
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        return False


def _as_list(value):
    """Normalise a list-like cell (list, tuple, set or NumPy array) to a plain list.

    Anything else (None, NaN, strings, scalars) becomes an empty list, which is
    how the genre code treated non-list cells before.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return []


def _fmt_year(value):
    """Format a year for display: blank when missing, no trailing ``.0`` on floats."""
    if _is_missing(value):
        return ""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _year_bounds(df, col="release_year", fallback=(1990, 2024)):
    """Return ``(min, max)`` integer years from the non-null values of ``df[col]``.

    Falls back to ``fallback`` when the column is absent or entirely null, and
    widens a degenerate range by one year so it can drive a range slider.
    """
    if col not in df.columns:
        return fallback
    years = df[col].dropna()
    if years.empty:
        return fallback
    lo, hi = int(years.min()), int(years.max())
    if hi <= lo:
        hi = lo + 1
    return lo, hi


def apply_theme(fig, h=380):
    """Apply PLOTLY_BASE and the given height to ``fig`` and return the same figure."""
    fig.update_layout(**PLOTLY_BASE, height=h)
    return fig


def sec(label):
    """Render a section label via ``st.markdown`` with HTML enabled."""
    st.markdown(f"\n{label}\n", unsafe_allow_html=True)


def insight(text, style=""):
    """Render an insight snippet via ``st.markdown`` with HTML enabled.

    NOTE(review): ``style`` is accepted but not used by the visible template —
    presumably it selected a CSS class; confirm.
    """
    st.markdown(f"\n{text}\n", unsafe_allow_html=True)


def kpi(icon, val, lbl, delta=None, dt="neu", accent=None):
    """Render one KPI tile: icon, value, label and an optional delta line.

    NOTE(review): ``dt`` and the computed ``style`` are not interpolated by the
    visible template — presumably they fed HTML attributes; confirm.
    """
    style = f"--accent: linear-gradient(90deg,{accent},{accent}66);" if accent else ""  # noqa: F841
    dhtml = f"\n{delta}\n" if delta else ""
    st.markdown(f"\n{icon}\n{val}\n{lbl}\n{dhtml}\n", unsafe_allow_html=True)


def score_pills(tmdb=None, imdb=None, rt=None, mc=None):
    """Render the available scores (TMDB, IMDb, Rotten Tomatoes, Metacritic) on one line.

    Scores that are None or NaN are omitted.
    """
    parts = []
    if not _is_missing(tmdb):
        parts.append(f"⭐ {tmdb:.1f}")
    if not _is_missing(imdb):
        parts.append(f"🎬 IMDb {imdb:.1f}")
    if not _is_missing(rt):
        parts.append(f"🍅 {rt}%")
    if not _is_missing(mc):
        parts.append(f"🎯 MC {mc}")
    st.markdown("".join(parts), unsafe_allow_html=True)


def poster_gallery(df, title_col="title", year_col="release_year",
                   rating_col="vote_average", poster_col="poster_url",
                   badge_col=None, badge_label="", max_cards=20,
                   extra_col=None, extra_label=""):
    """Render a horizontal poster card gallery.

    Args:
        df: Source rows; only the first ``max_cards`` are rendered.
        title_col, year_col, rating_col, poster_col: Column names read per row.
        badge_col: Optional column shown as ``"<badge_label> #<value>"``.
        extra_col: Optional column shown as ``"<extra_label> <value>"``.

    Missing (None / NaN) values are left blank instead of rendering as "nan".
    """
    placeholder = "https://via.placeholder.com/150x220/181818/555555?text=No+Image"
    cards = ["\n"]
    for row in df.head(max_cards).to_dict("records"):
        poster = row.get(poster_col)
        if _is_missing(poster) or not poster or poster == "None":
            poster = placeholder
        # NOTE(review): `poster` is resolved but the visible card template never
        # interpolates it — presumably an <img> tag belongs here; confirm.

        title = str(row.get(title_col, ""))[:40]
        year = _fmt_year(row.get(year_col, ""))
        rating = row.get(rating_col, "")
        if isinstance(rating, (int, float)) and not pd.isna(rating):
            rating_str = f"⭐ {rating:.1f}"
        else:
            rating_str = ""

        badge_html = ""
        bval = row.get(badge_col) if badge_col else None
        if not _is_missing(bval):
            if isinstance(bval, float):
                bval = f"{bval:.1f}"
            badge_html = f"{badge_label} #{bval}"

        extra_html = ""
        extra_val = row.get(extra_col) if extra_col else None
        if isinstance(extra_val, np.ndarray):
            extra_val = extra_val.tolist()
        if not _is_missing(extra_val) and extra_val:
            extra_html = f"{extra_label} {extra_val}"

        cards.append(
            f"\n{title}\n{title}\n{year}   {rating_str}\n{badge_html}{extra_html}\n"
        )
    cards.append("\n")
    st.markdown("".join(cards), unsafe_allow_html=True)


# ─────────────────────────────────────────────────────────────────────────────
# HELPER: safe column selector
# ─────────────────────────────────────────────────────────────────────────────
def safe_cols(df, cols):
    """Return only columns that actually exist in df."""
    return [c for c in cols if c in df.columns]


# ─────────────────────────────────────────────────────────────────────────────
# DATA LOADING
# ─────────────────────────────────────────────────────────────────────────────
BASE = "hf://datasets/ihhereanth/netflix_dataset/"


@st.cache_data(ttl=3600)
def load_data():
    """Load every dashboard table from ``BASE`` and apply light clean-up.

    Numeric columns are coerced with ``errors="coerce"``, a ``decade`` label is
    derived for movies and TV when absent, credit gender codes are mapped to
    labels, and ``genres`` cells are normalised to plain lists.

    Returns:
        An 11-tuple ``(movies, tv, credits, keywords, content_perf, genre_perf,
        lang_summary, top_talent, yoy_trend, franchises, tv_seasons)``. A table
        that cannot be read is returned as an empty DataFrame after a warning.
    """

    def safe_load(filename):
        """Read one parquet file, retrying via pyarrow, else warn and return empty."""
        path = BASE + filename
        try:
            return pd.read_parquet(path)
        except Exception:
            # Best-effort fallback: read through pyarrow directly.
            try:
                import pyarrow.parquet as pq
                table = pq.read_table(path)
                return table.to_pandas(strings_to_categorical=False)
            except Exception as e2:
                st.warning(f"⚠️ ไม่พบ {filename}: {e2}")
                return pd.DataFrame()

    def to_num(df, cols):
        """Coerce the listed columns that exist to numeric (in place); return df."""
        for c in cols:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")
        return df

    movies = safe_load("movies.parquet")
    tv = safe_load("tv_shows.parquet")
    credits = safe_load("credits.parquet")
    keywords = safe_load("keywords.parquet")

    m_num = ["vote_count", "vote_average", "runtime_min", "budget_usd", "revenue_usd", "popularity",
             "release_year", "release_month", "roi", "imdb_rating", "imdb_votes", "rt_score",
             "metacritic_score", "audience_engagement_score", "profit_usd", "omdb_box_office",
             "best_weekly_rank", "critic_audience_gap", "release_decade"]
    tv_num = ["vote_count", "vote_average", "popularity", "number_of_seasons", "number_of_episodes",
              "first_air_year", "last_air_year", "imdb_rating", "imdb_votes", "rt_score",
              "metacritic_score", "audience_engagement_score", "votes_per_episode",
              "total_content_hours", "years_on_air", "best_weekly_rank", "first_air_decade"]
    movies = to_num(movies, m_num)
    tv = to_num(tv, tv_num)

    # pyarrow-backed reads return list columns as NumPy arrays; the rest of the
    # app checks isinstance(..., list), so hand it real lists.
    for frame in (movies, tv):
        if "genres" in frame.columns:
            frame["genres"] = frame["genres"].apply(_as_list)

    # Derived for old pipeline compatibility
    if "release_year" in movies.columns and "decade" not in movies.columns:
        movies["decade"] = (movies["release_year"] // 10 * 10).astype("Int64").astype(str) + "s"
    if "first_air_year" in tv.columns and "decade" not in tv.columns:
        tv["decade"] = (tv["first_air_year"] // 10 * 10).astype("Int64").astype(str) + "s"
    if "gender" in credits.columns:
        credits["gender"] = credits["gender"].map({0: "Unknown", 1: "Female", 2: "Male"}).fillna("Unknown")

    # New analytics tables
    content_perf = safe_load("content_performance.parquet")
    genre_perf = safe_load("genre_performance.parquet")
    lang_summary = safe_load("language_summary.parquet")
    top_talent = safe_load("top_talent.parquet")
    yoy_trend = safe_load("yoy_trend.parquet")
    franchises = safe_load("franchises.parquet")
    tv_seasons = safe_load("tv_seasons.parquet")

    num_cols_cp = ["vote_average", "vote_count", "popularity", "imdb_rating", "imdb_votes",
                   "rt_score", "metacritic_score", "audience_engagement_score",
                   "budget_usd", "revenue_usd", "roi", "best_weekly_rank"]
    content_perf = to_num(content_perf, num_cols_cp)

    return (movies, tv, credits, keywords,
            content_perf, genre_perf, lang_summary,
            top_talent, yoy_trend, franchises, tv_seasons)


with st.spinner("กำลังโหลดข้อมูล Netflix..."):
    try:
        (movies, tv, credits, keywords,
         content_perf, genre_perf, lang_summary,
         top_talent, yoy_trend, franchises, tv_seasons) = load_data()
    except Exception as e:
        # Top-level boundary: surface the failure in the UI and halt the script run.
        st.error(f"โหลดข้อมูลไม่สำเร็จ: {e}")
        st.stop()

# ─────────────────────────────────────────────────────────────────────────────
# SIDEBAR FILTERS
# ─────────────────────────────────────────────────────────────────────────────
with st.sidebar:
    st.markdown("\nNETFLIX\nAnalytics Dashboard 2.0\n", unsafe_allow_html=True)

    st.markdown("### 🎛️ Filters")

    # Media type
    media_type_sel = st.radio("📽️ Media Type", ["All", "Movies Only", "TV Only"], horizontal=True)

    # Genres from movies
    if "genres" in movies.columns:
        all_genres = sorted({g for cell in movies["genres"] for g in _as_list(cell)})
    else:
        all_genres = []
    selected_genres = st.multiselect("🎭 Genre", all_genres, default=[])

    # Year range — the default start (2010) is clamped into the available range
    # so the slider never receives an out-of-bounds value.
    y_min, y_max = _year_bounds(movies, "release_year", (1990, 2024))
    year_range = st.slider("📅 Release Year (Movies)", y_min, y_max,
                           (min(max(2010, y_min), y_max), y_max))

    # Language
    all_langs = []
    if "original_language" in movies.columns:
        all_langs = sorted(movies["original_language"].dropna().unique().tolist())
    selected_langs = st.multiselect("🌍 Language", all_langs, default=[])

    # Rating range
    rating_min, rating_max = st.slider("⭐ Rating Range", 0.0, 10.0, (5.0, 10.0), step=0.5)

    # Min votes
    min_votes = st.slider("🗳️ Min Vote Count", 0, 5000, 100, step=50)

    st.markdown("---")
    st.markdown("### 🔬 Advanced Filters")
    trending_only = st.checkbox("🔥 Trending Top 50 Only")
    awards_only = st.checkbox("🏆 Awards Titles Only")
    has_poster = st.checkbox("🖼️ With Poster Image", value=False)

    if "budget_tier" in movies.columns:
        budget_tiers = ["All"] + sorted(movies["budget_tier"].dropna().unique().tolist())
        sel_budget = st.selectbox("💰 Budget Tier", budget_tiers)
    else:
        sel_budget = "All"

    st.markdown("---")
    st.markdown(
        "\nTMDB → Airflow → PySpark → HuggingFace\nPipeline updated weekly\n",
        unsafe_allow_html=True,
    )


# ─────────────────────────────────────────────────────────────────────────────
# APPLY FILTERS
# ─────────────────────────────────────────────────────────────────────────────
def _apply_common_filters(df):
    """Apply the sidebar filters shared by movies and TV; skips absent columns."""
    if selected_langs and "original_language" in df.columns:
        df = df[df["original_language"].isin(selected_langs)]
    if "vote_average" in df.columns:
        df = df[df["vote_average"].between(rating_min, rating_max)]
    if "vote_count" in df.columns:
        df = df[df["vote_count"] >= min_votes]
    if trending_only and "is_trending_top50" in df.columns:
        df = df[df["is_trending_top50"].eq(True)]
    if awards_only and "has_awards" in df.columns:
        df = df[df["has_awards"].eq(True)]
    if has_poster and "poster_url" in df.columns:
        df = df[df["poster_url"].notna() & (df["poster_url"] != "None")]
    return df


def apply_movie_filters(df):
    """Return the rows of ``df`` matching the current sidebar selections for movies.

    Adds genre, release-year and budget-tier filters on top of the shared ones.
    Reads the module-level sidebar values; an empty frame is returned unchanged.
    """
    if df.empty:
        return df
    if selected_genres and "genres" in df.columns:
        wanted = set(selected_genres)
        df = df[df["genres"].apply(lambda g: not wanted.isdisjoint(_as_list(g)))]
    if "release_year" in df.columns:
        df = df[df["release_year"].between(*year_range)]
    df = _apply_common_filters(df)
    if sel_budget != "All" and "budget_tier" in df.columns:
        df = df[df["budget_tier"] == sel_budget]
    return df


def apply_tv_filters(df):
    """Return the rows of ``df`` matching the shared sidebar selections for TV shows.

    Reads the module-level sidebar values; an empty frame is returned unchanged.
    """
    if df.empty:
        return df
    return _apply_common_filters(df)


movies_f = apply_movie_filters(movies.copy())
tv_f = apply_tv_filters(tv.copy())

show_movies = media_type_sel != "TV Only"
show_tv = media_type_sel != "Movies Only"

#
───────────────────────────────────────────────────────────────────────────── # HERO HEADER # ───────────────────────────────────────────────────────────────────────────── st.markdown("""
NETFLIX ANALYTICS 2.0
Content Intelligence Dashboard · TMDB + OMDB + Trending Pipeline
""", unsafe_allow_html=True) st.markdown("---") # ───────────────────────────────────────────────────────────────────────────── # MAIN TABS # ───────────────────────────────────────────────────────────────────────────── tab_overview, tab_explorer, tab_engage, tab_genre, tab_movies, tab_tv, tab_talent, tab_trends = st.tabs([ "🏠 Overview", "🔍 Explorer", "📊 Engagement", "🎭 Genres", "🎬 Movies", "📺 TV Shows", "🌟 Talent", "📈 Trends", ]) # ══════════════════════════════════════════════════════════════════════════════ # TAB 1: OVERVIEW # ══════════════════════════════════════════════════════════════════════════════ with tab_overview: sec("📊 KPI OVERVIEW") avg_m = movies_f["vote_average"].mean() if "vote_average" in movies_f.columns and len(movies_f)>0 else 0 avg_tv = tv_f["vote_average"].mean() if "vote_average" in tv_f.columns and len(tv_f)>0 else 0 avg_eng = movies_f["audience_engagement_score"].mean() if "audience_engagement_score" in movies_f.columns and len(movies_f)>0 else 0 total_r = movies_f["revenue_usd"].sum() if "revenue_usd" in movies_f.columns else 0 total_b = movies_f["budget_usd"].sum() if "budget_usd" in movies_f.columns else 0 n_trend = int(movies_f["is_trending_top50"].sum()) if "is_trending_top50" in movies_f.columns else 0 n_award = int(movies_f["has_awards"].sum()) if "has_awards" in movies_f.columns else 0 n_oscar = int(movies_f["won_oscar"].sum()) if "won_oscar" in movies_f.columns else 0 c1,c2,c3,c4,c5,c6,c7,c8 = st.columns(8) with c1: kpi("🎬",f"{len(movies_f):,}","Movies",accent=NF_RED) with c2: kpi("📺",f"{len(tv_f):,}","TV Shows",accent=PURPLE) with c3: kpi("⭐",f"{avg_m:.2f}","Avg Movie Rating", delta=f"TV: {avg_tv:.2f}", dt="pos" if avg_m>=avg_tv else "neg", accent=GOLD) with c4: kpi("🔥",f"{avg_eng:.1f}","Avg Engagement",accent=NF_RED) with c5: kpi("💰",f"${total_r/1e9:.1f}B","Total Revenue", delta=f"ROI {total_r/max(total_b,1):.1f}x" if total_b>0 else None, dt="pos", accent=GREEN) with c6: kpi("📈",f"{n_trend:,}","Trending Movies",accent=TEAL) 
with c7: kpi("🏆",f"{n_award:,}","Award Winners",accent=GOLD) with c8: kpi("🎭",f"{n_oscar:,}","Oscar Winners",accent=GOLD) st.markdown("---") # Trending gallery with posters sec("🔥 TRENDING NOW — TOP PICKS WITH POSTERS") col_tl, col_tr = st.columns([3, 1], gap="large") with col_tl: if "best_weekly_rank" in movies_f.columns: trending_movies = (movies_f[movies_f["best_weekly_rank"].notna()] .sort_values("best_weekly_rank") .head(20)) if not trending_movies.empty: st.markdown("**🎬 Trending Movies (Weekly Rank)**") poster_gallery(trending_movies, title_col="title", year_col="release_year", rating_col="vote_average", poster_col="poster_url", badge_col="best_weekly_rank", badge_label="Rank", max_cards=12) else: st.markdown("**🎬 Top Rated Movies**") top_m = (movies_f[movies_f["vote_count"]>=200] .nlargest(12,"vote_average") .reset_index(drop=True)) poster_gallery(top_m, max_cards=12) else: st.markdown("**🎬 Top Rated Movies**") top_m = (movies_f.nlargest(12,"vote_average") if "vote_average" in movies_f.columns else movies_f.head(12)) poster_gallery(top_m, max_cards=12) with col_tr: sec("📌 HIGHLIGHTS") if not movies_f.empty and "vote_average" in movies_f.columns: vc_col = movies_f["vote_count"] if "vote_count" in movies_f.columns else pd.Series([999]*len(movies_f)) best = movies_f[vc_col>=200].nlargest(1,"vote_average") if "vote_count" in movies_f.columns else movies_f.nlargest(1,"vote_average") if not best.empty: best = best.iloc[0] insight(f"🥇 Best Rated: {best.get('title','')}
⭐ {best.get('vote_average',0):.1f}/10", "red") if "audience_engagement_score" in movies_f.columns and not movies_f.empty: top_eng = movies_f.nlargest(1,"audience_engagement_score") if not top_eng.empty: te = top_eng.iloc[0] insight(f"🔥 Highest Engagement: {te.get('title','')}
Score: {te.get('audience_engagement_score',0):.1f}", "teal") if "won_oscar" in movies_f.columns: oscars = movies_f[movies_f["won_oscar"]==True] if not oscars.empty: o = oscars.nlargest(1,"vote_average").iloc[0] insight(f"🏆 Oscar Winner: {o.get('title','')}
⭐ {o.get('vote_average',0):.1f}", "gold") if "imdb_votes" in movies_f.columns and not movies_f.empty: most_voted = movies_f.nlargest(1,"imdb_votes") if not most_voted.empty: mv = most_voted.iloc[0] votes = mv.get("imdb_votes",0) insight(f"👥 Most Watched (IMDb Votes):
{mv.get('title','')}
{votes/1e6:.1f}M votes", "green") st.markdown("---") # Trending TV gallery if show_tv and "best_weekly_rank" in tv_f.columns: trending_tv = (tv_f[tv_f["best_weekly_rank"].notna()] .sort_values("best_weekly_rank").head(12)) if not trending_tv.empty: st.markdown("**📺 Trending TV Shows**") poster_gallery(trending_tv, title_col="name", year_col="first_air_year", rating_col="vote_average", poster_col="poster_url", badge_col="best_weekly_rank", badge_label="Rank", max_cards=12) st.markdown("---") # Overview charts col_ov1, col_ov2 = st.columns(2, gap="large") with col_ov1: sec("📅 CONTENT OVER TIME") if not yoy_trend.empty and "year" in yoy_trend.columns: yoy_m = yoy_trend[yoy_trend["media_type"]=="movie"].sort_values("year") yoy_tv = yoy_trend[yoy_trend["media_type"]=="tv"].sort_values("year") fig = go.Figure() if not yoy_m.empty: fig.add_trace(go.Scatter(x=yoy_m["year"], y=yoy_m["title_count"], name="Movies", mode="lines+markers", fill="tozeroy", line=dict(color=NF_RED,width=2), fillcolor=hex_rgba(NF_RED,0.12))) if not yoy_tv.empty: fig.add_trace(go.Scatter(x=yoy_tv["year"], y=yoy_tv["title_count"], name="TV Shows", mode="lines+markers", fill="tozeroy", line=dict(color=PURPLE,width=2), fillcolor=hex_rgba(PURPLE,0.12))) fig.update_layout(xaxis_title="Year", yaxis_title="Titles Added") apply_theme(fig) st.plotly_chart(fig, use_container_width=True) elif "release_year" in movies_f.columns: by_year = movies_f.groupby("release_year").size().reset_index(name="count") fig = px.area(by_year, x="release_year", y="count", color_discrete_sequence=[NF_RED]) apply_theme(fig) st.plotly_chart(fig, use_container_width=True) with col_ov2: sec("📊 RATING DISTRIBUTION") fig2 = go.Figure() if show_movies and "vote_average" in movies_f.columns and not movies_f.empty: fig2.add_trace(go.Histogram(x=movies_f["vote_average"].dropna(), name="Movies", nbinsx=25, marker_color=NF_RED, opacity=0.7, histnorm="percent")) if show_tv and "vote_average" in tv_f.columns and not tv_f.empty: 
fig2.add_trace(go.Histogram(x=tv_f["vote_average"].dropna(), name="TV Shows", nbinsx=25, marker_color=PURPLE, opacity=0.7, histnorm="percent")) fig2.update_layout(barmode="overlay", xaxis_title="Rating", yaxis_title="% of Titles") apply_theme(fig2) st.plotly_chart(fig2, use_container_width=True) # ══════════════════════════════════════════════════════════════════════════════ # TAB 2: CONTENT EXPLORER # ══════════════════════════════════════════════════════════════════════════════ with tab_explorer: sec("🔍 CONTENT EXPLORER") st.markdown("ค้นหาและสำรวจ Content พร้อมรูปปก และคะแนนจากทุกแหล่ง") col_ex1, col_ex2, col_ex3, col_ex4 = st.columns([2,1,1,1]) with col_ex1: search_q = st.text_input("🔎 ค้นหาชื่อ", placeholder="พิมพ์ชื่อหนัง / ซีรีส์...") with col_ex2: ex_type = st.selectbox("Type", ["Movies","TV Shows"]) with col_ex3: ex_sort = st.selectbox("Sort by", [ "vote_average","audience_engagement_score","imdb_votes", "popularity","release_year","revenue_usd","rt_score" ]) with col_ex4: ex_limit = st.selectbox("Show", [24, 48, 96], index=0) # Select working df ex_df = movies_f.copy() if ex_type == "Movies" else tv_f.copy() name_col = "title" if ex_type == "Movies" else "name" year_col = "release_year" if ex_type == "Movies" else "first_air_year" if search_q and name_col in ex_df.columns: ex_df = ex_df[ex_df[name_col].fillna("").str.contains(search_q, case=False, na=False)] # Only sort by columns that exist if ex_sort in ex_df.columns: ex_df = ex_df.sort_values(ex_sort, ascending=False) st.caption(f"พบ {len(ex_df):,} รายการ") # Grid view view_mode = st.radio("📐 View", ["🖼️ Poster Grid", "📋 List View", "🗂️ Detail Card"], horizontal=True) if view_mode == "🖼️ Poster Grid": poster_gallery(ex_df.reset_index(drop=True), title_col=name_col, year_col=year_col, rating_col="vote_average", poster_col="poster_url", max_cards=ex_limit) elif view_mode == "📋 List View": show_cols = safe_cols(ex_df, [name_col, year_col, "vote_average","imdb_rating","rt_score","metacritic_score", 
"audience_engagement_score","imdb_votes","popularity", "original_language","rating_bucket","imdb_votes_tier"]) st.dataframe(ex_df[show_cols].head(ex_limit).reset_index(drop=True), use_container_width=True, height=500) else: # Detail Card detail_sel = st.selectbox("เลือก Title", ex_df[name_col].dropna().head(200).tolist() if name_col in ex_df.columns else []) if detail_sel and name_col in ex_df.columns: row = ex_df[ex_df[name_col]==detail_sel].iloc[0] col_dc1, col_dc2 = st.columns([1,3], gap="large") with col_dc1: poster = row.get("poster_url","") if poster and poster != "None": st.image(poster, width=220) else: st.markdown('
No Image
', unsafe_allow_html=True) if row.get("backdrop_url") and row["backdrop_url"] != "None": with st.expander("🖼️ Backdrop"): st.image(row["backdrop_url"], use_container_width=True) with col_dc2: title_display = row.get(name_col,"") yr = row.get(year_col,"") lang = row.get("original_language","") st.markdown(f'
{title_display}
', unsafe_allow_html=True) st.markdown(f'
📅 {yr}  |  🌍 {lang.upper() if lang else ""}
', unsafe_allow_html=True) st.markdown("**Multi-Source Scores:**") score_pills( tmdb=row.get("vote_average"), imdb=row.get("imdb_rating"), rt=row.get("rt_score"), mc=row.get("metacritic_score") ) # Tags tag_html = "" genres = row.get("genres",[]) or [] for g in (genres[:5] if isinstance(genres,list) else []): tag_html += f'{g}' for status_field in ["status","renewal_signal","critic_audience_verdict"]: if row.get(status_field): tag_html += f'{row[status_field]}' if row.get("has_awards"): tag_html += '🏆 Award Winner' if row.get("won_oscar"): tag_html += '🎭 Oscar Winner' if tag_html: st.markdown(tag_html, unsafe_allow_html=True) # Overview overview = row.get("overview","") if overview and overview != "None": st.markdown("---") st.markdown(f"**📝 Overview**") st.markdown(f'
{overview}
', unsafe_allow_html=True) # Stats st.markdown("---") col_s1, col_s2, col_s3, col_s4 = st.columns(4) with col_s1: votes = row.get("vote_count",0) or 0 st.metric("TMDB Votes", f"{int(votes):,}" if pd.notna(votes) else "N/A") with col_s2: iv = row.get("imdb_votes",0) or 0 st.metric("IMDb Votes", f"{int(iv):,}" if pd.notna(iv) else "N/A") with col_s3: eng = row.get("audience_engagement_score") st.metric("Engagement", f"{eng:.1f}" if pd.notna(eng) else "N/A") with col_s4: pop = row.get("popularity") st.metric("Popularity", f"{pop:.0f}" if pd.notna(pop) else "N/A") # Movie-specific if ex_type == "Movies": col_m1, col_m2, col_m3 = st.columns(3) with col_m1: rev = row.get("revenue_usd",0) or 0 st.metric("Revenue", f"${rev/1e6:.0f}M" if rev>0 else "N/A") with col_m2: roi = row.get("roi") st.metric("ROI", f"{roi:.1f}x" if pd.notna(roi) else "N/A") with col_m3: rt = row.get("runtime_min") st.metric("Runtime", f"{int(rt)} min" if pd.notna(rt) else "N/A") else: col_m1, col_m2, col_m3 = st.columns(3) with col_m1: s = row.get("number_of_seasons") st.metric("Seasons", f"{int(s)}" if pd.notna(s) else "N/A") with col_m2: e = row.get("number_of_episodes") st.metric("Episodes", f"{int(e)}" if pd.notna(e) else "N/A") with col_m3: h = row.get("total_content_hours") st.metric("Total Hours", f"{h:.0f}h" if pd.notna(h) else "N/A") # Trailer yk = row.get("trailer_youtube_key") if yk and yk != "None": st.markdown("---") st.markdown(f"**🎬 Trailer**") st.video(f"https://www.youtube.com/watch?v={yk}") # ══════════════════════════════════════════════════════════════════════════════ # TAB 3: ENGAGEMENT & VIEWERSHIP # ══════════════════════════════════════════════════════════════════════════════ with tab_engage: sec("📊 ENGAGEMENT & VIEWERSHIP ANALYTICS") col_e1, col_e2 = st.columns(2, gap="large") with col_e1: st.markdown("**Audience Engagement Score Distribution**") if "audience_engagement_score" in movies_f.columns and not movies_f.empty: fig_eng = go.Figure() if show_movies: 
fig_eng.add_trace(go.Histogram( x=movies_f["audience_engagement_score"].dropna(), name="Movies", nbinsx=30, marker_color=NF_RED, opacity=0.75, histnorm="percent")) if show_tv and "audience_engagement_score" in tv_f.columns: fig_eng.add_trace(go.Histogram( x=tv_f["audience_engagement_score"].dropna(), name="TV Shows", nbinsx=30, marker_color=PURPLE, opacity=0.75, histnorm="percent")) fig_eng.update_layout(barmode="overlay", xaxis_title="Engagement Score", yaxis_title="%") apply_theme(fig_eng, 340) st.plotly_chart(fig_eng, use_container_width=True) with col_e2: st.markdown("**IMDb Votes Tier (Viewership Proxy)**") if "imdb_votes_tier" in movies_f.columns and not movies_f.empty: tier_order = ["Mega (1M+)","Hit (500K+)","Popular (100K+)","Moderate (10K+)","Niche (<10K)"] tier_counts = (movies_f["imdb_votes_tier"].value_counts() .reindex(tier_order, fill_value=0).reset_index()) tier_counts.columns = ["tier","count"] fig_tier = px.bar(tier_counts, x="count", y="tier", orientation="h", color="count", color_continuous_scale=["#1a0505", NF_RED], text="count", labels={"count":"Movies","tier":""}) fig_tier.update_traces(texttemplate="%{text:,}", textposition="outside") fig_tier.update_layout(yaxis={"categoryorder":"array","categoryarray":tier_order[::-1]}, coloraxis_showscale=False) apply_theme(fig_tier, 340) st.plotly_chart(fig_tier, use_container_width=True) st.markdown("---") # Scatter: Engagement vs IMDb Votes col_e3, col_e4 = st.columns([2,1], gap="large") with col_e3: st.markdown("**Engagement Score vs IMDb Votes (Viewership)**") if all(c in movies_f.columns for c in ["audience_engagement_score","imdb_votes","title"]): sc_df = movies_f[ movies_f["imdb_votes"].notna() & movies_f["audience_engagement_score"].notna() ].copy() if not sc_df.empty: sc_df["size_norm"] = sc_df["vote_count"].fillna(100).clip(100, 50000) if "vote_count" in sc_df.columns else 100 fig_sc = px.scatter( sc_df, x="imdb_votes", y="audience_engagement_score", color="vote_average" if "vote_average" in 
sc_df.columns else "imdb_votes", hover_name="title", size="size_norm", size_max=25, log_x=True, color_continuous_scale=["#6C1F1F", NF_RED, GOLD, GREEN], labels={"imdb_votes":"IMDb Votes (log)","audience_engagement_score":"Engagement Score"}, ) apply_theme(fig_sc, 400) st.plotly_chart(fig_sc, use_container_width=True) with col_e4: st.markdown("**📌 Engagement Insights**") if "audience_engagement_score" in movies_f.columns and not movies_f.empty: avg_eng = movies_f["audience_engagement_score"].mean() top10 = movies_f.nlargest(10,"audience_engagement_score")["audience_engagement_score"].mean() insight(f"Avg Engagement Score: {avg_eng:.1f}/10
Top 10 avg: {top10:.1f}", "red") if "imdb_votes" in movies_f.columns and not movies_f.empty: mega = (movies_f["imdb_votes"] >= 1_000_000).sum() insight(f"Movies ระดับ Mega (1M+ IMDb Votes): {mega} เรื่อง
= ฐานผู้ชมขนาดใหญ่มาก", "teal") if "critic_audience_verdict" in movies_f.columns and not movies_f.empty: verdict_vc = movies_f["critic_audience_verdict"].value_counts() for v, cnt in verdict_vc.items(): insight(f"{v}: {cnt:,} เรื่อง") # Critic vs Audience Divergence st.markdown("---") sec("🎯 CRITIC VS AUDIENCE DIVERGENCE") col_div1, col_div2 = st.columns([3,1], gap="large") with col_div1: if all(c in movies_f.columns for c in ["rt_score","vote_average","title"]): div_df = movies_f[ movies_f["rt_score"].notna() & movies_f["vote_average"].notna() ].copy() div_df["critic_gap"] = (div_df["rt_score"]/10) - div_df["vote_average"] fig_div = px.scatter( div_df.sample(min(500,len(div_df))), x="vote_average", y="rt_score", color="critic_audience_verdict" if "critic_audience_verdict" in div_df.columns else "vote_average", hover_name="title", color_discrete_map={"Critics Favorite":PURPLE,"Audience Favorite":NF_RED,"Consensus":TEAL}, labels={"vote_average":"TMDB Rating","rt_score":"Rotten Tomatoes (%)"}, ) fig_div.add_shape(type="line", x0=0, y0=0, x1=10, y1=100, line=dict(color=NF_BORDER, dash="dash", width=1)) apply_theme(fig_div, 380) st.plotly_chart(fig_div, use_container_width=True) with col_div2: if "critic_audience_verdict" in movies_f.columns and not movies_f.empty: vc = movies_f["critic_audience_verdict"].value_counts() for v, cnt in vc.items(): style = "teal" if "Critics" in str(v) else ("red" if "Audience" in str(v) else "") icon = "🎬" if "Critics" in str(v) else ("🍿" if "Audience" in str(v) else "✅") insight(f"{icon} {v}: {cnt:,} movies", style) # ══════════════════════════════════════════════════════════════════════════════ # TAB 4: GENRE INTELLIGENCE # ══════════════════════════════════════════════════════════════════════════════ with tab_genre: sec("🎭 GENRE INTELLIGENCE") if not genre_perf.empty: col_gp1, col_gp2 = st.columns(2, gap="large") gp_movies = genre_perf[genre_perf["media_type"]=="movie"].copy() if "media_type" in genre_perf.columns else 
genre_perf.copy() gp_tv = genre_perf[genre_perf["media_type"]=="tv"].copy() if "media_type" in genre_perf.columns else pd.DataFrame() with col_gp1: st.markdown("**🎬 Movie Genres — Viewership (Total IMDb Votes)**") if not gp_movies.empty and "total_imdb_votes" in gp_movies.columns: top_gm = gp_movies.nlargest(15,"total_imdb_votes") fig_gv = px.bar(top_gm, x="total_imdb_votes", y="genre", orientation="h", color="avg_vote_average", color_continuous_scale=["#3D0000",NF_RED,GOLD,GREEN], text="total_imdb_votes", labels={"total_imdb_votes":"Total IMDb Votes","genre":""}) fig_gv.update_traces(texttemplate="%{text:,.0f}", textposition="outside") fig_gv.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=True, coloraxis_colorbar=dict(title="Avg Rating", len=0.6)) apply_theme(fig_gv, 450) st.plotly_chart(fig_gv, use_container_width=True) with col_gp2: st.markdown("**📺 TV Genres — Engagement Score**") if not gp_tv.empty and "avg_engagement" in gp_tv.columns: top_gt = gp_tv.nlargest(15,"avg_engagement") fig_ge = px.bar(top_gt, x="avg_engagement", y="genre", orientation="h", color="avg_engagement", color_continuous_scale=["#1a0040",PURPLE,"#A29BFE"], text="avg_engagement", labels={"avg_engagement":"Avg Engagement Score","genre":""}) fig_ge.update_traces(texttemplate="%{text:.2f}", textposition="outside") fig_ge.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False) apply_theme(fig_ge, 450) st.plotly_chart(fig_ge, use_container_width=True) st.markdown("---") # Bubble chart: Volume vs Rating vs Viewership st.markdown("**Genre Bubble: Volume × Rating × Viewership**") if not gp_movies.empty and all(c in gp_movies.columns for c in ["genre","title_count","avg_vote_average","avg_imdb_votes"]): gp_bubble = gp_movies.dropna(subset=["avg_imdb_votes"]).head(20) fig_bub = px.scatter( gp_bubble, x="title_count", y="avg_vote_average", size="avg_imdb_votes", color="genre", hover_name="genre", color_discrete_sequence=CHART_COLORS, 
size_max=60, labels={"title_count":"Number of Titles","avg_vote_average":"Avg Rating"}, text="genre" ) fig_bub.update_traces(textposition="top center", textfont_size=10) apply_theme(fig_bub, 420) st.plotly_chart(fig_bub, use_container_width=True) else: # Fallback to computed genres col_g1, col_g2 = st.columns(2, gap="large") with col_g1: if "genres" in movies_f.columns and not movies_f.empty: gc = (movies_f.explode("genres").groupby("genres")["title"] .count().reset_index().rename(columns={"title":"count","genres":"genre"}) .sort_values("count",ascending=False).head(15)) fig_gc = px.bar(gc, x="count", y="genre", orientation="h", color="count", color_continuous_scale=["#3D0000",NF_RED], text="count", title="Volume by Genre") fig_gc.update_traces(texttemplate="%{text:,}", textposition="outside") fig_gc.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False) apply_theme(fig_gc, 420) st.plotly_chart(fig_gc, use_container_width=True) with col_g2: if "genres" in movies_f.columns and "vote_average" in movies_f.columns and not movies_f.empty: gr = (movies_f.explode("genres").groupby("genres")["vote_average"] .agg(["mean","count"]).reset_index() .rename(columns={"genres":"genre","mean":"avg_rating"}) .query("count >= 10").sort_values("avg_rating",ascending=False).head(15)) fig_gr = px.bar(gr, x="avg_rating", y="genre", orientation="h", color="avg_rating", color_continuous_scale=[NF_RED,GOLD,GREEN], text="avg_rating", title="Quality by Genre") fig_gr.update_traces(texttemplate="%{text:.2f}", textposition="outside") fig_gr.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False) apply_theme(fig_gr, 420) st.plotly_chart(fig_gr, use_container_width=True) # Genre × Decade Heatmap st.markdown("---") sec("🗓️ GENRE POPULARITY BY DECADE") if "genres" in movies_f.columns and "release_year" in movies_f.columns and not movies_f.empty: movies_f_dec = movies_f.copy() movies_f_dec["decade_str"] = (movies_f_dec["release_year"] // 10 * 
10).astype("Int64").astype(str) + "s" hmap = (movies_f_dec.explode("genres") .groupby(["decade_str","genres"]).size().reset_index(name="count")) if not hmap.empty: top_genres_hmap = hmap.groupby("genres")["count"].sum().nlargest(12).index.tolist() hmap_top = hmap[hmap["genres"].isin(top_genres_hmap)] pivot = hmap_top.pivot(index="genres", columns="decade_str", values="count").fillna(0) fig_hm = px.imshow(pivot, color_continuous_scale=["#0a0a0a",hex_rgba(NF_RED,0.4),NF_RED], labels={"color":"Titles"}, aspect="auto") fig_hm.update_layout(xaxis_title="Decade", yaxis_title="") apply_theme(fig_hm, 380) st.plotly_chart(fig_hm, use_container_width=True) # ══════════════════════════════════════════════════════════════════════════════ # TAB 5: MOVIES ANALYSIS # ══════════════════════════════════════════════════════════════════════════════ with tab_movies: sec("🎬 MOVIES DEEP-DIVE") tab_m1, tab_m2, tab_m3, tab_m4 = st.tabs(["🏆 Top Rated","💰 Box Office","🎭 Multi-Score","🏢 Franchises"]) with tab_m1: col_m1, col_m2 = st.columns([3,1], gap="large") with col_m1: if "vote_average" in movies_f.columns and not movies_f.empty: # ── FIX 2: guard optional OMDB columns ─────────────────────── _m1_want = ["title","vote_average","vote_count","release_year","imdb_rating","has_awards"] _m1_cols = safe_cols(movies_f, _m1_want) vc_series = movies_f["vote_count"] if "vote_count" in movies_f.columns else pd.Series([999]*len(movies_f), index=movies_f.index) top_r = (movies_f[vc_series >= 200] .nlargest(12, "vote_average") [_m1_cols] .reset_index(drop=True)) fig_tr = px.bar(top_r, x="vote_average", y="title", orientation="h", color="vote_average", color_continuous_scale=["#6C1F1F",NF_RED,"#FF8C8C"], text="vote_average", custom_data=safe_cols(top_r, ["vote_count","release_year","imdb_rating"])) fig_tr.update_traces( texttemplate="%{text:.2f}", textposition="outside", hovertemplate="%{y}
Rating: %{x:.2f}") fig_tr.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False) apply_theme(fig_tr, 450) st.plotly_chart(fig_tr, use_container_width=True) with col_m2: st.markdown("#### 🖼️ Top Picks") vc_series2 = movies_f["vote_count"] if "vote_count" in movies_f.columns else pd.Series([999]*len(movies_f), index=movies_f.index) top_imgs = (movies_f[vc_series2 >= 200].nlargest(4,"vote_average") if "vote_average" in movies_f.columns else movies_f.head(4)) for _, row in top_imgs.iterrows(): p = row.get("poster_url","") if p and p != "None": st.image(p, width=150, caption=str(row.get("title",""))[:30]) with tab_m2: if all(c in movies_f.columns for c in ["budget_usd","revenue_usd","title"]): sc_bo = movies_f[(movies_f["budget_usd"]>1e6)&(movies_f["revenue_usd"]>1e6)].copy() if not sc_bo.empty: col_b1, col_b2 = st.columns([3,1], gap="large") with col_b1: fig_bo = px.scatter( sc_bo, x="budget_usd", y="revenue_usd", color="roi" if "roi" in sc_bo.columns else "vote_average", size="vote_count" if "vote_count" in sc_bo.columns else None, hover_name="title", color_continuous_scale=["#6C1F1F",NF_RED,GOLD,GREEN], log_x=True, log_y=True, labels={"budget_usd":"Budget (USD)","revenue_usd":"Revenue (USD)"}, ) mx = max(sc_bo["budget_usd"].max(), sc_bo["revenue_usd"].max()) fig_bo.add_shape(type="line",x0=1e6,y0=1e6,x1=mx,y1=mx, line=dict(color="#444",dash="dash",width=1)) apply_theme(fig_bo, 450) st.plotly_chart(fig_bo, use_container_width=True) with col_b2: st.markdown("#### 💰 Box Office") top_rev = sc_bo.nlargest(5,"revenue_usd") for _, r in top_rev.iterrows(): p = r.get("poster_url","") col_pi, col_ti = st.columns([1,2]) with col_pi: if p and p!="None": st.image(p, width=60) with col_ti: st.markdown(f"**{str(r.get('title',''))[:20]}**") st.caption(f"${r.get('revenue_usd',0)/1e9:.1f}B") # Budget tier breakdown if "budget_tier" in movies_f.columns: st.markdown("---") bt = movies_f["budget_tier"].value_counts().reset_index() bt.columns = 
["tier","count"] col_bt1, col_bt2 = st.columns(2) with col_bt1: fig_bt = px.pie(bt, names="tier", values="count", hole=0.5, color_discrete_sequence=CHART_COLORS, title="Movies by Budget Tier") fig_bt.update_traces(textinfo="percent+label") apply_theme(fig_bt, 300) st.plotly_chart(fig_bt, use_container_width=True) with col_bt2: if "roi" in movies_f.columns: roi_by_tier = (movies_f.groupby("budget_tier")["roi"] .mean().reset_index().rename(columns={"roi":"avg_roi"})) fig_rt = px.bar(roi_by_tier, x="budget_tier", y="avg_roi", color="avg_roi", color_continuous_scale=["#3D0000",NF_RED,GREEN], text="avg_roi", title="Avg ROI by Budget Tier") fig_rt.update_traces(texttemplate="%{text:.1f}x", textposition="outside") fig_rt.update_layout(coloraxis_showscale=False) apply_theme(fig_rt, 300) st.plotly_chart(fig_rt, use_container_width=True) with tab_m3: st.markdown("**Multi-Score Comparison: TMDB vs IMDb vs RT vs Metacritic**") if all(c in movies_f.columns for c in ["vote_average","imdb_rating"]): multi_df = movies_f.dropna(subset=["vote_average","imdb_rating"]).copy() if not multi_df.empty: multi_sample = multi_df.nlargest(50,"vote_count") if "vote_count" in multi_df.columns else multi_df.head(50) fig_ms = go.Figure() x_titles = multi_sample["title"].str[:25].tolist() for col_name, color, label in [ ("vote_average", TEAL, "TMDB (×10)"), ("imdb_rating", GOLD, "IMDb (×10)"), ]: if col_name in multi_sample.columns: fig_ms.add_trace(go.Bar( name=label, x=x_titles, y=multi_sample[col_name]*10, marker_color=color, opacity=0.8 )) if "rt_score" in multi_sample.columns: fig_ms.add_trace(go.Bar( name="RT Score", x=x_titles, y=multi_sample["rt_score"].fillna(0), marker_color=NF_RED, opacity=0.8 )) fig_ms.update_layout(barmode="group", xaxis_tickangle=-45, xaxis_title="", yaxis_title="Score (normalized to 100)") apply_theme(fig_ms, 450) st.plotly_chart(fig_ms, use_container_width=True) else: st.info("Multi-score comparison ต้องการข้อมูล OMDB — รัน fetch_omdb_enrichment ใน pipeline ก่อน") 
import html as _html  # stdlib; escapes data-derived text before it is placed into raw HTML below

# Franchises sub-tab of the Movies tab (the tab_m4 container is created earlier by st.tabs).
with tab_m4:
    if not franchises.empty:
        st.markdown("**🏢 Top Franchises by Total Revenue**")
        col_f1, col_f2 = st.columns([2,1], gap="large")
        with col_f1:
            # Only draw when the revenue column exists: the chart plots it on the x axis,
            # so the former head(15) fallback could never render.
            if "total_revenue_usd" in franchises.columns:
                top_fr = franchises.nlargest(15,"total_revenue_usd")
                fig_fr = px.bar(top_fr, x="total_revenue_usd", y="collection_name", orientation="h",
                                color="movie_count", color_continuous_scale=["#1a0010",PINK],
                                text="total_revenue_usd",
                                labels={"total_revenue_usd":"Total Revenue (USD)","collection_name":""})
                fig_fr.update_traces(texttemplate="$%{text:,.0f}", textposition="outside")
                fig_fr.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                apply_theme(fig_fr, 450)
                st.plotly_chart(fig_fr, use_container_width=True)
        with col_f2:
            if "franchise_roi" in franchises.columns:
                top_roi_fr = franchises.dropna(subset=["franchise_roi"]).nlargest(8,"franchise_roi")
                fig_froi = px.bar(top_roi_fr, x="franchise_roi", y="collection_name", orientation="h",
                                  color="franchise_roi", color_continuous_scale=[NF_RED,GOLD,GREEN],
                                  text="franchise_roi", title="Best ROI Franchises")
                fig_froi.update_traces(texttemplate="%{text:.1f}x", textposition="outside")
                fig_froi.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                apply_theme(fig_froi, 450)
                st.plotly_chart(fig_froi, use_container_width=True)
    else:
        st.info("ไม่มีข้อมูล Franchise — รัน Pipeline ใหม่เพื่อสร้าง franchises.parquet")

# ══════════════════════════════════════════════════════════════════════════════
# TAB 6: TV SHOWS
# ══════════════════════════════════════════════════════════════════════════════
with tab_tv:
    sec("📺 TV SHOWS DEEP-DIVE")
    tab_tv1, tab_tv2, tab_tv3 = st.tabs(["🏆 Top Rated","📊 Structure","🌱 Renewal Signal"])

    with tab_tv1:
        # Vote-count floor shared by the chart and the poster strip. When the column is
        # missing, a constant 999 keeps every row above the >= 100 threshold.
        vc_tv = (tv_f["vote_count"] if "vote_count" in tv_f.columns
                 else pd.Series([999]*len(tv_f), index=tv_f.index))
        col_tv1, col_tv2 = st.columns([3,1], gap="large")
        with col_tv1:
            if "vote_average" in tv_f.columns and "name" in tv_f.columns and not tv_f.empty:
                # Guard optional OMDB columns: only select the ones that are present.
                # (safe_cols is defined outside this chunk.)
                _tv1_want = ["name","vote_average","vote_count","number_of_seasons","imdb_rating"]
                _tv1_cols = safe_cols(tv_f, _tv1_want)
                top_tv_r = (tv_f[vc_tv >= 100]
                            .nlargest(12, "vote_average")
                            [_tv1_cols]
                            .reset_index(drop=True))
                fig_tvr = px.bar(top_tv_r, x="vote_average", y="name", orientation="h",
                                 color="vote_average",
                                 color_continuous_scale=["#1a0040",PURPLE,"#A29BFE"],
                                 text="vote_average")
                # NOTE(review): in this view the hovertemplate literal was split across two
                # lines with any markup missing; "<br>" restores the line break Plotly
                # expects. Confirm against the original string.
                fig_tvr.update_traces(
                    texttemplate="%{text:.2f}", textposition="outside",
                    hovertemplate="%{y}<br>Rating: %{x:.2f}")
                fig_tvr.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                apply_theme(fig_tvr, 450)
                st.plotly_chart(fig_tvr, use_container_width=True)
        with col_tv2:
            st.markdown("#### 🖼️ Top Picks")
            tv_top_imgs = (tv_f[vc_tv >= 100].nlargest(4,"vote_average")
                           if "vote_average" in tv_f.columns else tv_f.head(4))
            for _, row in tv_top_imgs.iterrows():
                p = row.get("poster_url","")
                # isinstance check: a missing poster stored as NaN is truthy and != "None",
                # so the plain truthiness test would hand NaN to st.image.
                if isinstance(p, str) and p and p != "None":
                    st.image(p, width=150, caption=str(row.get("name",""))[:25])

    with tab_tv2:
        col_ts1, col_ts2, col_ts3 = st.columns(3, gap="large")
        with col_ts1:
            if "status" in tv_f.columns and not tv_f.empty:
                sc = tv_f["status"].value_counts().reset_index()
                sc.columns = ["status","count"]
                colors_map = {"Returning Series":GREEN,"Ended":NF_RED,"Canceled":"#E17055",
                              "In Production":TEAL,"Planned":PURPLE}
                fig_st = px.pie(sc, names="status", values="count", hole=0.55,
                                color="status", color_discrete_map=colors_map,
                                title="TV Status")
                fig_st.update_traces(textinfo="percent+label", textfont_size=10)
                apply_theme(fig_st, 320)
                st.plotly_chart(fig_st, use_container_width=True)
        with col_ts2:
            if "number_of_seasons" in tv_f.columns and not tv_f.empty:
                sd = (tv_f["number_of_seasons"].dropna().astype(int)
                      .value_counts().sort_index().reset_index())
                sd.columns = ["seasons","count"]
                sd = sd[sd["seasons"] <= 20]  # cap the axis at 20 seasons
                fig_sd = px.bar(sd, x="seasons", y="count",
                                color="count", color_continuous_scale=["#1a0040",PURPLE],
                                text="count", title="Seasons Distribution")
                fig_sd.update_traces(texttemplate="%{text}", textposition="outside")
                fig_sd.update_layout(coloraxis_showscale=False, bargap=0.3)
                apply_theme(fig_sd, 320)
                st.plotly_chart(fig_sd, use_container_width=True)
        with col_ts3:
            if "episode_format" in tv_f.columns and not tv_f.empty:
                ef = tv_f["episode_format"].value_counts().reset_index()
                ef.columns = ["format","count"]
                fig_ef = px.pie(ef, names="format", values="count", hole=0.5,
                                color_discrete_sequence=CHART_COLORS,
                                title="Episode Format")
                fig_ef.update_traces(textinfo="percent+label", textfont_size=10)
                apply_theme(fig_ef, 320)
                st.plotly_chart(fig_ef, use_container_width=True)

        # Total content hours
        if "total_content_hours" in tv_f.columns and not tv_f.empty:
            st.markdown("---")
            st.markdown("**Total Content Hours (Top 20 shows)**")
            top_hours = tv_f.nlargest(20,"total_content_hours")[safe_cols(tv_f,["name","total_content_hours","number_of_seasons"])].dropna()
            fig_hrs = px.bar(top_hours, x="total_content_hours", y="name", orientation="h",
                             color="number_of_seasons" if "number_of_seasons" in top_hours.columns else "total_content_hours",
                             color_continuous_scale=["#1a0040",PURPLE],
                             text="total_content_hours",
                             labels={"total_content_hours":"Total Watch Hours","name":""})
            fig_hrs.update_traces(texttemplate="%{text:.0f}h", textposition="outside")
            fig_hrs.update_layout(yaxis={"categoryorder":"total ascending"})
            apply_theme(fig_hrs, 400)
            st.plotly_chart(fig_hrs, use_container_width=True)

        # TV Seasons detail
        if not tv_seasons.empty and "season_number" in tv_seasons.columns:
            st.markdown("---")
            sec("🗓️ SEASON-LEVEL ANALYSIS")
            if "name" in tv_seasons.columns:
                # The picker offers at most the first 100 distinct show names.
                sel_show = st.selectbox("เลือก TV Show", tv_seasons["name"].dropna().unique().tolist()[:100])
                show_seasons = tv_seasons[tv_seasons["name"]==sel_show].sort_values("season_number")
                if not show_seasons.empty:
                    col_ss1, col_ss2 = st.columns(2)
                    with col_ss1:
                        if "episode_count" in show_seasons.columns:
                            fig_ssn = px.bar(show_seasons, x="season_number", y="episode_count",
                                             color="vote_average" if "vote_average" in show_seasons.columns else "season_number",
                                             color_continuous_scale=[PURPLE,"#A29BFE"],
                                             text="episode_count",
                                             labels={"season_number":"Season","episode_count":"Episodes"})
                            fig_ssn.update_traces(texttemplate="%{text}", textposition="outside")
                            fig_ssn.update_layout(coloraxis_showscale=False)
                            apply_theme(fig_ssn, 280)
                            st.plotly_chart(fig_ssn, use_container_width=True)
                    with col_ss2:
                        disp_cols = safe_cols(show_seasons, ["season_number","season_name","air_date","episode_count","vote_average","season_position"])
                        st.dataframe(show_seasons[disp_cols].reset_index(drop=True),
                                     use_container_width=True, height=280)

    with tab_tv3:
        if "renewal_signal" in tv_f.columns and not tv_f.empty:
            rs = tv_f["renewal_signal"].value_counts().reset_index()
            rs.columns = ["signal","count"]
            colors_rs = {
                "Renewed/Ongoing":GREEN,"Strong Candidate":TEAL,
                "Possible":GOLD,"Unlikely / Ended":NF_RED
            }
            col_rs1, col_rs2 = st.columns([2,1], gap="large")
            with col_rs1:
                fig_rs = px.bar(rs, x="count", y="signal", orientation="h",
                                color="signal", color_discrete_map=colors_rs,
                                text="count", title="TV Shows Renewal Likelihood")
                fig_rs.update_traces(texttemplate="%{text:,}", textposition="outside")
                fig_rs.update_layout(yaxis={"categoryorder":"total ascending"}, showlegend=False)
                apply_theme(fig_rs, 320)
                st.plotly_chart(fig_rs, use_container_width=True)

                st.markdown("**📺 Strong Candidates — Shows Likely to Return**")
                strong_rows = tv_f[tv_f["renewal_signal"]=="Strong Candidate"]
                strong = strong_rows.nlargest(12,"vote_average") if "vote_average" in tv_f.columns else strong_rows.head(12)
                if not strong.empty:
                    # poster_gallery is defined outside this chunk.
                    poster_gallery(strong, title_col="name", year_col="first_air_year",
                                   rating_col="vote_average", poster_col="poster_url", max_cards=10)
            with col_rs2:
                # One insight line per signal with its show count (insight is defined outside this chunk).
                for signal, style in [("Renewed/Ongoing","green"),("Strong Candidate","teal"),
                                      ("Possible","gold"),("Unlikely / Ended","red")]:
                    cnt = tv_f[tv_f["renewal_signal"]==signal].shape[0]
                    insight(f"{signal}: {cnt:,} shows", style)
        else:
            st.info("ไม่มีข้อมูล renewal_signal — ต้องรัน PySpark pipeline ใหม่")

# ══════════════════════════════════════════════════════════════════════════════
# TAB 7: TALENT
# ══════════════════════════════════════════════════════════════════════════════
with tab_talent:
    sec("🌟 TALENT & CREDITS")
    if not top_talent.empty:
        col_tt1, col_tt2 = st.columns([3,1], gap="large")
        # Without a "role" column every row is treated as cast and there is no crew frame.
        cast_talent = top_talent[top_talent["role"]=="cast"].copy() if "role" in top_talent.columns else top_talent.copy()
        crew_talent = top_talent[top_talent["role"]=="crew"].copy() if "role" in top_talent.columns else pd.DataFrame()

        with col_tt1:
            st.markdown("**🎭 Most Impactful Cast — Avg Engagement Score**")
            if not cast_talent.empty and "avg_content_engagement" in cast_talent.columns:
                top_cast_t = cast_talent.nlargest(15,"avg_content_engagement")
                color_col = "total_imdb_votes_across_titles" if "total_imdb_votes_across_titles" in top_cast_t.columns else "avg_content_engagement"
                fig_ct = px.bar(top_cast_t, x="avg_content_engagement", y="name", orientation="h",
                                color=color_col, color_continuous_scale=["#001433",TEAL],
                                text="avg_content_engagement",
                                labels={"avg_content_engagement":"Avg Engagement","name":""})
                fig_ct.update_traces(texttemplate="%{text:.2f}", textposition="outside")
                fig_ct.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                apply_theme(fig_ct, 460)
                st.plotly_chart(fig_ct, use_container_width=True)

        with col_tt2:
            st.markdown("**🖼️ Top Talent Profiles**")
            top5_cast = (cast_talent.nlargest(8,"avg_content_engagement")
                         if not cast_talent.empty and "avg_content_engagement" in cast_talent.columns
                         else cast_talent.head(8))
            # NOTE(review): the original card markup (tags / CSS classes) was not visible in
            # this view. This is a minimal inline-styled reconstruction built from the theme
            # constants — confirm it against the app stylesheet.
            # Fragments are joined without newlines or indentation so that Markdown cannot
            # treat the HTML as an indented code block.
            card_parts = ['<div style="display:flex;flex-wrap:wrap;gap:10px;">']
            for _, row in top5_cast.iterrows():
                profile = row.get("profile_url","")
                # A missing URL may arrive as NaN (truthy, not "None"), so test for str.
                if not isinstance(profile, str) or not profile or profile == "None":
                    profile = "https://via.placeholder.com/72x72/181818/555?text=👤"
                name = str(row.get("name",""))[:20]
                eng = row.get("avg_content_engagement",0)
                tc = row.get("title_count",0)
                # Name and URL come from the data set; escape them because the markup is
                # rendered with unsafe_allow_html=True.
                card_parts.append(
                    f'<div style="background:{NF_CARD};border:1px solid {NF_BORDER};'
                    f'border-radius:8px;padding:8px;text-align:center;width:104px;">'
                    f'<img src="{_html.escape(profile, quote=True)}" width="72" height="72" '
                    f'style="border-radius:50%;object-fit:cover;">'
                    f'<div style="color:{TEXT_PRI};font-size:12px;font-weight:600;">{_html.escape(name)}</div>'
                    f'<div style="color:{TEXT_MUT};font-size:11px;">Score: {eng:.1f} · {tc} titles</div>'
                    f'</div>'
                )
            card_parts.append("</div>")
            cards_html = "".join(card_parts)
            st.markdown(cards_html, unsafe_allow_html=True)

        st.markdown("---")
        if not crew_talent.empty:
            st.markdown("**🎬 Top Directors & Producers**")
            col_cr1, col_cr2 = st.columns([2,2], gap="large")
            with col_cr1:
                # Only draw when the engagement column exists: the chart plots it on the
                # x axis, so the former head(12) fallback could never render.
                if "avg_content_engagement" in crew_talent.columns:
                    top_crew = crew_talent.nlargest(12,"avg_content_engagement")
                    fig_crew = px.bar(top_crew, x="avg_content_engagement", y="name", orientation="h",
                                      color="title_count" if "title_count" in top_crew.columns else "avg_content_engagement",
                                      color_continuous_scale=["#1a0a20",PURPLE],
                                      text="avg_content_engagement",
                                      labels={"avg_content_engagement":"Avg Engagement","name":""})
                    fig_crew.update_traces(texttemplate="%{text:.2f}", textposition="outside")
                    fig_crew.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                    apply_theme(fig_crew, 400)
                    st.plotly_chart(fig_crew, use_container_width=True)
            with col_cr2:
                if "gender" in cast_talent.columns and not cast_talent.empty:
                    gen_dist = cast_talent["gender"].value_counts().reset_index()
                    gen_dist.columns = ["gender","count"]
                    fig_gen = px.pie(gen_dist, names="gender", values="count", hole=0.55,
                                     title="Cast Gender Distribution",
                                     color="gender",
                                     color_discrete_map={"Female":TEAL,"Male":PURPLE,"Unknown":NF_BORDER})
                    fig_gen.update_traces(textinfo="percent+label", textfont_size=12)
                    apply_theme(fig_gen, 380)
                    st.plotly_chart(fig_gen, use_container_width=True)
    else:
        # Fallback to credits
        col_c1, col_c2 = st.columns([3,1], gap="large")
        with col_c1:
            if not credits.empty and "role" in credits.columns:
                top_cast = (credits[credits["role"]=="cast"]
                            .groupby("name").size().reset_index(name="appearances")
                            .nlargest(15,"appearances"))
                fig_cast = px.bar(top_cast, x="appearances", y="name", orientation="h",
                                  color="appearances", color_continuous_scale=["#001433",TEAL],
                                  text="appearances",
                                  labels={"appearances":"Appearances","name":""})
                fig_cast.update_traces(texttemplate="%{text}", textposition="outside")
                fig_cast.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                apply_theme(fig_cast, 450)
                st.plotly_chart(fig_cast, use_container_width=True)
        with col_c2:
            # "role" is required too: the pie filters on credits["role"] just like the
            # bar chart in the left column does.
            if "gender" in credits.columns and "role" in credits.columns and not credits.empty:
                gd = credits[credits["role"]=="cast"]["gender"].value_counts().reset_index()
                gd.columns = ["gender","count"]
                fig_g = px.pie(gd, names="gender", values="count", hole=0.55,
                               color="gender",
                               color_discrete_map={"Female":TEAL,"Male":PURPLE,"Unknown":"#333"},
                               title="Gender Distribution")
                fig_g.update_traces(textinfo="percent+label")
                apply_theme(fig_g, 300)
                st.plotly_chart(fig_g, use_container_width=True)

    # Keywords treemap
    st.markdown("---")
    sec("🔑 TRENDING THEMES & KEYWORDS")
    if not keywords.empty and "keyword" in keywords.columns:
        col_kw1, col_kw2 = st.columns([3,1], gap="large")
        with col_kw1:
            # Row count per keyword; keep the 30 most frequent.
            top_kw = keywords.groupby("keyword").size().reset_index(name="count").nlargest(30,"count")
            fig_kw = px.treemap(top_kw, path=["keyword"], values="count",
                                color="count", color_continuous_scale=["#200000","#6C1F1F",NF_RED],
                                title="Top 30 Content Themes")
            fig_kw.update_traces(textfont=dict(size=12,family="DM Sans"))
            apply_theme(fig_kw, 400)
            st.plotly_chart(fig_kw, use_container_width=True)
        with col_kw2:
            top5_kw = top_kw.head(5)
            for _, r in top5_kw.iterrows():
                insight(f"🔑 '{r['keyword']}' — {r['count']:,} titles", "red")

# ══════════════════════════════════════════════════════════════════════════════
# TAB 8: YEAR-OVER-YEAR TRENDS
# ══════════════════════════════════════════════════════════════════════════════
with tab_trends:
    sec("📈 YEAR-OVER-YEAR TRENDS")

    if not yoy_trend.empty and "year" in yoy_trend.columns:
        # Split by media type when possible; otherwise all rows go to the "movie" series.
        yoy_m  = yoy_trend[yoy_trend["media_type"]=="movie"].sort_values("year") if "media_type" in yoy_trend.columns else yoy_trend.sort_values("year")
        yoy_tv = yoy_trend[yoy_trend["media_type"]=="tv"].sort_values("year") if "media_type" in yoy_trend.columns else pd.DataFrame()

        col_yr1, col_yr2 = st.columns(2, gap="large")
        with col_yr1:
            st.markdown("**Avg Rating Over Time**")
            fig_yr1 = go.Figure()
            if not yoy_m.empty and "avg_rating" in yoy_m.columns:
                fig_yr1.add_trace(go.Scatter(x=yoy_m["year"], y=yoy_m["avg_rating"],
                                             name="Movies", mode="lines+markers",
                                             line=dict(color=NF_RED,width=2.5), marker_size=5))
            if not yoy_tv.empty and "avg_rating" in yoy_tv.columns:
                fig_yr1.add_trace(go.Scatter(x=yoy_tv["year"], y=yoy_tv["avg_rating"],
                                             name="TV Shows", mode="lines+markers",
                                             line=dict(color=PURPLE,width=2.5), marker_size=5))
            fig_yr1.update_layout(xaxis_title="Year", yaxis_title="Avg Rating")
            apply_theme(fig_yr1, 320)
            st.plotly_chart(fig_yr1, use_container_width=True)

        with col_yr2:
            st.markdown("**Avg Engagement Over Time**")
            fig_yr2 = go.Figure()
            if not yoy_m.empty and "avg_engagement" in yoy_m.columns:
                fig_yr2.add_trace(go.Scatter(x=yoy_m["year"], y=yoy_m["avg_engagement"],
                                             name="Movies", mode="lines", fill="tozeroy",
                                             line=dict(color=NF_RED,width=2),
                                             fillcolor=hex_rgba(NF_RED,0.12)))
            if not yoy_tv.empty and "avg_engagement" in yoy_tv.columns:
                fig_yr2.add_trace(go.Scatter(x=yoy_tv["year"], y=yoy_tv["avg_engagement"],
                                             name="TV Shows", mode="lines", fill="tozeroy",
                                             line=dict(color=PURPLE,width=2),
                                             fillcolor=hex_rgba(PURPLE,0.12)))
            fig_yr2.update_layout(xaxis_title="Year", yaxis_title="Avg Engagement Score")
            apply_theme(fig_yr2, 320)
            st.plotly_chart(fig_yr2, use_container_width=True)

        st.markdown("---")
        col_yr3, col_yr4 = st.columns(2, gap="large")
        with col_yr3:
            if "trending_titles" in yoy_m.columns and not yoy_m.empty:
                st.markdown("**Trending Titles by Year**")
                fig_tr = px.bar(yoy_m, x="year", y="trending_titles",
                                color="trending_titles",
                                color_continuous_scale=[hex_rgba(TEAL,0.3),TEAL],
                                labels={"trending_titles":"Trending Titles","year":"Year"})
                fig_tr.update_layout(coloraxis_showscale=False)
                apply_theme(fig_tr, 280)
                st.plotly_chart(fig_tr, use_container_width=True)
        with col_yr4:
            if "awarded_titles" in yoy_m.columns and not yoy_m.empty:
                st.markdown("**Award Winners by Year**")
                fig_aw = px.bar(yoy_m, x="year", y="awarded_titles",
                                color="awarded_titles",
                                color_continuous_scale=[hex_rgba(GOLD,0.3),GOLD],
                                labels={"awarded_titles":"Award Winners","year":"Year"})
                fig_aw.update_layout(coloraxis_showscale=False)
                apply_theme(fig_aw, 280)
                st.plotly_chart(fig_aw, use_container_width=True)

        if "avg_imdb_votes" in yoy_m.columns and not yoy_m.empty:
            st.markdown("---")
            st.markdown("**Avg IMDb Votes (Viewership Proxy) Over Time**")
            fig_iv = go.Figure()
            fig_iv.add_trace(go.Scatter(x=yoy_m["year"], y=yoy_m["avg_imdb_votes"],
                                        name="Movies", mode="lines+markers",
                                        line=dict(color=GOLD,width=2.5), marker_size=5,
                                        fill="tozeroy", fillcolor=hex_rgba(GOLD,0.1)))
            if not yoy_tv.empty and "avg_imdb_votes" in yoy_tv.columns:
                fig_iv.add_trace(go.Scatter(x=yoy_tv["year"], y=yoy_tv["avg_imdb_votes"],
                                            name="TV Shows", mode="lines+markers",
                                            line=dict(color=PURPLE,width=2.5), marker_size=5))
            fig_iv.update_layout(xaxis_title="Year", yaxis_title="Avg IMDb Votes")
            apply_theme(fig_iv, 320)
            st.plotly_chart(fig_iv, use_container_width=True)
    else:
        st.info("ไม่มีข้อมูล yoy_trend.parquet — แสดงจากข้อมูล filtered แทน")
        # The aggregation reads all three columns, so require all of them before plotting.
        if all(c in movies_f.columns for c in ["release_year","title","vote_average"]) and not movies_f.empty:
            yr_data = movies_f.groupby("release_year").agg(
                count=("title","count"), avg_rating=("vote_average","mean")
            ).reset_index()
            # Bars (title count) on the primary axis, average rating line on the secondary axis.
            fig_fb = make_subplots(specs=[[{"secondary_y":True}]])
            fig_fb.add_trace(go.Bar(x=yr_data["release_year"], y=yr_data["count"],
                                    name="Count", marker_color=hex_rgba(NF_RED,0.5)), secondary_y=False)
            fig_fb.add_trace(go.Scatter(x=yr_data["release_year"], y=yr_data["avg_rating"],
                                        name="Avg Rating", line=dict(color=GOLD,width=2.5)), secondary_y=True)
            # apply_theme applies PLOTLY_BASE plus the height, same as every other chart here.
            apply_theme(fig_fb, 380)
            st.plotly_chart(fig_fb, use_container_width=True)

    # Language Summary
    st.markdown("---")
    sec("🌍 LANGUAGE & REGION ANALYTICS")
    if not lang_summary.empty:
        col_ls1, col_ls2 = st.columns(2, gap="large")
        with col_ls1:
            if all(c in lang_summary.columns for c in ["original_language","title_count"]):
                top_lang = lang_summary.nlargest(15,"title_count").copy()
                # Display names for common language codes; unmapped codes are shown as-is.
                lang_map = {"en":"English","ja":"Japanese","ko":"Korean","fr":"French","es":"Spanish",
                            "de":"German","it":"Italian","pt":"Portuguese","zh":"Chinese","hi":"Hindi",
                            "ru":"Russian","th":"Thai","ar":"Arabic","nl":"Dutch","sv":"Swedish"}
                top_lang["lang_name"] = top_lang["original_language"].map(lang_map).fillna(top_lang["original_language"])
                fig_ls = px.bar(top_lang, x="title_count", y="lang_name", orientation="h",
                                color="avg_popularity" if "avg_popularity" in top_lang.columns else "title_count",
                                color_continuous_scale=["#001a33",TEAL],
                                text="title_count",
                                labels={"title_count":"Titles","lang_name":""})
                fig_ls.update_traces(texttemplate="%{text:,}", textposition="outside")
                fig_ls.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
                apply_theme(fig_ls, 380)
                st.plotly_chart(fig_ls, use_container_width=True)
        with col_ls2:
            # "title_count" is summed below, so it must be present as well.
            if all(c in lang_summary.columns for c in ["language_group","title_count"]):
                lg_grp = lang_summary.groupby("language_group")["title_count"].sum().reset_index()
                fig_lg = px.pie(lg_grp, names="language_group", values="title_count",
                                hole=0.55, color_discrete_sequence=CHART_COLORS,
                                title="Content by Language Group")
                fig_lg.update_traces(textinfo="percent+label", textfont_size=11)
                apply_theme(fig_lg, 380)
                st.plotly_chart(fig_lg, use_container_width=True)
    else:
        if "original_language" in movies_f.columns and not movies_f.empty:
            lang_cnt = movies_f["original_language"].value_counts().head(12).reset_index()
            lang_cnt.columns = ["language","count"]
            fig_lf = px.bar(lang_cnt, x="count", y="language", orientation="h",
                            color="count", color_continuous_scale=["#001a33",TEAL], text="count")
            fig_lf.update_traces(texttemplate="%{text:,}", textposition="outside")
            fig_lf.update_layout(yaxis={"categoryorder":"total ascending"}, coloraxis_showscale=False)
            apply_theme(fig_lf, 380)
            st.plotly_chart(fig_lf, use_container_width=True)

# ─────────────────────────────────────────────────────────────────────────────
# RAW DATA EXPLORER
#
# ─────────────────────────────────────────────────────────────────────────────
# Collapsed expander with one tab per data frame; each tab shows the record
# count and, when the frame is non-empty, its first 200 rows.
st.markdown("---")
with st.expander("🗃️ Raw Data Explorer", expanded=False):
    tabs_raw = st.tabs(["🎬 Movies","📺 TV","🎭 Credits","🔑 Keywords",
                        "📊 Content Perf","🎭 Genre Perf","🌟 Talent","📈 YoY"])
    # Order must match the tab labels above: frames are paired with tabs by position.
    data_map = [
        movies_f, tv_f, credits, keywords,
        content_perf, genre_perf, top_talent, yoy_trend,
    ]
    for i, df in enumerate(data_map):
        with tabs_raw[i]:
            st.caption(f"{len(df):,} records")
            if not df.empty:
                # Preview only the first 200 rows.
                st.dataframe(df.head(200), use_container_width=True, height=320)

# ─────────────────────────────────────────────────────────────────────────────
# FOOTER
# ─────────────────────────────────────────────────────────────────────────────
# NOTE(review): the call passes unsafe_allow_html=True, but no markup is visible
# inside the string in this view — it may have been lost in extraction. Confirm
# the literal against the original file before relying on it.
st.markdown(f"""
NETFLIX ANALYTICS 2.0  ·  TMDB API + OMDB API  ·  AIRFLOW → PYSPARK → HUGGINGFACE
New Tables: content_performance · genre_performance · top_talent · yoy_trend · tv_seasons · language_summary
""", unsafe_allow_html=True)