# Spaces: ESCP / RX12-Netflix-Trend-Analytics
# Status: Sleeping
# File size: 13,528 Bytes
# Revision: 16aed1c

import os
import re
import numpy as np
import pandas as pd
import streamlit as st
import plotly.express as px

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Page-level settings: browser tab title and a full-width layout.
st.set_page_config(
    layout="wide",
    page_title="RX12 - Netflix Hybrid Recommender",
)

# -------------------------
# Input CSVs, expected in the Space ROOT (relative paths)
# -------------------------
POPULARITY_PATH = "synthetic_title_popularity.csv"
TITLES_PATH = "netflix_titles.csv"

# -------------------------
# Helpers
# -------------------------
def clean_text(s: str) -> str:
    """Normalize a raw metadata value into lowercase alphanumeric tokens.

    Values that ``pd.isna`` reports as missing become an empty string.
    Otherwise the value is stringified and lowercased, every character that
    is not ``a-z``, ``0-9`` or whitespace is replaced by a space, whitespace
    runs are collapsed to one space and the ends are trimmed.
    """
    if pd.isna(s):
        return ""
    lowered = str(s).lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()

@st.cache_data(show_spinner=False)
def load_titles(path: str) -> pd.DataFrame:
    """Read the titles CSV at ``path`` and strip whitespace from its column names.

    The result is cached through ``st.cache_data`` with the spinner hidden.
    """
    frame = pd.read_csv(path)
    frame.columns = [name.strip() for name in frame.columns]
    return frame

@st.cache_data(show_spinner=False)
def load_popularity(path: str) -> pd.DataFrame:
    """Read the popularity CSV at ``path`` and strip whitespace from its column names.

    The result is cached through ``st.cache_data`` with the spinner hidden.
    """
    frame = pd.read_csv(path)
    frame.columns = [name.strip() for name in frame.columns]
    return frame

def build_combined_text(df: pd.DataFrame) -> pd.Series:
    """
    Build one cleaned text document per row from the metadata columns.

    The fields are joined with single spaces in this order: type, listed_in,
    country, rating, director, cast, description. Each field is passed through
    ``clean_text`` first; a field whose column is absent from ``df`` contributes
    an empty string. ``df`` itself is not modified.
    """
    ordered_fields = ["type", "listed_in", "country", "rating", "director", "cast", "description"]

    cleaned = {}
    for name in ordered_fields:
        if name in df.columns:
            raw = df[name]
        else:
            # Absent column: stand in with empty strings aligned to df's index.
            raw = pd.Series("", index=df.index)
        cleaned[name] = raw.fillna("").astype(str).map(clean_text)

    combined = cleaned[ordered_fields[0]]
    for name in ordered_fields[1:]:
        combined = combined + " " + cleaned[name]

    # Empty fields leave doubled/leading/trailing spaces behind; tidy them up.
    return combined.str.replace(r"\s+", " ", regex=True).str.strip()

# NOTE: cached with st.cache_resource rather than st.cache_data. cache_data
# serializes the return value and hands back a fresh copy on every call, which
# is costly for a dense N x N similarity matrix on each Streamlit rerun.
# cache_resource returns the same objects, so callers must treat them as
# read-only.
@st.cache_resource(show_spinner=False)
def build_tfidf_and_similarity(df: pd.DataFrame):
    """
    Fit TF-IDF on the combined metadata text and compute pairwise cosine similarity.

    Parameters
    ----------
    df : DataFrame with a ``title`` column plus whatever metadata columns
        ``build_combined_text`` reads.

    Returns
    -------
    (sim, title_to_idx)
        ``sim`` is the square matrix returned by ``cosine_similarity`` whose
        row/column ``i`` corresponds to the row at position ``i`` of ``df``.
        ``title_to_idx`` maps each title to the position of its FIRST
        occurrence in ``df``, so a duplicated title resolves to the same row
        that ``df[df["title"] == t].head(1)`` selects.
    """
    combined = build_combined_text(df)
    vectorizer = TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 2),
        min_df=2,
        max_features=50000,
    )
    tfidf = vectorizer.fit_transform(combined)
    sim = cosine_similarity(tfidf, tfidf)

    title_to_idx = {}
    for position, title in enumerate(df["title"].tolist()):
        # setdefault keeps the first position seen for a repeated title.
        title_to_idx.setdefault(title, position)
    return sim, title_to_idx

def demand_tier(pop_score: pd.Series) -> pd.Series:
    """Label each score "Low", "Mid" or "High" using the 0.33 / 0.66 quantiles.

    Scores at or below the 0.33 quantile are "Low", scores at or below the
    0.66 quantile are "Mid", and everything else (including values that fail
    both comparisons, such as NaN) is "High".
    """
    low_cut = pop_score.quantile(0.33)
    high_cut = pop_score.quantile(0.66)

    def _label(score):
        return "Low" if score <= low_cut else ("Mid" if score <= high_cut else "High")

    return pop_score.apply(_label)

def hybrid_recommend(df: pd.DataFrame, sim: np.ndarray, title_to_idx: dict,
                     seed_title: str, top_n: int = 10, alpha: float = 0.7,
                     same_type: bool = True) -> pd.DataFrame:
    """
    Rank titles for a seed using a blend of content similarity and popularity.

    Hybrid score = alpha * content_similarity + (1 - alpha) * popularity_norm
    NOTE: This recommendation is computed on the FULL dataset (df),
    not restricted by the Country→Genre selection.

    Parameters
    ----------
    df : catalog with a ``popularity_norm`` column; row positions must line up
        with the rows/columns of ``sim``.
    sim : square similarity matrix indexed by row position in ``df``.
    title_to_idx : maps a title to its row position in ``df``.
    seed_title : title to recommend from.
    top_n : maximum number of rows returned.
    alpha : weight given to similarity; ``1 - alpha`` goes to popularity.
    same_type : when True, only rows whose ``type`` equals the seed's are
        considered. If the ``type`` column is absent or the seed's type is
        missing, the restriction is skipped.

    Returns
    -------
    DataFrame sorted by ``hybrid_score`` (descending) with the display columns
    that exist in ``df`` plus ``content_similarity`` and ``hybrid_score``.
    An empty DataFrame is returned when ``seed_title`` is unknown.
    """
    idx = title_to_idx.get(seed_title)
    if idx is None:
        return pd.DataFrame()

    sim_scores = np.asarray(sim[idx], dtype=float)
    pop = df["popularity_norm"].astype(float).to_numpy()
    hybrid = alpha * sim_scores + (1 - alpha) * pop

    # Exclude the seed explicitly. Penalising its similarity is not enough:
    # at low alpha its popularity alone can put it back in the top N.
    eligible = np.ones(len(df), dtype=bool)
    eligible[idx] = False

    if same_type and "type" in df.columns:
        # Positional lookup, matching how `sim` and `df.iloc[...]` are indexed.
        seed_type = df["type"].iloc[idx]
        if not pd.isna(seed_type):
            eligible &= df["type"].to_numpy() == seed_type

    candidate_idx = np.flatnonzero(eligible)
    order = np.argsort(hybrid[candidate_idx])[::-1][:top_n]
    ranked = candidate_idx[order]

    out = df.iloc[ranked].copy()
    out["content_similarity"] = sim_scores[ranked]
    out["hybrid_score"] = hybrid[ranked]

    keep = [
        "title", "type", "release_year", "rating", "listed_in", "country",
        "popularity_score", "popularity_norm", "tier",
        "content_similarity", "hybrid_score"
    ]
    keep = [c for c in keep if c in out.columns]
    return out[keep].sort_values("hybrid_score", ascending=False).reset_index(drop=True)

def split_genres(listed_in: str) -> list:
    """Split a comma-separated genre string into a list of trimmed, non-empty names.

    Missing or blank input yields an empty list.
    """
    if pd.isna(listed_in):
        return []
    pieces = (part.strip() for part in str(listed_in).split(","))
    return [piece for piece in pieces if piece]

# -------------------------
# Load + merge
# -------------------------
st.title("RX12 - Netflix Hybrid Recommender")
st.caption("Content-based recommendations (TF‑IDF over metadata & descriptions) blended with popularity priors.")

# Stop early with a readable message if either input CSV is absent.
missing = [p for p in [TITLES_PATH, POPULARITY_PATH] if not os.path.exists(p)]
if missing:
    st.error(
        "Missing required file(s) in Space root directory: "
        + ", ".join(f"`{m}`" for m in missing)
        + ". Upload them under the **Files** tab (root level)."
    )
    st.stop()

titles = load_titles(TITLES_PATH)
pop = load_popularity(POPULARITY_PATH)

# Schema checks for the columns the merge below depends on.
required_pop_cols = ["title", "popularity_score", "popularity_norm"]
if "title" not in titles.columns:
    st.error("`netflix_titles.csv` must contain a `title` column.")
    st.stop()
if not set(required_pop_cols).issubset(set(pop.columns)):
    st.error("`synthetic_title_popularity.csv` must contain `title`, `popularity_score`, `popularity_norm`.")
    st.stop()

# Merge only the needed popularity columns, one row per title. A left merge on
# a repeated key would multiply catalog rows, and any other column shared with
# the titles file would come back with _x/_y suffixes.
pop_unique = pop[required_pop_cols].drop_duplicates(subset="title", keep="first")
df = titles.merge(pop_unique, on="title", how="left")
df["popularity_score"] = pd.to_numeric(df["popularity_score"], errors="coerce")
df["popularity_norm"] = pd.to_numeric(df["popularity_norm"], errors="coerce")
# Titles without a usable popularity value are dropped; the index is reset so
# row positions match the similarity matrix built below.
df = df.dropna(subset=["popularity_score", "popularity_norm"]).reset_index(drop=True)

# Nothing left means no title matched a numeric popularity row. The sidebar
# year slider and the TF-IDF fit would both fail on an empty frame.
if df.empty:
    st.error(
        "No titles with a valid popularity score remain after merging "
        "`netflix_titles.csv` with `synthetic_title_popularity.csv`. "
        "Check that the `title` values match between the two files."
    )
    st.stop()

df["tier"] = demand_tier(df["popularity_score"])

sim, title_to_idx = build_tfidf_and_similarity(df)

# -------------------------
# Sidebar controls (GLOBAL)
# -------------------------
st.sidebar.header("Controls (Global)")

# Content type: every non-missing type is offered and pre-selected.
available_types = sorted(df["type"].dropna().unique().tolist())
type_opt = st.sidebar.multiselect(
    "Content type",
    available_types,
    default=list(available_types),
)

# Release year: bounds come from the data; the default window is the last 20 years.
year_min = int(df["release_year"].min())
year_max = int(df["release_year"].max())
default_start = max(year_min, year_max-20)
year_range = st.sidebar.slider("Release year range", year_min, year_max, (default_start, year_max))

# Rating: missing ratings are offered as "Unknown"; an empty selection means "no rating filter".
rating_choices = sorted(df["rating"].fillna("Unknown").unique().tolist())
rating_opt = st.sidebar.multiselect("Rating", rating_choices, default=[])
tier_opt = st.sidebar.multiselect("Demand tier", ["Low", "Mid", "High"], default=["Low","Mid","High"])

alpha = st.sidebar.slider("Hybrid weight (Similarity → Popularity)", 0.0, 1.0, 0.7, 0.05)
top_n = st.sidebar.slider("Top N recommendations", 5, 30, 10)
same_type = st.sidebar.checkbox("Recommend within same type", value=True)

# Apply global filters for browsing and for seed title selection
first_year, last_year = year_range[0], year_range[1]
keep_rows = (
    df["type"].isin(type_opt)
    & (df["release_year"] >= first_year)
    & (df["release_year"] <= last_year)
    & df["tier"].isin(tier_opt)
)
if rating_opt:
    keep_rows = keep_rows & df["rating"].fillna("Unknown").isin(rating_opt)
f = df[keep_rows].reset_index(drop=True)

if f.empty:
    st.warning("No titles match your global filters. Adjust filters in the sidebar.")
    st.stop()

# The seed is chosen from the filtered catalog only.
seed_options = sorted(f["title"].unique().tolist())
seed_title = st.sidebar.selectbox("Pick a seed title", options=seed_options)

tab1, tab2, tab3, tab4 = st.tabs(["Recommend", "Explore Catalog", "Country → Genre", "Explain Method"])

# -------------------------
# Tab 1: Recommendations
# -------------------------
with tab1:
    st.subheader("Recommendations (not affected by Country → Genre selection)")

    # Seed summary: first row of the full dataset whose title matches the selection.
    seed_matches = df[df["title"] == seed_title].head(1)
    if not seed_matches.empty:
        seed = seed_matches.iloc[0]
        seed_summary = (
            f"**Seed title:** `{seed['title']}`  \n"
            f"- Type: **{seed.get('type','')}** | Year: **{seed.get('release_year','')}** | Rating: **{seed.get('rating','')}**  \n"
            f"- Genres: **{seed.get('listed_in','')}**  \n"
            f"- Country: **{seed.get('country','')}**  \n"
        )
        st.markdown(seed_summary)
        with st.expander("Show description"):
            st.write(seed.get("description", ""))

    # Ranking runs against the full dataset, using the sidebar's alpha / top-N / same-type settings.
    recs = hybrid_recommend(
        df,
        sim,
        title_to_idx,
        seed_title,
        top_n=top_n,
        alpha=alpha,
        same_type=same_type,
    )

    if recs.empty:
        st.info("No recommendations produced. Try another title.")
    else:
        table_col, chart_col = st.columns([1.25, 1])

        with table_col:
            st.dataframe(recs, use_container_width=True, height=420)

        with chart_col:
            # Similarity vs. popularity, with marker size showing the blended score.
            scatter = px.scatter(
                recs,
                x="content_similarity",
                y="popularity_norm",
                size="hybrid_score",
                hover_data=["title", "type", "release_year", "tier", "rating"],
            )
            scatter.update_layout(
                xaxis_title="Content similarity (cosine TF‑IDF)",
                yaxis_title="Popularity (normalized)",
                height=420
            )
            st.plotly_chart(scatter, use_container_width=True)

        st.caption("Tip: move alpha toward 1.0 for 'more similar', toward 0.0 for 'more popular' picks.")

# -------------------------
# Tab 2: Explore catalog (GLOBAL FILTERS ONLY)
# -------------------------
with tab2:
    st.subheader("Interactive catalog exploration (global filters only)")

    hist_col, bar_col = st.columns([1, 1])

    # Left: distribution of popularity scores in the filtered catalog, split by tier.
    with hist_col:
        score_hist = px.histogram(f, x="popularity_score", color="tier", nbins=40, marginal="box")
        score_hist.update_layout(xaxis_title="Popularity score", yaxis_title="Count", height=420)
        st.plotly_chart(score_hist, use_container_width=True)

    # Right: the 20 highest-scoring titles in the filtered catalog.
    with bar_col:
        most_popular = f.sort_values("popularity_score", ascending=False).head(20)
        top_bar = px.bar(most_popular, x="title", y="popularity_score", color="tier")
        top_bar.update_layout(xaxis_title="", yaxis_title="Popularity score", height=420)
        st.plotly_chart(top_bar, use_container_width=True)

    # Full filtered table underneath the charts.
    catalog_columns = ["title","type","release_year","rating","listed_in","country","popularity_score","tier"]
    st.dataframe(
        f[catalog_columns].reset_index(drop=True),
        use_container_width=True,
        height=360
    )

# -------------------------
# Tab 3: Country → Genre (COUNTRY SELECTION LIVES HERE ONLY)
# -------------------------
with tab3:
    st.subheader("Country → Genre analysis (affects this tab only)")
    st.caption("Select a production country to see the most common genres and example titles from that market.")

    # Country selector is intentionally inside this tab so it does NOT affect recommendations.
    # One stripped country token per (title, country) pair, keeping df's index.
    # The same tokens drive both the selector options and the row filter, so
    # every option is guaranteed to match at least one row.
    country_values = (
        df["country"]
        .fillna("")
        .astype(str)
        .str.split(",")
        .explode()
        .str.strip()
    )
    country_values = country_values[country_values != ""]
    all_countries = sorted(country_values.unique().tolist())

    if not all_countries:
        st.warning("No country information found in the dataset.")
    else:
        colA, colB = st.columns([1, 1])
        with colA:
            selected_country = st.selectbox("Select a country", options=all_countries)
        with colB:
            top_genres_n = st.slider("Top N genres", 5, 30, 10)

        # Filter titles that include the selected country (exact token match;
        # relies on df having a unique index, which the reset_index after the
        # merge provides).
        country_rows = country_values.index[country_values == selected_country].unique()
        dcf = df.loc[country_rows].copy()

        if dcf.empty:
            st.info("No titles found for this country.")
        else:
            # Genre counts: one exact genre token per (title, genre) pair.
            genres = dcf["listed_in"].apply(split_genres).explode()
            genres = genres.dropna()
            genre_counts = genres.value_counts().head(top_genres_n).reset_index()
            genre_counts.columns = ["genre", "count"]

            fig = px.bar(genre_counts, x="genre", y="count")
            fig.update_layout(xaxis_title="Genre", yaxis_title="Count", height=420)
            st.plotly_chart(fig, use_container_width=True)

            # Example titles for each top genre (up to 5 per genre).
            # Rows are picked by exact genre token so the examples agree with
            # the counts above; a substring test would, for instance, list
            # "TV Dramas" titles under "Dramas".
            example_cols = ["title", "type", "release_year", "rating", "popularity_score"]
            example_frames = []
            for g in genre_counts["genre"].tolist():
                genre_rows = genres.index[genres == g].unique()
                ex = (
                    dcf.loc[genre_rows]
                    .sort_values("popularity_score", ascending=False)
                    .head(5)
                    .reindex(columns=example_cols)
                )
                ex.insert(0, "genre", g)
                example_frames.append(ex)

            if example_frames:
                exdf = pd.concat(example_frames, ignore_index=True)
            else:
                # No genre information for this country: show an empty table.
                exdf = pd.DataFrame(columns=["genre"] + example_cols)

            st.markdown("**Example titles (top 5 by popularity within each genre)**")
            st.dataframe(exdf, use_container_width=True, height=360)

# -------------------------
# Tab 4: Explain
# -------------------------
with tab4:
    st.subheader("How recommendations are generated (workshop-aligned)")

    # Static walkthrough of the three steps the recommender performs.
    method_overview = """
### 1) Build a content representation
We combine multiple metadata fields into one text document per title:

- type (Movie / TV Show)
- listed_in (genres)
- country
- rating
- director, cast
- description

### 2) TF‑IDF + cosine similarity
We vectorize the combined text using TF‑IDF and compute pairwise cosine similarity between all titles.

### 3) Hybrid ranking
We blend content similarity with a popularity prior:

`hybrid_score = alpha * similarity + (1 - alpha) * popularity_norm`

**Important:** the **Country → Genre** selection is an exploration tool and does **not** restrict the recommender.
        """
    st.markdown(method_overview)

    # The same formula again, rendered as a highlighted code snippet.
    formula_snippet = "hybrid_score = alpha * cosine_similarity(TFIDF(combined_text)) + (1 - alpha) * popularity_norm"
    st.code(formula_snippet, language="python")