"""RX12 - Netflix Hybrid Recommender (Streamlit app).

Blends TF-IDF content similarity over title metadata with a normalized
popularity prior, and offers catalog / country-genre exploration tabs.
"""

import os
import re

import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

st.set_page_config(page_title="RX12 - Netflix Hybrid Recommender", layout="wide")

# -------------------------
# Files expected in Space ROOT
# -------------------------
TITLES_PATH = "netflix_titles.csv"
POPULARITY_PATH = "synthetic_title_popularity.csv"

# Catalog columns the page body reads unconditionally (filters, charts, tables).
REQUIRED_TITLE_COLUMNS = ["title", "type", "release_year", "rating", "listed_in", "country"]
# Popularity columns that are merged onto the catalog.
REQUIRED_POPULARITY_COLUMNS = ["title", "popularity_score", "popularity_norm"]

# Text shown in the "Explain Method" tab. Kept at column 0 so markdown headings
# and bullets are never mistaken for an indented code block.
METHOD_MARKDOWN = """
### 1) Build a content representation
We combine multiple metadata fields into one text document per title:
- type (Movie / TV Show)
- listed_in (genres)
- country
- rating
- director, cast
- description

### 2) TF‑IDF + cosine similarity
We vectorize the combined text using TF‑IDF and compute pairwise cosine similarity between all titles.

### 3) Hybrid ranking
We blend content similarity with a popularity prior:

`hybrid_score = alpha * similarity + (1 - alpha) * popularity_norm`

**Important:** the **Country → Genre** selection is an exploration tool and does **not** restrict the recommender.
"""


# -------------------------
# Helpers
# -------------------------
def clean_text(s: str) -> str:
    """Lower-case *s*, replace non-alphanumerics with spaces, collapse whitespace.

    Missing values (anything ``pd.isna`` accepts) become the empty string.
    """
    s = "" if pd.isna(s) else str(s)
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


@st.cache_data(show_spinner=False)
def load_titles(path: str) -> pd.DataFrame:
    """Read the titles CSV at *path* and strip whitespace from column names."""
    df = pd.read_csv(path)
    df.columns = [c.strip() for c in df.columns]
    return df


@st.cache_data(show_spinner=False)
def load_popularity(path: str) -> pd.DataFrame:
    """Read the popularity CSV at *path* and strip whitespace from column names."""
    df = pd.read_csv(path)
    df.columns = [c.strip() for c in df.columns]
    return df


def build_combined_text(df: pd.DataFrame) -> pd.Series:
    """
    Combine rich metadata fields into a single 'bag of words' representation.

    Columns missing from *df* are treated as empty. Each field is normalized
    with ``clean_text`` before being joined with single spaces. *df* itself is
    not modified.
    """
    fields = ["type", "director", "cast", "country", "rating", "listed_in", "description"]
    tmp = df.copy()
    for f in fields:
        if f not in tmp.columns:
            tmp[f] = ""
        tmp[f] = tmp[f].fillna("").astype(str).map(clean_text)

    combined = (
        tmp["type"] + " " +
        tmp["listed_in"] + " " +
        tmp["country"] + " " +
        tmp["rating"] + " " +
        tmp["director"] + " " +
        tmp["cast"] + " " +
        tmp["description"]
    ).str.replace(r"\s+", " ", regex=True).str.strip()
    return combined


# NOTE(review): the returned similarity matrix is dense (n_titles x n_titles).
# Streamlit's caching guide recommends st.cache_resource for large objects that
# should not be copied on every rerun -- worth confirming for this dataset size.
@st.cache_data(show_spinner=False)
def build_tfidf_and_similarity(df: pd.DataFrame):
    """Vectorize the combined metadata text and compute pairwise cosine similarity.

    Returns:
        ``(sim, title_to_idx)`` where ``sim[i, j]`` is the cosine similarity
        between rows ``i`` and ``j`` of *df* (positional), and ``title_to_idx``
        maps each title to the position of its FIRST occurrence in *df*.
    """
    combined = build_combined_text(df)
    vectorizer = TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 2),
        min_df=2,
        max_features=50000,
    )
    tfidf = vectorizer.fit_transform(combined)
    sim = cosine_similarity(tfidf, tfidf)

    # Keep the first row for duplicated titles so the lookup agrees with the
    # seed summary in the UI, which shows the first matching row.
    title_to_idx = {}
    for i, t in enumerate(df["title"].tolist()):
        title_to_idx.setdefault(t, i)
    return sim, title_to_idx


def demand_tier(pop_score: pd.Series) -> pd.Series:
    """Label each score "Low" / "Mid" / "High" using the 33% and 66% quantiles.

    Scores ``<= q33`` are "Low", scores ``<= q66`` are "Mid", everything else
    (including NaN, for which both comparisons are False) is "High".
    """
    q1, q2 = pop_score.quantile([0.33, 0.66]).tolist()
    labels = np.select([pop_score <= q1, pop_score <= q2], ["Low", "Mid"], default="High")
    return pd.Series(labels, index=pop_score.index, dtype=object)


def hybrid_recommend(df: pd.DataFrame,
                     sim: np.ndarray,
                     title_to_idx: dict,
                     seed_title: str,
                     top_n: int = 10,
                     alpha: float = 0.7,
                     same_type: bool = True) -> pd.DataFrame:
    """
    Hybrid score = alpha * content_similarity + (1 - alpha) * popularity_norm

    NOTE: This recommendation is computed on the FULL dataset (df), not restricted
    by the Country→Genre selection.

    Args:
        df: Catalog with a ``popularity_norm`` column; rows must be in the same
            positional order that was used to build *sim*.
        sim: Square similarity matrix indexed by row position of *df*.
        title_to_idx: Mapping from title to row position in *df*.
        seed_title: Title to recommend for.
        top_n: Maximum number of recommendations to return.
        alpha: Weight of content similarity; ``1 - alpha`` weights popularity.
        same_type: If True and *df* has a ``type`` column, only rows whose
            ``type`` equals the seed's are considered.

    Returns:
        Up to *top_n* rows sorted by ``hybrid_score`` descending, with added
        ``content_similarity`` and ``hybrid_score`` columns. Empty DataFrame
        if *seed_title* is unknown.
    """
    if seed_title not in title_to_idx:
        return pd.DataFrame()

    idx = title_to_idx[seed_title]
    sim_scores = np.asarray(sim[idx], dtype=float)
    pop = df["popularity_norm"].astype(float).to_numpy()
    hybrid = alpha * sim_scores + (1 - alpha) * pop

    # Exclude the seed explicitly. Penalizing its similarity alone is not
    # enough: with a small alpha the popularity term can still rank it first.
    candidate_mask = np.ones(len(df), dtype=bool)
    candidate_mask[idx] = False
    if same_type and "type" in df.columns:
        seed_type = df["type"].iloc[idx]
        candidate_mask &= df["type"].to_numpy() == seed_type

    candidate_idx = np.flatnonzero(candidate_mask)
    # Stable sort on the negated score: descending, ties stay in catalog order.
    order = np.argsort(-hybrid[candidate_idx], kind="stable")
    ranked = candidate_idx[order][:top_n]

    out = df.iloc[ranked].copy()
    out["content_similarity"] = sim_scores[ranked]
    out["hybrid_score"] = hybrid[ranked]

    keep = [
        "title", "type", "release_year", "rating", "listed_in", "country",
        "popularity_score", "popularity_norm", "tier",
        "content_similarity", "hybrid_score"
    ]
    keep = [c for c in keep if c in out.columns]
    return (
        out[keep]
        .sort_values("hybrid_score", ascending=False, kind="stable")
        .reset_index(drop=True)
    )


def split_genres(listed_in: str) -> list:
    """Split a comma-separated genre string into stripped, non-empty tokens."""
    if pd.isna(listed_in) or not str(listed_in).strip():
        return []
    return [g.strip() for g in str(listed_in).split(",") if g.strip()]


# -------------------------
# Load + merge
# -------------------------
st.title("RX12 - Netflix Hybrid Recommender")
st.caption("Content-based recommendations (TF‑IDF over metadata & descriptions) blended with popularity priors.")

missing = [p for p in [TITLES_PATH, POPULARITY_PATH] if not os.path.exists(p)]
if missing:
    st.error(
        "Missing required file(s) in Space root directory: "
        + ", ".join(f"`{m}`" for m in missing)
        + ". Upload them under the **Files** tab (root level)."
    )
    st.stop()

titles = load_titles(TITLES_PATH)
pop = load_popularity(POPULARITY_PATH)

if "title" not in titles.columns:
    st.error("`netflix_titles.csv` must contain a `title` column.")
    st.stop()

# The sidebar filters and tabs below index these columns directly; fail with a
# readable message instead of a KeyError traceback.
missing_cols = [c for c in REQUIRED_TITLE_COLUMNS if c not in titles.columns]
if missing_cols:
    st.error(
        "`netflix_titles.csv` is missing required column(s): "
        + ", ".join(f"`{c}`" for c in missing_cols)
    )
    st.stop()

if not set(REQUIRED_POPULARITY_COLUMNS).issubset(set(pop.columns)):
    st.error("`synthetic_title_popularity.csv` must contain `title`, `popularity_score`, `popularity_norm`.")
    st.stop()

# Merge only the needed popularity columns (avoids `_x`/`_y` suffixes when the
# two files share other column names) and one popularity row per title (a
# duplicated title would otherwise multiply catalog rows in the join).
pop = pop[REQUIRED_POPULARITY_COLUMNS].drop_duplicates(subset="title", keep="first")

df = titles.merge(pop, on="title", how="left")
df["popularity_score"] = pd.to_numeric(df["popularity_score"], errors="coerce")
df["popularity_norm"] = pd.to_numeric(df["popularity_norm"], errors="coerce")
df = df.dropna(subset=["popularity_score", "popularity_norm"]).reset_index(drop=True)

if df.empty:
    st.error("No titles have usable popularity data after merging the two files.")
    st.stop()

df["tier"] = demand_tier(df["popularity_score"])

sim, title_to_idx = build_tfidf_and_similarity(df)

# -------------------------
# Sidebar controls (GLOBAL)
# -------------------------
st.sidebar.header("Controls (Global)")

type_choices = sorted(df["type"].dropna().unique().tolist())
type_opt = st.sidebar.multiselect("Content type", type_choices, default=type_choices)

year_min, year_max = int(df["release_year"].min()), int(df["release_year"].max())
year_range = st.sidebar.slider(
    "Release year range",
    year_min,
    year_max,
    (max(year_min, year_max - 20), year_max),
)

rating_opt = st.sidebar.multiselect(
    "Rating",
    sorted(df["rating"].fillna("Unknown").unique().tolist()),
    default=[],
)
tier_opt = st.sidebar.multiselect("Demand tier", ["Low", "Mid", "High"], default=["Low", "Mid", "High"])

alpha = st.sidebar.slider("Hybrid weight (Similarity → Popularity)", 0.0, 1.0, 0.7, 0.05)
top_n = st.sidebar.slider("Top N recommendations", 5, 30, 10)
same_type = st.sidebar.checkbox("Recommend within same type", value=True)

# Apply global filters for browsing and for seed title selection
filter_mask = (
    df["type"].isin(type_opt)
    & (df["release_year"] >= year_range[0])
    & (df["release_year"] <= year_range[1])
    & df["tier"].isin(tier_opt)
)
if rating_opt:
    # An empty rating selection means "no rating filter".
    filter_mask &= df["rating"].fillna("Unknown").isin(rating_opt)
filtered = df[filter_mask].reset_index(drop=True)

if filtered.empty:
    st.warning("No titles match your global filters. Adjust filters in the sidebar.")
    st.stop()

seed_title = st.sidebar.selectbox("Pick a seed title", options=sorted(filtered["title"].unique().tolist()))

tab1, tab2, tab3, tab4 = st.tabs(["Recommend", "Explore Catalog", "Country → Genre", "Explain Method"])

# -------------------------
# Tab 1: Recommendations
# -------------------------
with tab1:
    st.subheader("Recommendations (not affected by Country → Genre selection)")

    seed_row = df[df["title"] == seed_title].head(1)
    if not seed_row.empty:
        sr = seed_row.iloc[0]
        st.markdown(
            f"**Seed title:** `{sr['title']}` \n"
            f"- Type: **{sr.get('type','')}** | Year: **{sr.get('release_year','')}** | Rating: **{sr.get('rating','')}** \n"
            f"- Genres: **{sr.get('listed_in','')}** \n"
            f"- Country: **{sr.get('country','')}** \n"
        )
        with st.expander("Show description"):
            st.write(sr.get("description", ""))

    recs = hybrid_recommend(df, sim, title_to_idx, seed_title, top_n=top_n, alpha=alpha, same_type=same_type)

    if recs.empty:
        st.info("No recommendations produced. Try another title.")
    else:
        c1, c2 = st.columns([1.25, 1])
        with c1:
            st.dataframe(recs, use_container_width=True, height=420)
        with c2:
            # NOTE(review): marker size uses hybrid_score, which is non-negative
            # only if popularity_norm is -- presumably it lies in [0, 1]; confirm.
            fig = px.scatter(
                recs,
                x="content_similarity",
                y="popularity_norm",
                size="hybrid_score",
                hover_data=["title", "type", "release_year", "tier", "rating"],
            )
            fig.update_layout(
                xaxis_title="Content similarity (cosine TF‑IDF)",
                yaxis_title="Popularity (normalized)",
                height=420
            )
            st.plotly_chart(fig, use_container_width=True)

        st.caption("Tip: move alpha toward 1.0 for 'more similar', toward 0.0 for 'more popular' picks.")

# -------------------------
# Tab 2: Explore catalog (GLOBAL FILTERS ONLY)
# -------------------------
with tab2:
    st.subheader("Interactive catalog exploration (global filters only)")
    c1, c2 = st.columns([1, 1])

    with c1:
        fig = px.histogram(filtered, x="popularity_score", color="tier", nbins=40, marginal="box")
        fig.update_layout(xaxis_title="Popularity score", yaxis_title="Count", height=420)
        st.plotly_chart(fig, use_container_width=True)

    with c2:
        top = filtered.sort_values("popularity_score", ascending=False).head(20)
        fig2 = px.bar(top, x="title", y="popularity_score", color="tier")
        fig2.update_layout(xaxis_title="", yaxis_title="Popularity score", height=420)
        st.plotly_chart(fig2, use_container_width=True)

    st.dataframe(
        filtered[
            ["title", "type", "release_year", "rating", "listed_in", "country", "popularity_score", "tier"]
        ].reset_index(drop=True),
        use_container_width=True,
        height=360
    )

# -------------------------
# Tab 3: Country → Genre (COUNTRY SELECTION LIVES HERE ONLY)
# -------------------------
with tab3:
    st.subheader("Country → Genre analysis (affects this tab only)")
    st.caption("Select a production country to see the most common genres and example titles from that market.")

    # Country selector is intentionally inside this tab so it does NOT affect recommendations.
    # One row per (title, country) token; the index still points at the df row.
    country_values = (
        df["country"]
        .fillna("")
        .astype(str)
        .str.split(",")
        .explode()
        .str.strip()
    )
    country_values = country_values[country_values != ""]
    all_countries = sorted(country_values.unique().tolist())

    if not all_countries:
        st.warning("No country information found in the dataset.")
    else:
        colA, colB = st.columns([1, 1])
        with colA:
            selected_country = st.selectbox("Select a country", options=all_countries)
        with colB:
            top_genres_n = st.slider("Top N genres", 5, 30, 10)

        # Filter titles that include the selected country. Matching on the
        # exploded tokens keeps the filter exactly consistent with the selector
        # options (no regex, no partial or whitespace-sensitive matches).
        country_rows = country_values.index[country_values == selected_country].unique()
        dcf = df.loc[country_rows].copy()

        if dcf.empty:
            st.info("No titles found for this country.")
        else:
            # Genre counts: one row per (title, genre) token, indexed by dcf row.
            genres = dcf["listed_in"].apply(split_genres).explode()
            genres = genres.dropna()
            genre_counts = genres.value_counts().head(top_genres_n).reset_index()
            genre_counts.columns = ["genre", "count"]

            fig = px.bar(genre_counts, x="genre", y="count")
            fig.update_layout(xaxis_title="Genre", yaxis_title="Count", height=420)
            st.plotly_chart(fig, use_container_width=True)

            # Example titles for each top genre (up to 5 per genre).
            # Exact token match: a substring test would, e.g., file "TV Dramas"
            # titles under "Dramas" and disagree with the counts above.
            example_cols = ["title", "type", "release_year", "rating", "popularity_score"]
            frames = []
            for g in genre_counts["genre"].tolist():
                genre_rows = genres.index[genres == g].unique()
                ex = dcf.loc[genre_rows].sort_values("popularity_score", ascending=False).head(5)
                ex = ex[example_cols].copy()
                ex.insert(0, "genre", g)
                frames.append(ex)

            if frames:
                exdf = pd.concat(frames, ignore_index=True)
            else:
                exdf = pd.DataFrame(columns=["genre"] + example_cols)

            st.markdown("**Example titles (top 5 by popularity within each genre)**")
            st.dataframe(exdf, use_container_width=True, height=360)

# -------------------------
# Tab 4: Explain
# -------------------------
with tab4:
    st.subheader("How recommendations are generated (workshop-aligned)")
    st.markdown(METHOD_MARKDOWN)
    st.code(
        "hybrid_score = alpha * cosine_similarity(TFIDF(combined_text)) + (1 - alpha) * popularity_norm",
        language="python"
    )