Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import numpy as np | |
| import pandas as pd | |
| import streamlit as st | |
| import plotly.express as px | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# Configure the browser tab title and wide layout before any other page output.
st.set_page_config(page_title="RX12 - Netflix Hybrid Recommender", layout="wide")

# -------------------------
# Files expected in Space ROOT
# -------------------------
# Both paths are relative, so they are resolved against the working directory.
TITLES_PATH = "netflix_titles.csv"  # title metadata; a `title` column is required below
POPULARITY_PATH = "synthetic_title_popularity.csv"  # needs title, popularity_score, popularity_norm
| # ------------------------- | |
| # Helpers | |
| # ------------------------- | |
def clean_text(s: str) -> str:
    """Normalize a raw metadata value into lowercase ASCII word tokens.

    Missing values (``None`` / ``NaN``) become the empty string. Letters that
    carry diacritics are folded to their base letter (``"ó"`` -> ``"o"``)
    before filtering, so a name such as ``"Almodóvar"`` stays one token
    instead of being split into ``"almod var"``. Every remaining character
    outside ``a-z``, ``0-9`` and whitespace is replaced by a space, and runs
    of whitespace are collapsed.

    Args:
        s: Any scalar value; non-strings are converted with ``str()``.

    Returns:
        The cleaned, single-spaced, stripped text.
    """
    import unicodedata  # stdlib; imported here so the file's import block is untouched

    if pd.isna(s):
        return ""

    # NFKD splits "é" into "e" + a combining accent; dropping the combining
    # marks keeps the base letter rather than losing it to the filter below.
    decomposed = unicodedata.normalize("NFKD", str(s))
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()
def load_titles(path: str) -> pd.DataFrame:
    """Read the titles CSV at ``path`` and trim whitespace around column names."""
    raw = pd.read_csv(path)
    # Header cells such as " title " would otherwise defeat column lookups.
    return raw.rename(columns=lambda name: name.strip())
def load_popularity(path: str) -> pd.DataFrame:
    """Read the popularity CSV at ``path`` with whitespace-trimmed column names."""
    table = pd.read_csv(path)
    trimmed_names = []
    for header in table.columns:
        trimmed_names.append(header.strip())
    table.columns = trimmed_names
    return table
def build_combined_text(df: pd.DataFrame) -> pd.Series:
    """
    Build one whitespace-normalized text document per row of ``df``.

    Each metadata field is cleaned with ``clean_text`` and the results are
    joined with single spaces in this order: type, listed_in, country, rating,
    director, cast, description. A field that is not a column of ``df``
    contributes an empty string. ``df`` itself is left unmodified.
    """
    ordered_fields = ("type", "listed_in", "country", "rating", "director", "cast", "description")

    def cleaned(field: str) -> pd.Series:
        # Fall back to an all-empty column when the field is absent.
        if field in df.columns:
            raw = df[field]
        else:
            raw = pd.Series("", index=df.index)
        return raw.fillna("").astype(str).map(clean_text)

    pieces = [cleaned(field) for field in ordered_fields]

    document = pieces[0]
    for piece in pieces[1:]:
        document = document + " " + piece

    # Empty fields leave doubled or edge spaces behind; squeeze them out.
    return document.str.replace(r"\s+", " ", regex=True).str.strip()
def build_tfidf_and_similarity(df: pd.DataFrame):
    """Vectorize every title's combined metadata and compare all pairs.

    The combined text from ``build_combined_text`` is turned into TF-IDF
    features (English stop words removed, unigrams and bigrams, terms seen in
    at least 2 documents, at most 50,000 features), and the cosine similarity
    between every pair of rows is computed.

    Args:
        df: Catalog with a ``title`` column; row positions define the matrix axes.

    Returns:
        ``(sim, title_to_idx)`` where ``sim[i][j]`` is the similarity between
        rows ``i`` and ``j`` and ``title_to_idx`` maps a title to the position
        of its FIRST occurrence in ``df``.
    """
    combined = build_combined_text(df)
    vectorizer = TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 2),
        min_df=2,
        max_features=50000,
    )
    tfidf = vectorizer.fit_transform(combined)
    # NOTE: this is an n_titles x n_titles matrix; memory grows quadratically.
    sim = cosine_similarity(tfidf, tfidf)

    # Keep the first row for a repeated title. The seed summary shown in the
    # UI also uses the first match, so the displayed seed and the scored seed
    # are the same row (a plain dict comprehension would keep the last one).
    title_to_idx = {}
    for position, title in enumerate(df["title"].tolist()):
        title_to_idx.setdefault(title, position)
    return sim, title_to_idx
def demand_tier(pop_score: pd.Series) -> pd.Series:
    """Label each popularity score as "Low", "Mid" or "High".

    Scores up to the 33rd percentile are "Low", scores up to the 66th
    percentile are "Mid", and everything else (including NaN) is "High".
    The result keeps the index and name of ``pop_score``.
    """
    low_cut, mid_cut = pop_score.quantile([0.33, 0.66]).tolist()
    values = pop_score.to_numpy()
    # Comparisons against NaN are False, so NaN falls through to "High".
    labels = np.where(
        values <= low_cut,
        "Low",
        np.where(values <= mid_cut, "Mid", "High"),
    )
    return pd.Series(labels.astype(object), index=pop_score.index, name=pop_score.name)
def hybrid_recommend(df: pd.DataFrame, sim: np.ndarray, title_to_idx: dict,
                     seed_title: str, top_n: int = 10, alpha: float = 0.7,
                     same_type: bool = True) -> pd.DataFrame:
    """
    Rank titles for a seed by blending content similarity with popularity.

    Hybrid score = alpha * content_similarity + (1 - alpha) * popularity_norm

    NOTE: This recommendation is computed on the FULL dataset (df),
    not restricted by the Country→Genre selection.

    Args:
        df: Catalog whose row POSITIONS match the axes of ``sim``; must have a
            ``popularity_norm`` column.
        sim: Square similarity matrix, one row/column per row of ``df``.
        title_to_idx: Maps a title to its row position in ``df``.
        seed_title: Title to recommend for.
        top_n: Maximum number of rows to return (values <= 0 return none).
        alpha: Weight of similarity; ``1 - alpha`` is the weight of popularity.
        same_type: If true and the seed has a non-missing ``type``, only
            titles of that type are considered.

    Returns:
        Up to ``top_n`` rows sorted by ``hybrid_score`` descending, with
        ``content_similarity`` and ``hybrid_score`` columns added. An empty
        DataFrame when ``seed_title`` is unknown. The seed row itself is
        never returned.
    """
    idx = title_to_idx.get(seed_title)
    if idx is None:
        return pd.DataFrame()

    sim_scores = np.asarray(sim[idx], dtype=float)
    pop = df["popularity_norm"].astype(float).to_numpy()
    hybrid = alpha * sim_scores + (1 - alpha) * pop

    # Exclude the seed explicitly. Penalising its similarity is not enough:
    # with alpha near 0 a popular seed would still rank among its own results.
    eligible = np.ones(len(df), dtype=bool)
    eligible[idx] = False

    if same_type and "type" in df.columns:
        types = df["type"].to_numpy()
        seed_type = types[idx]  # positional lookup; does not rely on df's index labels
        # A missing seed type cannot match anything, so skip the filter then.
        if pd.notna(seed_type):
            eligible &= types == seed_type

    candidate_idx = np.flatnonzero(eligible)
    # Stable descending sort; NaN scores sort last instead of first.
    order = np.argsort(-hybrid[candidate_idx], kind="stable")
    ranked = candidate_idx[order][: max(int(top_n), 0)]

    out = df.iloc[ranked].copy()
    out["content_similarity"] = sim_scores[ranked]
    out["hybrid_score"] = hybrid[ranked]

    keep = [
        "title", "type", "release_year", "rating", "listed_in", "country",
        "popularity_score", "popularity_norm", "tier",
        "content_similarity", "hybrid_score",
    ]
    keep = [c for c in keep if c in out.columns]
    # Rows are already in descending hybrid_score order.
    return out[keep].reset_index(drop=True)
def split_genres(listed_in: str) -> list:
    """Split a comma-separated genre string into trimmed, non-empty names.

    Missing or blank input yields an empty list.
    """
    if pd.isna(listed_in):
        return []
    trimmed = (piece.strip() for piece in str(listed_in).split(","))
    return [name for name in trimmed if name]
# -------------------------
# Load + merge
# -------------------------
st.title("RX12 - Netflix Hybrid Recommender")
st.caption("Content-based recommendations (TF‑IDF over metadata & descriptions) blended with popularity priors.")

# Stop early with an actionable message when a data file is absent.
missing = [p for p in (TITLES_PATH, POPULARITY_PATH) if not os.path.exists(p)]
if missing:
    st.error(
        "Missing required file(s) in Space root directory: "
        + ", ".join(f"`{m}`" for m in missing)
        + ". Upload them under the **Files** tab (root level)."
    )
    st.stop()

titles = load_titles(TITLES_PATH)
pop = load_popularity(POPULARITY_PATH)

if "title" not in titles.columns:
    st.error("`netflix_titles.csv` must contain a `title` column.")
    st.stop()

pop_columns = ["title", "popularity_score", "popularity_norm"]
if not set(pop_columns).issubset(pop.columns):
    st.error("`synthetic_title_popularity.csv` must contain `title`, `popularity_score`, `popularity_norm`.")
    st.stop()

# Join only the popularity columns, one row per title. A repeated title in the
# popularity file would otherwise multiply catalog rows in the left join, and
# extra columns shared with the titles file would be renamed with _x/_y.
pop_unique = pop[pop_columns].drop_duplicates(subset="title", keep="first")
df = titles.merge(pop_unique, on="title", how="left")

# Non-numeric popularity values become NaN, then those rows are dropped.
df["popularity_score"] = pd.to_numeric(df["popularity_score"], errors="coerce")
df["popularity_norm"] = pd.to_numeric(df["popularity_norm"], errors="coerce")
df = df.dropna(subset=["popularity_score", "popularity_norm"]).reset_index(drop=True)

if df.empty:
    st.error("No titles have a usable popularity score after merging the two files.")
    st.stop()

df["tier"] = demand_tier(df["popularity_score"])

# Streamlit re-executes this script on every widget interaction. Cache the
# similarity build so the n x n matrix is computed once per distinct catalog.
# cache_resource hands back the same objects without copying; nothing below
# mutates them. Requires a Streamlit version that provides st.cache_resource.
sim, title_to_idx = st.cache_resource(build_tfidf_and_similarity)(df)
# -------------------------
# Sidebar controls (GLOBAL)
# -------------------------
st.sidebar.header("Controls (Global)")

# Computed once and reused for both the option list and its default.
type_choices = sorted(df["type"].dropna().unique().tolist())
type_opt = st.sidebar.multiselect("Content type", type_choices, default=type_choices)

year_min, year_max = int(df["release_year"].min()), int(df["release_year"].max())
# Default to the most recent 20 years, clamped to the data's earliest year.
year_range = st.sidebar.slider(
    "Release year range",
    year_min,
    year_max,
    (max(year_min, year_max - 20), year_max),
)

rating_choices = sorted(df["rating"].fillna("Unknown").unique().tolist())
rating_opt = st.sidebar.multiselect("Rating", rating_choices, default=[])
tier_opt = st.sidebar.multiselect("Demand tier", ["Low", "Mid", "High"], default=["Low", "Mid", "High"])

# alpha weights similarity: 0.0 (left) ranks purely by popularity and 1.0
# (right) purely by similarity, so the label reads Popularity → Similarity.
alpha = st.sidebar.slider("Hybrid weight (Popularity → Similarity)", 0.0, 1.0, 0.7, 0.05)
top_n = st.sidebar.slider("Top N recommendations", 5, 30, 10)
same_type = st.sidebar.checkbox("Recommend within same type", value=True)

# Apply global filters for browsing and for seed title selection
keep_rows = (
    df["type"].isin(type_opt)
    & df["release_year"].between(year_range[0], year_range[1])
    & df["tier"].isin(tier_opt)
)
if rating_opt:
    # An empty rating selection means "no rating filter".
    keep_rows &= df["rating"].fillna("Unknown").isin(rating_opt)
f = df[keep_rows].reset_index(drop=True)

if f.empty:
    st.warning("No titles match your global filters. Adjust filters in the sidebar.")
    st.stop()

seed_title = st.sidebar.selectbox("Pick a seed title", options=sorted(f["title"].unique().tolist()))
tab1, tab2, tab3, tab4 = st.tabs(["Recommend", "Explore Catalog", "Country → Genre", "Explain Method"])
# -------------------------
# Tab 1: Recommendations
# -------------------------
with tab1:
    st.subheader("Recommendations (not affected by Country → Genre selection)")

    # Summarize the first catalog row carrying the selected seed title.
    seed_matches = df[df["title"] == seed_title].head(1)
    if len(seed_matches) > 0:
        seed = seed_matches.iloc[0]
        seed_summary = (
            f"**Seed title:** `{seed['title']}` \n"
            f"- Type: **{seed.get('type','')}** | Year: **{seed.get('release_year','')}** | Rating: **{seed.get('rating','')}** \n"
            f"- Genres: **{seed.get('listed_in','')}** \n"
            f"- Country: **{seed.get('country','')}** \n"
        )
        st.markdown(seed_summary)
        with st.expander("Show description"):
            st.write(seed.get("description", ""))

    # Always scored against the full catalog, not the filtered view.
    recs = hybrid_recommend(df, sim, title_to_idx, seed_title, top_n=top_n, alpha=alpha, same_type=same_type)

    if not recs.empty:
        table_col, chart_col = st.columns([1.25, 1])
        with table_col:
            st.dataframe(recs, use_container_width=True, height=420)
        with chart_col:
            # Similarity vs. popularity; marker size encodes the blended score.
            scatter = px.scatter(
                recs,
                x="content_similarity",
                y="popularity_norm",
                size="hybrid_score",
                hover_data=["title", "type", "release_year", "tier", "rating"],
            )
            scatter.update_layout(
                xaxis_title="Content similarity (cosine TF‑IDF)",
                yaxis_title="Popularity (normalized)",
                height=420,
            )
            st.plotly_chart(scatter, use_container_width=True)
        # NOTE(review): the flattened source does not show whether this caption
        # sat inside this branch or at tab level — confirm against the original.
        st.caption("Tip: move alpha toward 1.0 for 'more similar', toward 0.0 for 'more popular' picks.")
    else:
        st.info("No recommendations produced. Try another title.")
# -------------------------
# Tab 2: Explore catalog (GLOBAL FILTERS ONLY)
# -------------------------
with tab2:
    st.subheader("Interactive catalog exploration (global filters only)")

    left_col, right_col = st.columns([1, 1])

    with left_col:
        # Distribution of popularity scores in the filtered view, split by tier.
        score_hist = px.histogram(f, x="popularity_score", color="tier", nbins=40, marginal="box")
        score_hist.update_layout(xaxis_title="Popularity score", yaxis_title="Count", height=420)
        st.plotly_chart(score_hist, use_container_width=True)

    with right_col:
        # The 20 most popular titles in the filtered view.
        most_popular = f.sort_values("popularity_score", ascending=False).head(20)
        popular_bar = px.bar(most_popular, x="title", y="popularity_score", color="tier")
        popular_bar.update_layout(xaxis_title="", yaxis_title="Popularity score", height=420)
        st.plotly_chart(popular_bar, use_container_width=True)

    catalog_columns = ["title", "type", "release_year", "rating", "listed_in", "country", "popularity_score", "tier"]
    st.dataframe(
        f[catalog_columns].reset_index(drop=True),
        use_container_width=True,
        height=360,
    )
# -------------------------
# Tab 3: Country → Genre (COUNTRY SELECTION LIVES HERE ONLY)
# -------------------------
with tab3:
    st.subheader("Country → Genre analysis (affects this tab only)")
    st.caption("Select a production country to see the most common genres and example titles from that market.")

    # Country selector is intentionally inside this tab so it does NOT affect recommendations.
    # One list of trimmed, non-empty country names per title.
    country_lists = (
        df["country"]
        .fillna("")
        .astype(str)
        .map(lambda raw: [name.strip() for name in raw.split(",") if name.strip()])
    )
    all_countries = sorted(country_lists.explode().dropna().unique().tolist())

    if not all_countries:
        st.warning("No country information found in the dataset.")
    else:
        colA, colB = st.columns([1, 1])
        with colA:
            selected_country = st.selectbox("Select a country", options=all_countries)
        with colB:
            top_genres_n = st.slider("Top N genres", 5, 30, 10)

        # Exact membership in the parsed list: the same parsing that produced
        # the selector options, with no regex (and no capture-group warning).
        mask = country_lists.map(lambda names: selected_country in names)
        dcf = df[mask].copy()

        if dcf.empty:
            st.info("No titles found for this country.")
        else:
            # Genre counts: one entry per (title, genre) pair.
            title_genres = dcf["listed_in"].apply(split_genres).explode().dropna()
            genre_counts = title_genres.value_counts().head(top_genres_n).reset_index()
            genre_counts.columns = ["genre", "count"]

            fig = px.bar(genre_counts, x="genre", y="count")
            fig.update_layout(xaxis_title="Genre", yaxis_title="Count", height=420)
            st.plotly_chart(fig, use_container_width=True)

            # Example titles for each top genre (up to 5 per genre).
            # Match genres exactly, as the counts above do. A substring test
            # would list "TV Dramas" titles under "Dramas", for example.
            example_columns = ["title", "type", "release_year", "rating", "popularity_score"]
            example_frames = []
            for g in genre_counts["genre"].tolist():
                member_rows = title_genres.index[title_genres == g].unique()
                best = (
                    dcf.loc[member_rows, example_columns]
                    .sort_values("popularity_score", ascending=False)
                    .head(5)
                )
                best.insert(0, "genre", g)
                example_frames.append(best)

            if example_frames:
                exdf = pd.concat(example_frames, ignore_index=True)
            else:
                exdf = pd.DataFrame(columns=["genre", *example_columns])

            st.markdown("**Example titles (top 5 by popularity within each genre)**")
            st.dataframe(exdf, use_container_width=True, height=360)
# -------------------------
# Tab 4: Explain
# -------------------------
with tab4:
    st.subheader("How recommendations are generated (workshop-aligned)")

    # Static walkthrough of the pipeline implemented by the helpers above.
    method_notes = """
### 1) Build a content representation
We combine multiple metadata fields into one text document per title:
- type (Movie / TV Show)
- listed_in (genres)
- country
- rating
- director, cast
- description
### 2) TF‑IDF + cosine similarity
We vectorize the combined text using TF‑IDF and compute pairwise cosine similarity between all titles.
### 3) Hybrid ranking
We blend content similarity with a popularity prior:
`hybrid_score = alpha * similarity + (1 - alpha) * popularity_norm`
**Important:** the **Country → Genre** selection is an exploration tool and does **not** restrict the recommender.
"""
    st.markdown(method_notes)

    # The scoring rule again, rendered as a code snippet.
    formula = "hybrid_score = alpha * cosine_similarity(TFIDF(combined_text)) + (1 - alpha) * popularity_norm"
    st.code(formula, language="python")