# Cacami's picture
# Rename app_country_tab_only.py to app.py
# 0c203cd verified
import os
import re
import numpy as np
import pandas as pd
import streamlit as st
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Set the browser-tab title and use the wide page layout for the whole app.
st.set_page_config(page_title="RX12 - Netflix Hybrid Recommender", layout="wide")
# -------------------------
# Files expected in Space ROOT
# -------------------------
# Both paths are relative, so they resolve against the process working
# directory; their existence is checked before loading further down.
TITLES_PATH = "netflix_titles.csv"
POPULARITY_PATH = "synthetic_title_popularity.csv"
# -------------------------
# Helpers
# -------------------------
def clean_text(s: str) -> str:
    """Normalize a raw metadata value into lowercase ASCII alphanumeric tokens.

    Missing values (``None`` / NaN) become the empty string. Every other
    value is stringified and lowercased, every character outside ``a-z``,
    ``0-9`` and whitespace is replaced by a space, and whitespace runs are
    collapsed to a single space with the ends trimmed.
    """
    if pd.isna(s):
        return ""
    lowered = str(s).lower()
    # Punctuation and non-ASCII letters are replaced (not removed) so that
    # neighbouring words never fuse together.
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()
@st.cache_data(show_spinner=False)
def load_titles(path: str) -> pd.DataFrame:
    """Read the title catalogue CSV at *path* with whitespace-trimmed headers.

    The result is cached by Streamlit, keyed on *path*.
    """
    catalogue = pd.read_csv(path)
    # Headers sometimes carry stray spaces; trim them so later column
    # lookups by exact name succeed.
    catalogue.columns = [header.strip() for header in catalogue.columns]
    return catalogue
@st.cache_data(show_spinner=False)
def load_popularity(path: str) -> pd.DataFrame:
    """Read the per-title popularity CSV at *path* with whitespace-trimmed headers.

    The result is cached by Streamlit, keyed on *path*.
    """
    popularity = pd.read_csv(path)
    # Trim header whitespace so the required-column check matches exact names.
    popularity.columns = [header.strip() for header in popularity.columns]
    return popularity
def build_combined_text(df: pd.DataFrame) -> pd.Series:
    """Build one cleaned "bag of words" document per row of *df*.

    The fields ``type``, ``listed_in``, ``country``, ``rating``,
    ``director``, ``cast`` and ``description`` are each cleaned with
    ``clean_text`` and joined, in that order, with single spaces. A field
    that is absent from *df* contributes an empty string. *df* itself is
    not modified.
    """
    # Order here is the order of the pieces in the final document.
    ordered_fields = (
        "type", "listed_in", "country", "rating",
        "director", "cast", "description",
    )

    cleaned = {}
    for name in ordered_fields:
        if name in df.columns:
            raw = df[name]
        else:
            # Missing column: behave as if every row held an empty value.
            raw = pd.Series("", index=df.index)
        cleaned[name] = raw.fillna("").astype(str).map(clean_text)

    combined = cleaned[ordered_fields[0]]
    for name in ordered_fields[1:]:
        combined = combined + " " + cleaned[name]

    # Empty fields leave doubled or edge spaces behind; normalize them away.
    return combined.str.replace(r"\s+", " ", regex=True).str.strip()
@st.cache_data(show_spinner=False)
def build_tfidf_and_similarity(df: pd.DataFrame):
    """Vectorize every title's combined metadata and compute pairwise similarity.

    Args:
        df: Catalogue with a ``title`` column; row positions define the
            indices used by the returned objects.

    Returns:
        A ``(sim, title_to_idx)`` tuple. ``sim`` is the dense square
        cosine-similarity matrix over the TF-IDF vectors of all rows, and
        ``title_to_idx`` maps each title to the position of its FIRST
        occurrence in *df*.
    """
    combined = build_combined_text(df)
    vectorizer = TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 2),
        min_df=2,
        max_features=50000,
    )
    tfidf = vectorizer.fit_transform(combined)
    # NOTE(review): this is a dense n x n matrix; memory grows quadratically
    # with the number of titles.
    sim = cosine_similarity(tfidf, tfidf)

    # Titles are not guaranteed unique. Keep the first occurrence so the
    # index agrees with look-ups such as `df[df["title"] == t].head(1)`;
    # a plain dict comprehension would silently keep the last one.
    title_to_idx = {}
    for position, title in enumerate(df["title"].tolist()):
        title_to_idx.setdefault(title, position)
    return sim, title_to_idx
def demand_tier(pop_score: pd.Series) -> pd.Series:
    """Bucket popularity scores into "Low" / "Mid" / "High" demand tiers.

    The cut points are the 33rd and 66th percentiles of *pop_score*:
    values at or below the first are "Low", values at or below the second
    are "Mid", everything else is "High".

    Args:
        pop_score: Numeric popularity scores.

    Returns:
        A Series of tier labels with the same index and name as
        *pop_score*. Missing scores stay missing instead of being
        labelled "High" (NaN fails both ``<=`` comparisons).
    """
    q1, q2 = pop_score.quantile([0.33, 0.66]).tolist()
    labels = np.select(
        [(pop_score <= q1).to_numpy(), (pop_score <= q2).to_numpy()],
        ["Low", "Mid"],
        default="High",
    )
    tiers = pd.Series(labels, index=pop_score.index, name=pop_score.name, dtype=object)
    # Blank out rows whose score is missing rather than reporting them as "High".
    return tiers.where(pop_score.notna())
def hybrid_recommend(df: pd.DataFrame, sim: np.ndarray, title_to_idx: dict,
                     seed_title: str, top_n: int = 10, alpha: float = 0.7,
                     same_type: bool = True) -> pd.DataFrame:
    """Rank titles for a seed by blending content similarity with popularity.

    Hybrid score = alpha * content_similarity + (1 - alpha) * popularity_norm

    NOTE: The ranking is computed on the FULL dataset (*df*); it is not
    restricted by the Country -> Genre selection.

    Args:
        df: Catalogue whose row positions match the rows/columns of *sim*;
            must contain ``popularity_norm``.
        sim: Square pairwise similarity matrix.
        title_to_idx: Mapping from title to row position in *df* / *sim*.
        seed_title: Title to recommend for.
        top_n: Maximum number of rows to return.
        alpha: Weight of similarity; ``1 - alpha`` is the popularity weight.
        same_type: When true, only titles with the seed's ``type`` are kept
            (skipped if *df* has no ``type`` column or the seed's type is
            missing).

    Returns:
        Up to *top_n* rows ordered by descending ``hybrid_score``, limited
        to the display columns present in *df* plus ``content_similarity``
        and ``hybrid_score``. An empty DataFrame if *seed_title* is unknown.
    """
    if seed_title not in title_to_idx:
        return pd.DataFrame()

    idx = title_to_idx[seed_title]
    sim_scores = np.asarray(sim[idx], dtype=float)
    pop = df["popularity_norm"].astype(float).to_numpy()
    hybrid = alpha * sim_scores + (1 - alpha) * pop

    # Exclude the seed explicitly. Merely pushing its similarity to -1 does
    # not keep it out when alpha is small (at alpha == 0 the score is pure
    # popularity), so a popular seed could recommend itself.
    eligible = np.ones(len(df), dtype=bool)
    eligible[idx] = False

    if same_type and "type" in df.columns:
        types = df["type"].to_numpy()
        seed_type = types[idx]  # positional access; does not rely on index labels
        # A missing seed type would compare unequal to everything and wipe
        # out all candidates, so only filter when the type is known.
        if pd.notna(seed_type):
            eligible &= types == seed_type

    candidate_idx = np.flatnonzero(eligible)
    # Stable sort on the negated score: descending order, ties keep catalogue order.
    order = np.argsort(-hybrid[candidate_idx], kind="stable")
    ranked = candidate_idx[order][:top_n]

    out = df.iloc[ranked].copy()
    out["content_similarity"] = sim_scores[ranked]
    out["hybrid_score"] = hybrid[ranked]

    keep = [
        "title", "type", "release_year", "rating", "listed_in", "country",
        "popularity_score", "popularity_norm", "tier",
        "content_similarity", "hybrid_score",
    ]
    keep = [c for c in keep if c in out.columns]
    return out[keep].reset_index(drop=True)
def split_genres(listed_in: str) -> list:
    """Split a comma-separated genre string into trimmed, non-empty names.

    Missing or blank input yields an empty list.
    """
    if pd.isna(listed_in):
        return []
    trimmed = (piece.strip() for piece in str(listed_in).split(","))
    # Blank input and stray commas only produce empty pieces, which are dropped.
    return [piece for piece in trimmed if piece]
# -------------------------
# Load + merge
# -------------------------
st.title("RX12 - Netflix Hybrid Recommender")
st.caption("Content-based recommendations (TF‑IDF over metadata & descriptions) blended with popularity priors.")

# Stop early with an actionable message when an input CSV is absent.
missing = [p for p in [TITLES_PATH, POPULARITY_PATH] if not os.path.exists(p)]
if missing:
    st.error(
        "Missing required file(s) in Space root directory: "
        + ", ".join(f"`{m}`" for m in missing)
        + ". Upload them under the **Files** tab (root level)."
    )
    st.stop()

titles = load_titles(TITLES_PATH)
pop = load_popularity(POPULARITY_PATH)

if "title" not in titles.columns:
    st.error("`netflix_titles.csv` must contain a `title` column.")
    st.stop()

# The sidebar below indexes these columns directly; report their absence
# as a readable message instead of a KeyError traceback.
missing_title_cols = [c for c in ["type", "release_year", "rating"] if c not in titles.columns]
if missing_title_cols:
    st.error(
        "`netflix_titles.csv` is missing required column(s): "
        + ", ".join(f"`{c}`" for c in missing_title_cols)
        + "."
    )
    st.stop()

if not {"title", "popularity_score", "popularity_norm"}.issubset(set(pop.columns)):
    st.error("`synthetic_title_popularity.csv` must contain `title`, `popularity_score`, `popularity_norm`.")
    st.stop()

# Merge only the needed popularity columns, one row per title. Extra columns
# could collide with catalogue columns (producing `_x`/`_y` suffixes), and
# duplicate titles would multiply catalogue rows in the left join.
pop = pop[["title", "popularity_score", "popularity_norm"]].drop_duplicates(subset="title", keep="first")
df = titles.merge(pop, on="title", how="left")
df["popularity_score"] = pd.to_numeric(df["popularity_score"], errors="coerce")
df["popularity_norm"] = pd.to_numeric(df["popularity_norm"], errors="coerce")
# Titles without usable popularity cannot be scored, so they are dropped.
df = df.dropna(subset=["popularity_score", "popularity_norm"]).reset_index(drop=True)

if df.empty:
    # Nothing left to vectorize; fitting TF-IDF on an empty corpus would raise.
    st.error("No titles have usable popularity data after merging the two files.")
    st.stop()

df["tier"] = demand_tier(df["popularity_score"])
sim, title_to_idx = build_tfidf_and_similarity(df)

# -------------------------
# Sidebar controls (GLOBAL)
# -------------------------
st.sidebar.header("Controls (Global)")
type_choices = sorted(df["type"].dropna().unique().tolist())
type_opt = st.sidebar.multiselect("Content type", type_choices, default=type_choices)
year_min, year_max = int(df["release_year"].min()), int(df["release_year"].max())
# Default window: the most recent 20 years, clamped to the available range.
year_range = st.sidebar.slider("Release year range", year_min, year_max, (max(year_min, year_max-20), year_max))
rating_opt = st.sidebar.multiselect("Rating", sorted(df["rating"].fillna("Unknown").unique().tolist()), default=[])
tier_opt = st.sidebar.multiselect("Demand tier", ["Low", "Mid", "High"], default=["Low","Mid","High"])
alpha = st.sidebar.slider(
    "Hybrid weight (Similarity → Popularity)", 0.0, 1.0, 0.7, 0.05,
    help="alpha = 1.0 ranks purely by content similarity; alpha = 0.0 ranks purely by popularity.",
)
top_n = st.sidebar.slider("Top N recommendations", 5, 30, 10)
same_type = st.sidebar.checkbox("Recommend within same type", value=True)

# Apply global filters for browsing and for seed title selection
f = df.copy()
f = f[f["type"].isin(type_opt)]
f = f[(f["release_year"] >= year_range[0]) & (f["release_year"] <= year_range[1])]
f = f[f["tier"].isin(tier_opt)]
if rating_opt:
    # An empty rating selection means "no rating filter".
    f = f[f["rating"].fillna("Unknown").isin(rating_opt)]
f = f.reset_index(drop=True)

if f.empty:
    st.warning("No titles match your global filters. Adjust filters in the sidebar.")
    st.stop()

seed_title = st.sidebar.selectbox("Pick a seed title", options=sorted(f["title"].unique().tolist()))
tab1, tab2, tab3, tab4 = st.tabs(["Recommend", "Explore Catalog", "Country → Genre", "Explain Method"])
# -------------------------
# Tab 1: Recommendations
# -------------------------
with tab1:
    st.subheader("Recommendations (not affected by Country → Genre selection)")

    # Summarize the seed title; the first matching catalogue row is used.
    seed_matches = df[df["title"] == seed_title].head(1)
    if not seed_matches.empty:
        sr = seed_matches.iloc[0]
        seed_summary = (
            f"**Seed title:** `{sr['title']}` \n"
            f"- Type: **{sr.get('type','')}** | Year: **{sr.get('release_year','')}** | Rating: **{sr.get('rating','')}** \n"
            f"- Genres: **{sr.get('listed_in','')}** \n"
            f"- Country: **{sr.get('country','')}** \n"
        )
        st.markdown(seed_summary)
        with st.expander("Show description"):
            st.write(sr.get("description", ""))

    # Recommendations are ranked over the full catalogue, not the filtered view.
    recs = hybrid_recommend(df, sim, title_to_idx, seed_title, top_n=top_n, alpha=alpha, same_type=same_type)
    if recs.empty:
        st.info("No recommendations produced. Try another title.")
    else:
        table_col, chart_col = st.columns([1.25, 1])
        with table_col:
            st.dataframe(recs, use_container_width=True, height=420)
        with chart_col:
            # Similarity vs. popularity, with marker size showing the blended score.
            scatter = px.scatter(
                recs,
                x="content_similarity",
                y="popularity_norm",
                size="hybrid_score",
                hover_data=["title", "type", "release_year", "tier", "rating"],
            )
            scatter.update_layout(
                xaxis_title="Content similarity (cosine TF‑IDF)",
                yaxis_title="Popularity (normalized)",
                height=420,
            )
            st.plotly_chart(scatter, use_container_width=True)
        st.caption("Tip: move alpha toward 1.0 for 'more similar', toward 0.0 for 'more popular' picks.")
# -------------------------
# Tab 2: Explore catalog (GLOBAL FILTERS ONLY)
# -------------------------
with tab2:
    st.subheader("Interactive catalog exploration (global filters only)")

    dist_col, top_col = st.columns([1, 1])
    with dist_col:
        # Popularity distribution of the filtered catalogue, split by demand tier.
        popularity_hist = px.histogram(f, x="popularity_score", color="tier", nbins=40, marginal="box")
        popularity_hist.update_layout(xaxis_title="Popularity score", yaxis_title="Count", height=420)
        st.plotly_chart(popularity_hist, use_container_width=True)
    with top_col:
        # The 20 most popular titles that pass the global filters.
        most_popular = f.sort_values("popularity_score", ascending=False).head(20)
        top_bar = px.bar(most_popular, x="title", y="popularity_score", color="tier")
        top_bar.update_layout(xaxis_title="", yaxis_title="Popularity score", height=420)
        st.plotly_chart(top_bar, use_container_width=True)

    # Full filtered catalogue as a table.
    catalogue_view = f[["title","type","release_year","rating","listed_in","country","popularity_score","tier"]]
    st.dataframe(
        catalogue_view.reset_index(drop=True),
        use_container_width=True,
        height=360,
    )
# -------------------------
# Tab 3: Country → Genre (COUNTRY SELECTION LIVES HERE ONLY)
# -------------------------
with tab3:
    st.subheader("Country → Genre analysis (affects this tab only)")
    st.caption("Select a production country to see the most common genres and example titles from that market.")

    # Country selector is intentionally inside this tab so it does NOT affect recommendations.
    # One entry per (title row, country); the index keeps the originating row label of `df`.
    country_values = (
        df["country"]
        .fillna("")
        .astype(str)
        .str.split(",")
        .explode()
        .str.strip()
    )
    country_values = country_values[country_values != ""]
    all_countries = sorted(country_values.unique().tolist())

    if not all_countries:
        st.warning("No country information found in the dataset.")
    else:
        colA, colB = st.columns([1, 1])
        with colA:
            selected_country = st.selectbox("Select a country", options=all_countries)
        with colB:
            top_genres_n = st.slider("Top N genres", 5, 30, 10)

        # Select titles through the same tokenized values that populate the
        # selector. This keeps the two consistent (including stray whitespace)
        # and avoids a capture-group regex, which pandas warns about.
        matching_rows = country_values.index[country_values == selected_country]
        mask = df.index.isin(matching_rows)
        dcf = df[mask].copy()

        if dcf.empty:
            st.info("No titles found for this country.")
        else:
            # Genre counts
            genre_lists = dcf["listed_in"].apply(split_genres)
            genres = genre_lists.explode().dropna()
            genre_counts = genres.value_counts().head(top_genres_n).reset_index()
            genre_counts.columns = ["genre", "count"]
            fig = px.bar(genre_counts, x="genre", y="count")
            fig.update_layout(xaxis_title="Genre", yaxis_title="Count", height=420)
            st.plotly_chart(fig, use_container_width=True)

            # Example titles for each top genre (up to 5 per genre)
            examples = []
            top_genres = genre_counts["genre"].tolist()
            for g in top_genres:
                # Exact genre membership: a substring test would also match
                # e.g. "TV Dramas" when looking for "Dramas", disagreeing
                # with the counts above.
                has_genre = genre_lists.apply(lambda names, wanted=g: wanted in names)
                ex = dcf[has_genre].sort_values("popularity_score", ascending=False).head(5)
                examples.extend(
                    {
                        "genre": g,
                        "title": row.get("title", ""),
                        "type": row.get("type", ""),
                        "release_year": row.get("release_year", ""),
                        "rating": row.get("rating", ""),
                        "popularity_score": row.get("popularity_score", ""),
                    }
                    for _, row in ex.iterrows()
                )
            exdf = pd.DataFrame(examples)
            st.markdown("**Example titles (top 5 by popularity within each genre)**")
            st.dataframe(exdf, use_container_width=True, height=360)
# -------------------------
# Tab 4: Explain
# -------------------------
with tab4:
    st.subheader("How recommendations are generated (workshop-aligned)")

    # The markdown body stays flush-left: leading indentation inside the
    # string would become part of the rendered text.
    method_notes = """
### 1) Build a content representation
We combine multiple metadata fields into one text document per title:
- type (Movie / TV Show)
- listed_in (genres)
- country
- rating
- director, cast
- description
### 2) TF‑IDF + cosine similarity
We vectorize the combined text using TF‑IDF and compute pairwise cosine similarity between all titles.
### 3) Hybrid ranking
We blend content similarity with a popularity prior:
`hybrid_score = alpha * similarity + (1 - alpha) * popularity_norm`
**Important:** the **Country → Genre** selection is an exploration tool and does **not** restrict the recommender.
"""
    st.markdown(method_notes)

    # The scoring formula again, rendered as a code snippet.
    formula = "hybrid_score = alpha * cosine_similarity(TFIDF(combined_text)) + (1 - alpha) * popularity_norm"
    st.code(formula, language="python")