ralate2 commited on
Commit
febe6ca
·
verified ·
1 Parent(s): 8939bec

Upload 7 files

Browse files
Files changed (8) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +15 -0
  3. README.md +12 -0
  4. app.py +1716 -0
  5. dockerignore +13 -0
  6. features_standardized_11_renamed.parquet +3 -0
  7. requirements.txt +8 -0
  8. utils.py +188 -0
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ features_standardized_11_renamed.parquet filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# curl is installed for container healthchecks; apt lists are removed to keep the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so source-only edits do not invalidate the pip cache layer.
COPY requirements.txt .
RUN pip install --no-cache-dir --prefer-binary -r requirements.txt

COPY . .

# Default Streamlit port; the CMD below honors $PORT when the platform injects one.
EXPOSE 7860

# STREAMLIT_SERVER_PORT is cleared so only the --server.port flag controls the bind port.
CMD ["sh", "-c", "export STREAMLIT_SERVER_PORT='' && PORT_TO_USE=${PORT:-7860} && echo PORT_TO_USE=$PORT_TO_USE && streamlit run app.py --server.address=0.0.0.0 --server.port=$PORT_TO_USE --server.headless=true --server.enableCORS=false --server.enableXsrfProtection=false --server.runOnSave=false --server.fileWatcherType=none"]
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Legislation Dashboard
3
+ emoji: 📈
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 7860
8
+ ---
9
+
10
+ # Legislative Trends Dashboard
11
+
12
+ Upload your parquet or CSV file to visualize legislative trends.
app.py ADDED
@@ -0,0 +1,1716 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import numpy as np
4
+ import pandas as pd
5
+ import streamlit as st
6
+ import plotly.express as px
7
+ import plotly.graph_objects as go
8
+ from scipy import stats
9
+
10
+ # Optional (legacy TF-IDF import kept harmlessly)
11
+ try:
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ _HAS_SK = True
14
+ except Exception:
15
+ _HAS_SK = False
16
+
17
+ # -----------------------------
18
+ # Page config
19
+ # -----------------------------
20
+ st.set_page_config(
21
+ page_title="Legislative Trends Dashboard",
22
+ layout="wide",
23
+ initial_sidebar_state="collapsed",
24
+ )
25
+
26
+ # -----------------------------
27
+ # Palette
28
+ # -----------------------------
29
+ C_TRAPPED_DARKNESS = "#0F1F38"
30
+ C_CEDAR_PLANK = "#8E7970"
31
+ C_PUMPING_SPICE = "#F55449" # negative
32
+ C_LAZURITE_BLUE = "#1B4B5A" # positive
33
+ C_POSITIVE = C_LAZURITE_BLUE
34
+ C_NEGATIVE = C_PUMPING_SPICE
35
+ C_STABLE = C_CEDAR_PLANK
36
+
37
+ PLOTLY_TEMPLATE = "plotly_white"
38
+
39
+ DEFAULT_CANDIDATES = [
40
+ "features_standardized_11_renamed.parquet",
41
+ ]
42
+
43
+ # Full data range (all years available for baseline)
44
+ DATA_START_FULL = pd.to_datetime("2019-01-09").date()
45
+ DATA_END_FULL = pd.to_datetime("2026-02-06").date()
46
+
47
+ # Display/filter range
48
+ DATA_START = pd.to_datetime("2019-01-09").date()
49
+ DATA_END = pd.to_datetime("2026-02-06").date()
50
+
51
+ _SPLIT_RE = re.compile(r"[,\|;/\n\t]+")
52
+
53
+ STOPWORDS = {
54
+ "bill", "bills", "act", "acts", "amend", "amends", "amended", "amendment", "amendments",
55
+ "illinois", "state", "code", "section", "sections", "law", "laws", "new", "provide", "provides",
56
+ "making", "make", "made", "relating", "regarding", "including", "include", "includes", "within",
57
+ "existing", "technical", "resolution", "resolutions", "effective", "date", "public",
58
+ "department", "agency", "program", "programs", "general", "shall", "may", "must", "also",
59
+ "one", "two", "three", "per", "use", "used", "using", "would", "could", "can", "like",
60
+ "not", "no", "yes", "etc", "among", "upon", "require", "requires", "required", "requirement",
61
+ "establish", "establishes", "established", "create", "creates", "created", "implementation",
62
+ "board", "boards", "commission", "commissions", "report", "reports", "reporting",
63
+ "information", "data", "system", "systems", "process", "processes", "administration",
64
+ "student", "students", "education", "educational", "school", "schools",
65
+ "support", "and", "the", "for", "with", "that", "this", "from", "have", "has", "had",
66
+ "be", "been", "being", "are", "is", "was", "were", "will", "would", "should", "could",
67
+ "may", "might", "must", "can", "shall", "need", "needs", "needed", "such", "other",
68
+ "any", "all", "each", "some", "more", "most", "than", "into", "through", "between",
69
+ "under", "over", "about", "against", "during", "after", "before", "above", "below",
70
+ "up", "down", "in", "out", "on", "off", "to", "at", "by", "of", "as", "or", "but", "if",
71
+ "when", "where", "why", "how", "which", "who", "whom", "whose", "what", "whether",
72
+ "there", "their", "they", "them", "these", "those", "then", "than", "only", "just",
73
+ "both", "either", "neither", "nor", "so", "too", "very", "even", "also", "however",
74
+ "therefore", "thus", "hence", "accordingly", "consequently", "furthermore", "moreover",
75
+ "nevertheless", "nonetheless", "otherwise", "rather", "instead", "yet", "still",
76
+ "already", "always", "never", "ever", "often", "sometimes", "usually", "generally",
77
+ "specifically", "particularly", "especially", "mainly", "mostly", "largely",
78
+ "context", "establishment", "legislative", "promoting", "justice", "human", "rights", "protections"
79
+ }
80
+
81
+ GENERIC_PHRASES = {
82
+ "effective date", "public act", "existing law", "state code", "general assembly",
83
+ "relating to", "regarding", "provide that", "provides that", "amend the", "amends the",
84
+ "this act", "the act", "state agency", "support and", "and context", "context establishment",
85
+ "legislative support", "promoting justice", "justice and", "and human", "human rights",
86
+ "rights protections", "and human rights", "justice and human", "human rights protections",
87
+ "support and context", "and context establishment", "legislative support and"
88
+ }
89
+
90
+ TFIDF_BLOCK_WORDS = {
91
+ "likely", "promote", "promotes", "promoting", "desire", "desires",
92
+ "aim", "aims", "without", "specific", "etc", "mentions", "mention",
93
+ "mentioned", "provided", "provides", "appears", "suggests", "suggest",
94
+ "driven", "purpose", "express", "referred", "uses", "use", "introduce",
95
+ "introduced", "unclear", "behind", "text", "motivation", "intent", "strategy"
96
+ }
97
+
98
+ TFIDF_BLOCK_PHRASES = {
99
+ "does specific", "provided text", "mentioned provided text", "appears procedural"
100
+ }
101
+
102
+ # -----------------------------
103
+ # CSS
104
+ # -----------------------------
105
+ st.markdown(
106
+ f"""
107
+ <style>
108
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap');
109
+
110
+ .block-container {{
111
+ padding-top: 1.2rem !important;
112
+ padding-bottom: 0.35rem !important;
113
+ padding-left: 0.8rem !important;
114
+ padding-right: 0.8rem !important;
115
+ }}
116
+
117
+ /* Hide Streamlit top decoration bar that clips content */
118
+ header[data-testid="stHeader"] {{
119
+ background: transparent !important;
120
+ height: 0rem !important;
121
+ }}
122
+ [data-testid="stToolbar"] {{
123
+ display: none !important;
124
+ }}
125
+ .main {{ background-color: #EEF2F3; }}
126
+ html, body, [class*="css"] {{ font-family: 'Inter', sans-serif; }}
127
+
128
+ .header-wrap {{
129
+ background: linear-gradient(90deg, {C_TRAPPED_DARKNESS} 0%, {C_LAZURITE_BLUE} 60%, {C_CEDAR_PLANK} 100%);
130
+ padding: 10px 14px;
131
+ border-radius: 12px;
132
+ margin: 6px 0 10px 0;
133
+ box-shadow: 0 2px 8px rgba(0,0,0,0.10);
134
+ }}
135
+ .header-title {{
136
+ color: #ffffff;
137
+ font-weight: 800;
138
+ font-size: 20px;
139
+ margin: 0;
140
+ line-height: 1.1;
141
+ }}
142
+ .header-sub {{
143
+ color: rgba(255,255,255,0.88);
144
+ font-size: 12px;
145
+ margin-top: 2px;
146
+ line-height: 1.2;
147
+ }}
148
+
149
+ .kpi-grid {{
150
+ display: grid;
151
+ grid-template-columns: 1.0fr 1.0fr 1.0fr 1.0fr 1.0fr;
152
+ gap: 10px;
153
+ margin-bottom: 10px;
154
+ }}
155
+ .kpi-card {{
156
+ background: #ffffff;
157
+ border: 1px solid #D6DEE0;
158
+ border-radius: 12px;
159
+ padding: 10px 12px;
160
+ box-shadow: 0 1px 6px rgba(0,0,0,0.06);
161
+ }}
162
+ .kpi-label {{
163
+ font-size: 11.5px;
164
+ font-weight: 650;
165
+ color: #5b6b71;
166
+ margin-bottom: 6px;
167
+ text-transform: uppercase;
168
+ letter-spacing: 0.2px;
169
+ }}
170
+ .kpi-value {{
171
+ font-size: 24px;
172
+ font-weight: 800;
173
+ color: {C_TRAPPED_DARKNESS};
174
+ line-height: 1.05;
175
+ }}
176
+
177
+ .filter-row {{
178
+ background:#ffffff;
179
+ border: 1px solid #D6DEE0;
180
+ border-radius: 12px;
181
+ padding: 8px 10px;
182
+ box-shadow: 0 1px 6px rgba(0,0,0,0.08);
183
+ margin-bottom: 10px;
184
+ }}
185
+
186
+ div[data-testid="stVerticalBlock"] > div {{ gap: 0.35rem; }}
187
+ </style>
188
+ """,
189
+ unsafe_allow_html=True,
190
+ )
191
+
192
+ # -----------------------------
193
+ # Helpers
194
+ # -----------------------------
195
+ def _find_first_existing(paths):
196
+ for p in paths:
197
+ if os.path.exists(p):
198
+ return p
199
+ return None
200
+
201
+
202
def load_dataset(path: str) -> pd.DataFrame:
    """Load a parquet or CSV file into a DataFrame.

    Parameters
    ----------
    path : str
        File path; the extension (case-insensitive) selects the reader.

    Raises
    ------
    ValueError
        If the extension is neither .parquet nor .csv.  The message now
        includes the offending path so upload failures are actionable
        (the original message was just "Unsupported file type").
    """
    lower = path.lower()
    if lower.endswith(".parquet"):
        return pd.read_parquet(path)
    if lower.endswith(".csv"):
        return pd.read_csv(path)
    raise ValueError(f"Unsupported file type: {path!r} (expected .parquet or .csv)")
208
+
209
+
210
def ensure_datetime(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Coerce `col` to datetime (invalid values become NaT) and drop unparseable rows.

    Works on a copy; the caller's frame is left untouched.
    """
    out = df.copy()
    out[col] = pd.to_datetime(out[col], errors="coerce")
    return out.dropna(subset=[col])
214
+
215
+
216
def add_time_grains(df: pd.DataFrame, date_col: str) -> pd.DataFrame:
    """Attach derived period columns: month ("YYYY-MM"), ISO week ("YYYY-Www"),
    calendar_month (1-12, used for the seasonal baseline), and year.

    Returns a copy; the input frame is not modified.
    """
    out = df.copy()
    dates = out[date_col]
    iso = dates.dt.isocalendar()
    out["month"] = dates.dt.to_period("M").astype(str)
    out["week"] = iso["year"].astype(str) + "-W" + iso["week"].astype(str).str.zfill(2)
    out["calendar_month"] = dates.dt.month
    out["year"] = dates.dt.year
    return out
225
+
226
+
227
def pct(n, d):
    """Percentage n/d rounded to one decimal place; 0.0 when the denominator is zero."""
    if d == 0:
        return 0.0
    return round(n / d * 100.0, 1)
229
+
230
+
231
+ def _split_listlike(x):
232
+ if pd.isna(x):
233
+ return []
234
+ if isinstance(x, list):
235
+ parts = [str(i) for i in x]
236
+ elif isinstance(x, str):
237
+ parts = [p.strip() for p in _SPLIT_RE.split(x) if p.strip()]
238
+ else:
239
+ parts = [str(x).strip()]
240
+ return [p for p in parts if p]
241
+
242
+
243
def safe_col(df, col):
    """True when `col` exists in `df` and holds at least one non-null value."""
    if col not in df.columns:
        return False
    return bool(df[col].notna().any())
245
+
246
+
247
def tight_layout(fig, height=360):
    """Apply the dashboard's compact white layout and shared grid styling to `fig`.

    Mutates and returns the same figure for call chaining.
    """
    axis_style = dict(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0")
    fig.update_layout(
        template=PLOTLY_TEMPLATE,
        height=height,
        margin=dict(l=8, r=8, t=8, b=8),
        plot_bgcolor="white",
        paper_bgcolor="white",
    )
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)
    return fig
258
+
259
+
260
def build_full_period_order(start_date, end_date, grain: str):
    """Ordered list of every period label between two dates.

    grain == "month" -> ["YYYY-MM", ...]; any other grain -> ISO week labels
    ("YYYY-Www") sampled on Mondays, falling back to the range's first day
    when no Monday falls inside it.
    """
    start, end = pd.to_datetime(start_date), pd.to_datetime(end_date)
    if grain == "month":
        return [str(p) for p in pd.period_range(start=start, end=end, freq="M")]
    mondays = pd.date_range(start=start, end=end, freq="W-MON")
    if len(mondays) == 0:
        # Degenerate range without a Monday: label the first day's ISO week.
        mondays = pd.date_range(start=start, end=end, freq="D")[:1]
    iso = mondays.isocalendar()
    labels = iso["year"].astype(str) + "-W" + iso["week"].astype(str).str.zfill(2)
    return labels.tolist()
270
+
271
+
272
def explode_terms(df: pd.DataFrame, col: str, stopwords=None, min_len=3):
    """Explode a list-like text column into one normalized term per row.

    Terms are lowercased, stripped of characters outside [a-z0-9 space -],
    filtered by `min_len` and `stopwords`, and tagged with a unit
    `mentions` column for later aggregation.  Returns an empty frame when
    `col` is absent.
    """
    if col not in df.columns:
        return pd.DataFrame()
    blocked = stopwords or set()
    exploded = df.copy()
    exploded["_term"] = exploded[col].apply(_split_listlike)
    exploded = exploded.explode("_term").dropna(subset=["_term"])
    cleaned = (
        exploded["_term"].astype(str).str.strip().str.lower()
        .str.replace(r"[^a-z0-9\s\-]", "", regex=True)
        .str.strip()
    )
    exploded["term"] = cleaned
    exploded = exploded[exploded["term"].str.len() >= min_len]
    exploded = exploded[~exploded["term"].isin(blocked)]
    exploded["mentions"] = 1
    return exploded.drop(columns=["_term"], errors="ignore")
286
+
287
+
288
+ # ---------- TF-IDF (contrastive) ----------
289
+ def _clean_text_for_tfidf(t: str) -> str:
290
+ t = str(t).lower()
291
+ t = re.sub(r'http\S+|www\.\S+', '', t)
292
+ t = re.sub(r"[^a-z0-9\s\-]", " ", t)
293
+ t = re.sub(r"\s+", " ", t).strip()
294
+ return t
295
+
296
+
297
def _term_is_bad(term: str) -> bool:
    """Heuristic filter for TF-IDF n-grams: True means 'drop this term'.

    Rejects empty terms, known generic/blocked phrases, n-grams outside
    2-3 tokens, terms containing blocked or boilerplate words, and terms
    dominated by stopwords.
    """
    term = term.strip().lower()
    if not term:
        return True
    if term in GENERIC_PHRASES:
        return True
    if term in TFIDF_BLOCK_PHRASES:
        return True
    words = term.split()
    # Only bigrams and trigrams are meaningful here.
    if not 2 <= len(words) <= 3:
        return True
    if any(w in TFIDF_BLOCK_WORDS for w in words):
        return True
    if all((w in STOPWORDS or len(w) < 3) for w in words):
        return True
    generic_words = {"relating", "regarding", "provide", "provides", "amend", "amends", "section", "subsection"}
    if any(w in generic_words for w in words):
        return True
    # Mostly-stopword phrases carry no signal.
    stop_hits = sum(1 for w in words if w in STOPWORDS)
    if len(words) > 1 and stop_hits / len(words) > 0.5:
        return True
    if len(words) == 2 and any(w in ["state", "bill", "act", "law"] for w in words):
        return True
    return False
321
+
322
+
323
def _bill_docs(df_slice: pd.DataFrame, bill_id_col: str, text_col: str):
    """One cleaned text document per bill (all of a bill's rows concatenated).

    Rows whose cleaned text is 15 characters or shorter are discarded;
    returns [] when nothing usable remains.
    """
    subset = df_slice[[bill_id_col, text_col]].dropna().copy()
    subset[text_col] = subset[text_col].astype(str).map(_clean_text_for_tfidf)
    subset = subset[subset[text_col].str.len() > 15]
    if subset.empty:
        return []
    grouped = subset.groupby(bill_id_col)[text_col].apply(lambda s: " ".join(s.tolist()))
    return [doc for doc in grouped.tolist() if doc.strip()]
332
+
333
+
334
def build_contrastive_tfidf(df_cat: pd.DataFrame, df_rest: pd.DataFrame, bill_id_col: str, text_col: str, top_k=15):
    """Distinctive 2-3 word phrases for one category versus everything else.

    Fits a single TF-IDF space over per-bill documents from both slices,
    ranks terms by (mean TF-IDF in category) - (mean TF-IDF in rest), then
    drops low-contrast, blocked, and near-duplicate phrases.  Returns up to
    `top_k` (term, contrast) pairs; [] when sklearn is unavailable or either
    side has fewer than two documents.
    """
    if not _HAS_SK:
        return []
    docs_cat = _bill_docs(df_cat, bill_id_col, text_col)
    docs_rest = _bill_docs(df_rest, bill_id_col, text_col)
    if len(docs_cat) < 2 or len(docs_rest) < 2:
        return []
    vec = TfidfVectorizer(
        stop_words=list(STOPWORDS),
        ngram_range=(2, 3),
        min_df=2,
        max_df=0.35,
        sublinear_tf=True,
        norm="l2",
        token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z\-]{3,}\b",
        max_features=2000
    )
    try:
        X = vec.fit_transform(docs_cat + docs_rest)
    except Exception:
        return []
    terms = np.array(vec.get_feature_names_out())
    if len(terms) == 0:
        return []
    # Mean TF-IDF per term on each side; positive contrast = category-specific.
    mean_cat = np.asarray(X[:len(docs_cat)].mean(axis=0)).ravel()
    mean_rest = np.asarray(X[len(docs_cat):].mean(axis=0)).ravel()
    contrast = mean_cat - mean_rest

    picked = []
    picked_word_sets = []
    min_contrast = 0.01
    for i in np.argsort(contrast)[::-1]:
        if len(picked) >= top_k:
            break
        if contrast[i] <= min_contrast:
            break  # terms are in descending order: everything after is weaker
        candidate = terms[i]
        if _term_is_bad(candidate):
            continue
        words = set(candidate.split())
        # Skip phrases sharing nearly all their words with one already picked.
        if any(len(words & prior) >= max(2, len(words) - 1) for prior in picked_word_sets):
            continue
        picked_word_sets.append(words)
        picked.append((candidate, round(float(contrast[i]), 4)))
    return picked
387
+
388
+
389
+ # ---------- Direction of change ----------
390
def classify_direction(sdf: pd.DataFrame, period_col: str, period_order: list, bill_col: str):
    """Classify a unique-bill time series as Rising / Declining / Stable.

    Returns (label, slope) where slope is the least-squares linear slope of
    bill counts across `period_order`; |slope| <= 0.10 counts as Stable.
    Empty or all-zero series are Stable with slope 0.0.
    """
    if sdf.empty:
        return ("Stable", 0.0)
    counts = (
        sdf.groupby(period_col)[bill_col].nunique()
        .reindex(period_order, fill_value=0)
        .to_numpy(dtype=float)
    )
    if counts.sum() == 0 or len(counts) < 2:
        return ("Stable", 0.0)
    slope = float(np.polyfit(np.arange(len(counts), dtype=float), counts, 1)[0])
    threshold = 0.10
    if slope > threshold:
        return ("Rising", slope)
    if slope < -threshold:
        return ("Declining", slope)
    return ("Stable", slope)
406
+
407
+
408
def short_list(items, n=3):
    """Comma-join the first `n` truthy items, appending '...' if truncated; '-' when empty."""
    kept = [item for item in items if item]
    if not kept:
        return "-"
    suffix = "..." if len(kept) > n else ""
    return ", ".join(kept[:n]) + suffix
413
+
414
+
415
+ # =====================================================
416
+ # STEP 1-8: Category Share Baseline & Z-Score Engine
417
+ # =====================================================
418
+
419
def compute_monthly_share_series(df_all: pd.DataFrame, category: str,
                                 cat_col: str, bill_id_col: str, date_col: str) -> pd.DataFrame:
    """STEP 1: monthly category share = unique bills in category / unique bills total.

    Computed over the full dataset (all years) so the seasonal baseline has
    maximum history.  Returns a frame sorted by month with columns
    [_ym, total_bills, cat_bills, share, year, calendar_month, period_str];
    share is a percentage in [0, 100].
    """
    work = df_all.copy()
    work["_ym"] = work[date_col].dt.to_period("M")
    work["_year"] = work[date_col].dt.year
    work["_cal_month"] = work[date_col].dt.month

    total_by_month = work.groupby("_ym")[bill_id_col].nunique().reset_index(name="total_bills")
    in_category = work[work[cat_col].astype(str) == str(category)]
    cat_by_month = in_category.groupby("_ym")[bill_id_col].nunique().reset_index(name="cat_bills")

    merged = pd.merge(total_by_month, cat_by_month, on="_ym", how="left").fillna(0)
    # Guard against 0-total months producing inf shares.
    merged["share"] = (merged["cat_bills"] / merged["total_bills"] * 100.0).replace([np.inf, -np.inf], 0).fillna(0)
    merged["year"] = merged["_ym"].dt.year
    merged["calendar_month"] = merged["_ym"].dt.month
    merged["period_str"] = merged["_ym"].astype(str)
    return merged.sort_values("_ym").reset_index(drop=True)
446
+
447
+
448
def compute_seasonal_baseline(share_series: pd.DataFrame, lookback_years: int = 5) -> pd.DataFrame:
    """STEP 2: per-calendar-month mean and std of share over the trailing window.

    The window keeps years strictly greater than (latest year - lookback_years),
    i.e. the most recent `lookback_years` years inclusive of the latest.
    std is 0.0 where only a single observation exists.
    """
    newest = share_series["year"].max()
    window = share_series[share_series["year"] > newest - lookback_years].copy()
    baseline = (
        window.groupby("calendar_month")["share"]
        .agg(mean_share="mean", std_share="std")
        .reset_index()
    )
    baseline["std_share"] = baseline["std_share"].fillna(0.0)
    return baseline
463
+
464
+
465
def compute_regression_on_share(share_series: pd.DataFrame,
                                total_monthly: pd.DataFrame) -> dict:
    """STEP 4 & 5: OLS trend of the category share and of total bill volume.

    Returns a dict with cat_slope / cat_pvalue / cat_intercept, total_slope,
    and significant_growth (True when p < 0.05 AND the category slope
    exceeds the total-volume slope).  Fewer than 3 share points yields the
    all-zero defaults with significant_growth False.
    """
    results = {"cat_slope": 0.0, "cat_pvalue": 1.0, "cat_intercept": 0.0,
               "total_slope": 0.0, "significant_growth": False}
    shares = share_series["share"].values
    if len(shares) < 3:
        return results
    try:
        fit = stats.linregress(np.arange(len(shares), dtype=float), shares)
        results["cat_slope"] = float(fit.slope)
        results["cat_intercept"] = float(fit.intercept)
        results["cat_pvalue"] = float(fit.pvalue)
    except Exception:
        pass  # keep neutral defaults on numerical failure
    # Trend of overall volume, used as the growth benchmark.
    if total_monthly is not None and len(total_monthly) >= 3:
        try:
            totals = total_monthly["total_bills"].values.astype(float)
            total_fit = stats.linregress(np.arange(len(totals), dtype=float), totals)
            results["total_slope"] = float(total_fit.slope)
        except Exception:
            pass
    # STEP 5: structural growth = statistically significant AND outpacing totals.
    results["significant_growth"] = (
        results["cat_pvalue"] < 0.05 and results["cat_slope"] > results["total_slope"]
    )
    return results
498
+
499
+
500
def compute_zscore_and_residuals(share_series: pd.DataFrame,
                                 baseline: pd.DataFrame,
                                 reg_stats: dict) -> pd.DataFrame:
    """STEP 3, 6 & 7: seasonal Z-scores, regression residuals, anomaly flags.

    Joins the seasonal baseline on calendar_month, then adds: z_score
    (share vs. seasonal mean/std; 0 where std is 0), predicted_share and
    residual (from the fitted regression), a ±2-residual-SD band
    (upper_2sd / lower_2sd), seasonal ±2 SD thresholds, and an `anomaly`
    label in {"High", "Low", "Normal"}.
    """
    merged = share_series.merge(baseline, on="calendar_month", how="left")

    # STEP 3: standardized deviation from the seasonal mean.
    merged["z_score"] = np.where(
        merged["std_share"] > 0,
        (merged["share"] - merged["mean_share"]) / merged["std_share"],
        0.0
    )

    # STEP 6: regression prediction and residual at each period index.
    idx = np.arange(len(merged), dtype=float)
    merged["predicted_share"] = reg_stats["cat_intercept"] + reg_stats["cat_slope"] * idx
    merged["residual"] = merged["share"] - merged["predicted_share"]

    # Residual spread drives the plotted band; needs >2 points for a std.
    res_std = merged["residual"].std() if len(merged) > 2 else 0.0
    merged["upper_2sd"] = merged["predicted_share"] + 2 * res_std
    merged["lower_2sd"] = merged["predicted_share"] - 2 * res_std

    # STEP 7: anomaly flags against the seasonal baseline ±2 SD.
    merged["upper_thresh"] = merged["mean_share"] + 2 * merged["std_share"]
    merged["lower_thresh"] = merged["mean_share"] - 2 * merged["std_share"]
    merged["anomaly"] = np.where(
        merged["share"] > merged["upper_thresh"], "High",
        np.where(merged["share"] < merged["lower_thresh"], "Low", "Normal")
    )
    return merged
530
+
531
+
532
+
533
+ # =====================================================
534
+ # STEP 9: Subcategory Momentum
535
+ # - Short-term: iterative % change over filtered period
536
+ # - Long-term: 5-year monthly regression slope from df_full
537
+ # =====================================================
538
+
539
def compute_subcategory_momentum(df_cat_filtered: pd.DataFrame,
                                 df_full_cat: pd.DataFrame,
                                 cat_sub_col: str,
                                 bill_id_col: str,
                                 period_col: str,
                                 period_order: list,
                                 date_col: str = "status_date") -> pd.DataFrame:
    """STEP 9: short- and long-term momentum per subcategory.

    - Short-term: mean of period-over-period % change in unique-bill counts
      over the user-filtered window (`period_order`).
    - Long-term: monthly OLS slope of unique-bill counts over the full
      history in `df_full_cat` (the primary "momentum strength" value).

    Fix: the date column used for the long-term monthly series was
    hard-coded to "status_date" even though every other column is a
    parameter; it is now the `date_col` parameter with the same default,
    so existing callers are unaffected.

    Returns a DataFrame [Subcategory, Slope, AvgPctChange] sorted by Slope
    ascending, or an empty DataFrame when inputs are unusable.
    """
    if not safe_col(df_cat_filtered, cat_sub_col):
        return pd.DataFrame()

    # --- Short-term: iterative pct change over the filtered period ---
    df_m = df_cat_filtered.copy()
    df_m[cat_sub_col] = df_m[cat_sub_col].astype(str).str.strip()
    sub_period = (
        df_m.dropna(subset=[cat_sub_col])
        .groupby([period_col, cat_sub_col])[bill_id_col].nunique()
        .reset_index(name="bills")
    )
    subs = sorted(df_m[cat_sub_col].dropna().unique().tolist())
    if len(period_order) < 2 or not subs or sub_period.empty:
        return pd.DataFrame()

    panel_short = (
        sub_period.pivot_table(index=cat_sub_col, columns=period_col, values="bills", aggfunc="sum")
        .reindex(index=subs, columns=period_order, fill_value=0)
    )

    short_term = {}
    for sub in panel_short.index:
        y = panel_short.loc[sub].to_numpy(dtype=float)
        pct_changes = []
        for i in range(1, len(y)):
            prev, curr = y[i - 1], y[i]
            # A zero previous period contributes 0 rather than dividing by zero.
            pct_changes.append((curr - prev) / prev * 100.0 if prev > 0 else 0.0)
        short_term[sub] = float(np.mean(pct_changes)) if pct_changes else 0.0

    # --- Long-term: monthly regression slope from the full dataset ---
    long_term = {}
    if df_full_cat is not None and not df_full_cat.empty and cat_sub_col in df_full_cat.columns:
        df_fl = df_full_cat.copy()
        df_fl[cat_sub_col] = df_fl[cat_sub_col].astype(str).str.strip()
        df_fl["_ym"] = df_fl[date_col].dt.to_period("M")
        full_sub_monthly = (
            df_fl.dropna(subset=[cat_sub_col])
            .groupby(["_ym", cat_sub_col])[bill_id_col].nunique()
            .reset_index(name="bills")
        )
        for sub in subs:
            sub_ts = (
                full_sub_monthly[full_sub_monthly[cat_sub_col] == sub]
                .sort_values("_ym")
            )
            y_full = sub_ts["bills"].to_numpy(dtype=float)
            if len(y_full) >= 3:
                x_full = np.arange(len(y_full), dtype=float)
                try:
                    slope_5yr = float(np.polyfit(x_full, y_full, 1)[0])
                except Exception:
                    slope_5yr = 0.0
            else:
                slope_5yr = 0.0  # too little history to fit a trend
            long_term[sub] = slope_5yr

    mom_rows = [(sub, long_term.get(sub, 0.0), short_term.get(sub, 0.0)) for sub in subs]
    return (
        pd.DataFrame(mom_rows, columns=["Subcategory", "Slope", "AvgPctChange"])
        .sort_values("Slope", ascending=True)
        .reset_index(drop=True)
    )
619
+
620
+
621
+ # =====================================================
622
+ # STEP 8: Category Share chart with baseline band
623
+ # =====================================================
624
+
625
def plot_category_share_with_baseline(analysis_df: pd.DataFrame,
                                      period_order_filter: list,
                                      significant_growth: bool) -> go.Figure:
    """STEP 8: observed share vs. seasonal baseline and regression trend.

    Draws the ±2 SD regression band, the 5-year seasonal mean (dashed), the
    regression trend (dotted), the observed share line, and anomaly markers
    (red = above +2 SD, blue = below -2 SD), restricted to the periods in
    `period_order_filter`.  Returns an empty figure when nothing matches.
    """
    plot_df = analysis_df[analysis_df["period_str"].isin(period_order_filter)].copy()
    if plot_df.empty:
        return go.Figure()

    fig = go.Figure()
    periods = plot_df["period_str"]

    # Shaded ±2 SD band around the regression line (drawn as a closed polygon).
    fig.add_trace(go.Scatter(
        x=list(periods) + list(periods)[::-1],
        y=list(plot_df["upper_2sd"]) + list(plot_df["lower_2sd"])[::-1],
        fill="toself",
        fillcolor="rgba(27,75,90,0.10)",
        line=dict(color="rgba(255,255,255,0)"),
        hoverinfo="skip",
        name="±2 SD Band",
        showlegend=True,
    ))

    # Seasonal 5-year mean baseline (dashed).
    fig.add_trace(go.Scatter(
        x=periods,
        y=plot_df["mean_share"],
        mode="lines",
        name="Seasonal Baseline (5yr mean)",
        line=dict(color="#90B4BE", dash="dash", width=2),
        hovertemplate="Seasonal Mean: %{y:.2f}%<extra></extra>",
    ))

    # Fitted regression trend (dotted).
    fig.add_trace(go.Scatter(
        x=periods,
        y=plot_df["predicted_share"],
        mode="lines",
        name="Regression Trend",
        line=dict(color=C_CEDAR_PLANK, dash="dot", width=1.5),
        hovertemplate="Predicted: %{y:.2f}%<extra></extra>",
    ))

    # Observed monthly share (solid dark line with markers).
    fig.add_trace(go.Scatter(
        x=periods,
        y=plot_df["share"],
        mode="lines+markers",
        name="Observed Share",
        line=dict(color=C_TRAPPED_DARKNESS, width=3),
        marker=dict(color=C_TRAPPED_DARKNESS, size=6),
        hovertemplate="<b>%{x}</b><br>Share: %{y:.2f}%<extra></extra>",
    ))

    # Anomaly markers: one trace per side, added only when that side has points.
    anomaly_specs = [
        ("High", "High Anomaly (>+2 SD)", C_PUMPING_SPICE, "HIGH"),
        ("Low", "Low Anomaly (<-2 SD)", C_LAZURITE_BLUE, "LOW"),
    ]
    for flag, trace_name, color, word in anomaly_specs:
        side = plot_df[plot_df["anomaly"] == flag]
        if side.empty:
            continue
        fig.add_trace(go.Scatter(
            x=side["period_str"],
            y=side["share"],
            mode="markers",
            name=trace_name,
            marker=dict(color=color, size=12, symbol="circle",
                        line=dict(color="white", width=1.5)),
            # Per-point hovertemplate: Z-score text varies per marker.
            hovertemplate="<b>%{x}</b><br>" + word + " anomaly: %{y:.2f}%<br>Z: " +
                          side["z_score"].round(2).astype(str) + "<extra></extra>",
        ))

    badge = " ★ Significant Structural Growth" if significant_growth else ""
    fig.update_layout(
        template=PLOTLY_TEMPLATE,
        height=420,
        margin=dict(l=8, r=8, t=28, b=8),
        hovermode="x unified",
        yaxis_title="Share (%)",
        xaxis_title="",
        plot_bgcolor="white",
        paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=-0.30, xanchor="center", x=0.5, font=dict(size=10)),
        title=dict(text=badge, font=dict(size=11, color=C_PUMPING_SPICE), x=0.5) if badge else {},
    )
    fig.update_xaxes(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0")
    fig.update_yaxes(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0",
                     range=[0, max(5, float(plot_df["share"].max()) * 1.25)])
    return fig
727
+
728
+
729
+ # -----------------------------
730
+ # Load data
731
+ # -----------------------------
732
+ default_path = _find_first_existing(DEFAULT_CANDIDATES)
733
+
734
+ if default_path is None:
735
+ uploaded = st.file_uploader("Upload Dataset", type=["parquet", "csv"])
736
+ else:
737
+ uploaded = None
738
+
739
+ data_path = None
740
+ data_sig = "default"
741
+
742
+ if uploaded:
743
+ tmp_path = f"/tmp/{uploaded.name}"
744
+ with open(tmp_path, "wb") as f:
745
+ f.write(uploaded.getbuffer())
746
+ data_path = tmp_path
747
+ data_sig = f"{uploaded.name}-{uploaded.size}"
748
+ elif default_path:
749
+ data_path = default_path
750
+ try:
751
+ data_sig = f"{default_path}-{os.path.getmtime(default_path)}"
752
+ except Exception:
753
+ data_sig = default_path
754
+
755
+
756
@st.cache_data(show_spinner=True)
def _load_cached(path: str, signature: str) -> pd.DataFrame:
    """Memoized dataset loader.

    `signature` is not read by the loader itself; it participates in the
    Streamlit cache key so a new upload (name/size) or a changed file
    mtime invalidates the cached frame.
    """
    frame = load_dataset(path)
    return frame
759
+
760
+
761
+ df_raw = _load_cached(data_path, data_sig) if data_path else None
762
+ if df_raw is None or df_raw.empty:
763
+ st.warning("Upload a dataset to begin.")
764
+ st.stop()
765
+
766
+ # -----------------------------
767
+ # Columns
768
+ # -----------------------------
769
+ DATE_COL = "status_date"
770
+ BILL_ID_COL = "bill_id"
771
+ CHAMBER_COL = "chamber"
772
+ CAT_MAIN = "category_main_label"
773
+ CAT_SUB = "category_sub_label"
774
+ INC_COL = "increasing_aspects_standardized"
775
+ DEC_COL = "decreasing_aspects_standardized"
776
+ BENEF_COL = "intended_beneficiaries_standardized"
777
+ IMPACT_SCORE_COL = "impact_rating_score"
778
+
779
+ KW_SOURCES = {
780
+ "Motivation": "motivation_raw",
781
+ "Intent": "intent_raw",
782
+ "Legislative Strategy": "legislative_strategy_raw",
783
+ }
784
+
785
+ required = [DATE_COL, BILL_ID_COL, CAT_MAIN]
786
+ missing = [c for c in required if c not in df_raw.columns]
787
+ if missing:
788
+ st.error(f"Missing required columns: {missing}")
789
+ st.stop()
790
+
791
+ # -----------------------------
792
+ # Engineer chamber from bill_number
793
+ # e.g. "HB1234" -> "HB", "SB5678" -> "SB"
794
+ # -----------------------------
795
+ BILL_NUM_COL = "bill_number"
796
+ df_raw = df_raw.copy()
797
+ if CHAMBER_COL not in df_raw.columns or df_raw[CHAMBER_COL].isna().all() or (df_raw[CHAMBER_COL].astype(str).str.strip() == "").all():
798
+ if BILL_NUM_COL in df_raw.columns:
799
+ df_raw[CHAMBER_COL] = (
800
+ df_raw[BILL_NUM_COL]
801
+ .astype(str)
802
+ .str.strip()
803
+ .str.extract(r"^([A-Za-z]+)", expand=False)
804
+ .str.upper()
805
+ .str[:2]
806
+ )
807
+ else:
808
+ df_raw[CHAMBER_COL] = "Unknown"
809
+
810
+ # Full dataset for baseline (all years) — use pd.Timestamp for reliable datetime comparison
811
+ df_full = ensure_datetime(df_raw, DATE_COL)
812
+ df_full = add_time_grains(df_full, DATE_COL)
813
+ df_full = df_full[
814
+ (df_full[DATE_COL] >= pd.Timestamp(DATA_START_FULL)) &
815
+ (df_full[DATE_COL] <= pd.Timestamp(DATA_END_FULL))
816
+ ].copy()
817
+
818
+ # Filtered display dataset — strictly clamped to display range
819
+ df = df_full[
820
+ (df_full[DATE_COL] >= pd.Timestamp(DATA_START)) &
821
+ (df_full[DATE_COL] <= pd.Timestamp(DATA_END))
822
+ ].copy()
823
+ df = df.dropna(subset=[CAT_MAIN]).copy()
824
+
825
+ if df.empty:
826
+ st.warning("No data in the display range (2025-01-08 to 2026-02-06).")
827
+ st.stop()
828
+
829
+ # -----------------------------
830
+ # Header
831
+ # -----------------------------
832
+ st.markdown(
833
+ """
834
+ <div class="header-wrap">
835
+ <div class="header-title">Legislative Trends Dashboard</div>
836
+ <div class="header-sub">Category share • Subcategory drivers • Policy direction • Subcategory momentum • Beneficiary × chamber distribution</div>
837
+ </div>
838
+ """,
839
+ unsafe_allow_html=True,
840
+ )
841
+
842
+ # -----------------------------
843
+ # Filters
844
+ # -----------------------------
845
+ if "clear_filters" not in st.session_state:
846
+ st.session_state.clear_filters = 0
847
+
848
+ min_date = DATA_START
849
+ max_date = DATA_END
850
+ cats_all = sorted(df[CAT_MAIN].dropna().astype(str).unique().tolist())
851
+
852
+ st.markdown("<div class='filter-row'>", unsafe_allow_html=True)
853
+ f1, f2, f3, f4, f5, f6 = st.columns([1.6, 0.9, 1.2, 2.4, 1.3, 0.7])
854
+
855
+ with f1:
856
+ date_range = st.date_input(
857
+ "Date Range",
858
+ value=(min_date, max_date),
859
+ min_value=min_date,
860
+ max_value=max_date,
861
+ key=f"date_{st.session_state.clear_filters}",
862
+ )
863
+
864
+ if isinstance(date_range, tuple) and len(date_range) == 2:
865
+ start_date, end_date = date_range
866
+ else:
867
+ start_date = date_range
868
+ end_date = date_range
869
+
870
+ with f2:
871
+ time_grain = st.radio(
872
+ "Time Grain",
873
+ ["month", "week"],
874
+ horizontal=True,
875
+ key=f"tg_{st.session_state.clear_filters}",
876
+ )
877
+ with f3:
878
+ chambers_all = ["All"] + sorted(df[CHAMBER_COL].dropna().astype(str).unique().tolist())
879
+ chambers = st.multiselect("Chamber", chambers_all, default=["All"], key=f"ch_{st.session_state.clear_filters}")
880
+ with f4:
881
+ selected_cat = st.selectbox(
882
+ "Category",
883
+ cats_all,
884
+ index=0 if cats_all else 0,
885
+ key=f"cat_{st.session_state.clear_filters}",
886
+ )
887
+ with f5:
888
+ sub_time = st.selectbox(
889
+ "Subcategory Window",
890
+ ["Overall", "Last 30 days", "Last 60 days"],
891
+ key=f"subwin_{st.session_state.clear_filters}",
892
+ )
893
+ with f6:
894
+ clear = st.button("CLEAR", use_container_width=True)
895
+
896
+ st.markdown("</div>", unsafe_allow_html=True)
897
+ if clear:
898
+ st.cache_data.clear()
899
+ st.session_state.clear_filters += 1
900
+ st.rerun()
901
+
902
+ # -----------------------------
903
+ # Apply filters
904
+ # -----------------------------
905
+ df_f = df.copy()
906
+ df_f = df_f[(df_f[DATE_COL].dt.date >= start_date) & (df_f[DATE_COL].dt.date <= end_date)]
907
+ if "All" not in chambers:
908
+ df_f = df_f[df_f[CHAMBER_COL].astype(str).isin([str(x) for x in chambers])]
909
+ df_f = df_f.dropna(subset=[CAT_MAIN])
910
+
911
+ if df_f.empty:
912
+ st.warning("No rows match your filters.")
913
+ st.stop()
914
+
915
+ tg = time_grain
916
+ period_col = tg
917
+ period_order = build_full_period_order(start_date, end_date, tg)
918
+
919
+ # -----------------------------
920
+ # KPI row
921
+ # -----------------------------
922
+ total_bills = int(df_f[BILL_ID_COL].nunique())
923
+ num_main = int(df_f[CAT_MAIN].nunique())
924
+
925
+ high_impact_bills = "-"
926
+ impact_tooltip = "High Impact = bills in the top quartile of impact_rating_score, when available."
927
+ if safe_col(df_f, IMPACT_SCORE_COL):
928
+ tmp = df_f[[BILL_ID_COL, IMPACT_SCORE_COL]].dropna()
929
+ if not tmp.empty:
930
+ bill_score = tmp.groupby(BILL_ID_COL)[IMPACT_SCORE_COL].max()
931
+ q75 = float(bill_score.quantile(0.75))
932
+ high_impact_bills = int((bill_score >= q75).sum())
933
+ else:
934
+ high_impact_bills = 0
935
+
936
+ ch_bill_counts = df_f.groupby(df_f[CHAMBER_COL].astype(str))[BILL_ID_COL].nunique()
937
+ total_ch_bills = int(ch_bill_counts.sum()) if len(ch_bill_counts) else 0
938
+ house_label = "HB" if "HB" in ch_bill_counts.index else (ch_bill_counts.index[0] if len(ch_bill_counts) else "HB")
939
+ senate_label = "SB" if "SB" in ch_bill_counts.index else (ch_bill_counts.index[1] if len(ch_bill_counts) > 1 else "SB")
940
+ house_pct = pct(int(ch_bill_counts.get(house_label, 0)), total_ch_bills) if total_ch_bills else 0.0
941
+ senate_pct = pct(int(ch_bill_counts.get(senate_label, 0)), total_ch_bills) if total_ch_bills else 0.0
942
+
943
+ st.markdown(
944
+ f"""
945
+ <div class="kpi-grid">
946
+ <div class="kpi-card">
947
+ <div class="kpi-label">Bills</div>
948
+ <div class="kpi-value">{total_bills:,}</div>
949
+ </div>
950
+ <div class="kpi-card" title="{impact_tooltip}">
951
+ <div class="kpi-label">High Impact Bills</div>
952
+ <div class="kpi-value">{high_impact_bills if high_impact_bills != "-" else "-"}</div>
953
+ </div>
954
+ <div class="kpi-card">
955
+ <div class="kpi-label">Categories</div>
956
+ <div class="kpi-value">{num_main:,}</div>
957
+ </div>
958
+ <div class="kpi-card">
959
+ <div class="kpi-label">{str(house_label)} Bills %</div>
960
+ <div class="kpi-value">{house_pct:.1f}%</div>
961
+ </div>
962
+ <div class="kpi-card">
963
+ <div class="kpi-label">{str(senate_label)} Bills %</div>
964
+ <div class="kpi-value">{senate_pct:.1f}%</div>
965
+ </div>
966
+ </div>
967
+ """,
968
+ unsafe_allow_html=True,
969
+ )
970
+
971
+ # =====================================================
972
+ # Manager Visual: Significant Category Shifts (Z-score)
973
+ # Left: ranked bar chart of categories beyond ±2σ
974
+ # Right: ranked interpretation table (directional)
975
+ # =====================================================
976
+
977
+ st.markdown("### Significant Category Shifts (vs Seasonal Baseline)")
978
+
979
def compute_all_category_monthly_shares(df_all: pd.DataFrame, cat_col: str, bill_id_col: str, date_col: str) -> pd.DataFrame:
    """
    Monthly share (%) of unique bills for every category.

    For each calendar month:
        share = (unique bills tagged with the category) / (unique bills overall) * 100

    Returns a chronologically sorted frame with columns:
        [_ym, Category, cat_bills, total_bills, share, period_str, year, calendar_month]
    """
    work = df_all.copy()
    work["_ym"] = work[date_col].dt.to_period("M")

    # Denominator: distinct bills seen in each month (rows with NaN category count here).
    monthly_totals = (
        work.groupby("_ym")[bill_id_col]
        .nunique()
        .rename("total_bills")
        .reset_index()
    )

    # Numerator: distinct bills per (month, category); uncategorized rows are excluded.
    per_cat = (
        work.loc[work[cat_col].notna()]
        .groupby(["_ym", cat_col])[bill_id_col]
        .nunique()
        .rename("cat_bills")
        .reset_index()
    )

    out = per_cat.merge(monthly_totals, on="_ym", how="left")
    ratio = out["cat_bills"] / out["total_bills"] * 100.0
    out["share"] = ratio.replace([np.inf, -np.inf], 0).fillna(0)
    out["period_str"] = out["_ym"].astype(str)

    # Join keys used downstream by the seasonal-baseline merge.
    out["year"] = out["_ym"].dt.year
    out["calendar_month"] = out["_ym"].dt.month

    return (
        out.rename(columns={cat_col: "Category"})
        .sort_values("_ym")
        .reset_index(drop=True)
    )
1009
+
1010
+
1011
def compute_category_seasonal_baseline_all(share_all: pd.DataFrame, lookback_years: int = 5) -> pd.DataFrame:
    """
    Seasonal baseline per (Category, calendar_month): mean and std of the
    monthly share over the most recent `lookback_years` years of data.

    Accepts frames carrying either a `_ym` monthly Period column or a
    `period_str` column ("YYYY-MM"); missing keys are derived so the
    groupby never raises a KeyError.

    Returns columns: Category, calendar_month, mean_share, std_share
    (std_share is 0.0 where only a single observation exists).
    """
    empty = pd.DataFrame(columns=["Category", "calendar_month", "mean_share", "std_share"])
    if share_all is None or share_all.empty:
        return empty

    work = share_all.copy()

    # Reconstruct the monthly period key if it was dropped upstream.
    if "_ym" not in work.columns:
        if "period_str" not in work.columns:
            return empty
        work["_ym"] = pd.PeriodIndex(work["period_str"], freq="M")

    if "calendar_month" not in work.columns:
        work["calendar_month"] = work["_ym"].dt.month

    # Always rebuild the year from _ym so a stale column can't skew the cutoff.
    work["year"] = work["_ym"].dt.year

    newest = int(work["year"].max()) if not work.empty else 0
    recent = work[work["year"] > newest - lookback_years]

    stats = (
        recent.groupby(["Category", "calendar_month"])["share"]
        .agg(mean_share="mean", std_share="std")
        .reset_index()
    )
    stats["std_share"] = stats["std_share"].fillna(0.0)
    return stats
1046
+
1047
+
1048
+ # --- Build monthly shares across FULL data for baseline ---
1049
+ share_all = compute_all_category_monthly_shares(df_full, CAT_MAIN, BILL_ID_COL, DATE_COL)
1050
+ baseline_all = compute_category_seasonal_baseline_all(share_all, lookback_years=5)
1051
+
1052
+ # --- Choose the "current" month for the selected time window (end_date month) ---
1053
+ target_ym = pd.to_datetime(end_date).to_period("M")
1054
+ target_period_str = str(target_ym)
1055
+
1056
+ current_month = share_all[share_all["_ym"] == target_ym].copy()
1057
+ if current_month.empty:
1058
+ st.info("No monthly data available for the selected end date month to compute Z-scores.")
1059
+ else:
1060
+ # Ensure calendar_month exists for the merge key
1061
+ if "calendar_month" not in current_month.columns:
1062
+ if "_ym" in current_month.columns:
1063
+ current_month["calendar_month"] = current_month["_ym"].dt.month
1064
+ elif "period_str" in current_month.columns:
1065
+ current_month["_ym"] = pd.PeriodIndex(current_month["period_str"], freq="M")
1066
+ current_month["calendar_month"] = current_month["_ym"].dt.month
1067
+ else:
1068
+ current_month["calendar_month"] = int(pd.to_datetime(end_date).month)
1069
+
1070
+ # Join baseline (Category x calendar_month) and compute Z-score
1071
+ current_month = current_month.merge(
1072
+ baseline_all,
1073
+ on=["Category", "calendar_month"],
1074
+ how="left"
1075
+ )
1076
+
1077
+ current_month["z_score"] = np.where(
1078
+ current_month["std_share"] > 0,
1079
+ (current_month["share"] - current_month["mean_share"]) / current_month["std_share"],
1080
+ 0.0
1081
+ )
1082
+
1083
+ # Optional % change vs baseline mean (relative)
1084
+ current_month["pct_change_vs_mean"] = np.where(
1085
+ current_month["mean_share"] > 0,
1086
+ (current_month["share"] - current_month["mean_share"]) / current_month["mean_share"] * 100.0,
1087
+ 0.0
1088
+ )
1089
+
1090
+ # Only include categories beyond ±2σ threshold
1091
+ shifts_sig = current_month[current_month["z_score"].abs() >= 2.0].copy()
1092
+
1093
+ # --- Fallback: if none exceed ±2σ, show the largest movers by |Z| (clearly labeled) ---
1094
+ show_fallback = False
1095
+ if shifts_sig.empty:
1096
+ show_fallback = True
1097
+ st.info(f"No categories exceeded ±2σ in {target_period_str}. Showing largest movers instead (not statistically significant).")
1098
+ shifts = current_month.copy()
1099
+ shifts["abs_z"] = shifts["z_score"].abs()
1100
+ shifts = shifts.sort_values("abs_z", ascending=False).head(12)
1101
+ else:
1102
+ shifts = shifts_sig.copy()
1103
+ shifts["abs_z"] = shifts["z_score"].abs()
1104
+ shifts = shifts.sort_values("abs_z", ascending=False)
1105
+
1106
+ # Color-coded: Blue above baseline, Red below baseline
1107
+ shifts["Color"] = np.where(shifts["z_score"] >= 0, "Above baseline", "Below baseline")
1108
+ color_map_shift = {"Above baseline": C_POSITIVE, "Below baseline": C_NEGATIVE}
1109
+
1110
+ left_col, right_col = st.columns([1.55, 1.0])
1111
+
1112
+ with left_col:
1113
+ st.markdown("**What’s moving the most?**")
1114
+ fig_shift = px.bar(
1115
+ shifts.iloc[::-1], # reverse so biggest appears at top in horizontal bar
1116
+ x="z_score",
1117
+ y="Category",
1118
+ orientation="h",
1119
+ color="Color",
1120
+ color_discrete_map=color_map_shift,
1121
+ template=PLOTLY_TEMPLATE,
1122
+ custom_data=["share", "mean_share", "std_share", "pct_change_vs_mean", "period_str"],
1123
+ labels={"z_score": "Z-score (σ from baseline)", "Category": ""},
1124
+ )
1125
+ fig_shift.update_traces(
1126
+ hovertemplate=(
1127
+ "<b>%{y}</b><br>"
1128
+ "Z-score: %{x:.2f}<br>"
1129
+ "Current share: %{customdata[0]:.2f}%<br>"
1130
+ "Baseline mean: %{customdata[1]:.2f}%<br>"
1131
+ "Baseline std: %{customdata[2]:.2f}<br>"
1132
+ "% change vs mean: %{customdata[3]:.1f}%<br>"
1133
+ "Month: %{customdata[4]}<extra></extra>"
1134
+ )
1135
+ )
1136
+ fig_shift = tight_layout(fig_shift, height=max(420, len(shifts) * 28 + 180))
1137
+ fig_shift.update_yaxes(showgrid=False)
1138
+ fig_shift.update_xaxes(zeroline=True, zerolinecolor="#C9D3D6")
1139
+ st.plotly_chart(fig_shift, use_container_width=True, config={"displayModeBar": False})
1140
+
1141
+ if show_fallback:
1142
+ st.caption("Largest movers shown because none crossed the ±2σ significance threshold.")
1143
+
1144
+ with right_col:
1145
+ st.markdown("**Current Significant Shifts**" if not show_fallback else "**Largest Movers (Below ±2σ)**")
1146
+
1147
+ # Directional ranking (NOT absolute):
1148
+ # - Top: largest positive deviations
1149
+ # - Bottom: largest negative deviations
1150
+ pos = shifts[shifts["z_score"] > 0].sort_values("z_score", ascending=False).copy()
1151
+ neg = shifts[shifts["z_score"] < 0].sort_values("z_score", ascending=True).copy()
1152
+
1153
+ def _mk_panel(df_part: pd.DataFrame, arrow: str):
1154
+ if df_part.empty:
1155
+ return pd.DataFrame(columns=["Category", "Direction", "Z-Score", "% Change", "Time Window"])
1156
+ out = df_part[["Category", "z_score", "pct_change_vs_mean"]].copy()
1157
+ out["Direction"] = arrow
1158
+ out["Z-Score"] = out["z_score"].round(2)
1159
+ out["% Change"] = out["pct_change_vs_mean"].round(1)
1160
+ out["Time Window"] = f"{target_period_str}"
1161
+ out = out.drop(columns=["z_score", "pct_change_vs_mean"])
1162
+ return out
1163
+
1164
+ panel_pos = _mk_panel(pos, "↑")
1165
+ panel_neg = _mk_panel(neg, "↓")
1166
+
1167
+ panel = pd.concat([panel_pos, panel_neg], axis=0).reset_index(drop=True)
1168
+ panel.insert(0, "Rank", np.arange(1, len(panel) + 1))
1169
+
1170
+ st.dataframe(panel, use_container_width=True, height=380)
1171
+ st.caption("Directional ranking: accelerators (↑) first, contractions (↓) last. Threshold: |Z| ≥ 2 (fallback shows top movers if none qualify).")
1172
+
1173
+ # -----------------------------
1174
+ # Category ranking
1175
+ # -----------------------------
1176
+ st.markdown("### Category Ranking")
1177
+ cat_rank = (
1178
+ df_f.groupby(CAT_MAIN)[BILL_ID_COL].nunique()
1179
+ .sort_values(ascending=False)
1180
+ .reset_index(name="Bills")
1181
+ .head(20)
1182
+ )
1183
+
1184
+ cat_hover_dir = []
1185
+ cat_hover_impact = []
1186
+ for cat in cat_rank[CAT_MAIN].astype(str).tolist():
1187
+ sdf = df_f[df_f[CAT_MAIN].astype(str) == str(cat)].copy()
1188
+ direction, _slope = classify_direction(sdf, period_col, period_order, BILL_ID_COL)
1189
+ if safe_col(sdf, IMPACT_SCORE_COL):
1190
+ bmax = sdf[[BILL_ID_COL, IMPACT_SCORE_COL]].dropna().groupby(BILL_ID_COL)[IMPACT_SCORE_COL].max()
1191
+ avg_imp = float(bmax.mean()) if len(bmax) else float("nan")
1192
+ else:
1193
+ avg_imp = float("nan")
1194
+ cat_hover_dir.append(direction)
1195
+ cat_hover_impact.append(None if np.isnan(avg_imp) else round(avg_imp, 2))
1196
+
1197
+ cat_rank2 = cat_rank.copy()
1198
+ cat_rank2["Direction"] = cat_hover_dir
1199
+ cat_rank2["AvgImpact"] = cat_hover_impact
1200
+
1201
+ fig_rank = px.bar(
1202
+ cat_rank2.iloc[::-1],
1203
+ x="Bills",
1204
+ y=CAT_MAIN,
1205
+ orientation="h",
1206
+ labels={"Bills": "Bills", CAT_MAIN: ""},
1207
+ template=PLOTLY_TEMPLATE,
1208
+ custom_data=["Direction", "AvgImpact"],
1209
+ )
1210
+ fig_rank.update_traces(
1211
+ marker_color=C_LAZURITE_BLUE,
1212
+ hovertemplate=(
1213
+ "<b>%{y}</b><br>"
1214
+ "Bills: %{x}<br>"
1215
+ "Direction: %{customdata[0]}<br>"
1216
+ "Avg Political Impact: %{customdata[1]}<extra></extra>"
1217
+ )
1218
+ )
1219
+ fig_rank = tight_layout(fig_rank, height=420)
1220
+ fig_rank.update_yaxes(showgrid=False)
1221
+ st.plotly_chart(fig_rank, use_container_width=True, config={"displayModeBar": False})
1222
+
1223
+ # =====================================================
1224
+ # Row 1: Category Share (with Baseline) + Subcategory Drivers
1225
+ # =====================================================
1226
+ df_cat = df_f[df_f[CAT_MAIN].astype(str) == str(selected_cat)].copy()
1227
+
1228
+ # --- STEP 1-8: Compute full share series on entire df_full for baseline ---
1229
+ share_series_full = compute_monthly_share_series(
1230
+ df_full, selected_cat, CAT_MAIN, BILL_ID_COL, DATE_COL
1231
+ )
1232
+ seasonal_baseline = compute_seasonal_baseline(share_series_full, lookback_years=5)
1233
+
1234
+ total_monthly_full = (
1235
+ df_full.groupby(df_full[DATE_COL].dt.to_period("M"))[BILL_ID_COL]
1236
+ .nunique()
1237
+ .reset_index(name="total_bills")
1238
+ )
1239
+ reg_stats = compute_regression_on_share(share_series_full, total_monthly_full)
1240
+ analysis_df = compute_zscore_and_residuals(share_series_full, seasonal_baseline, reg_stats)
1241
+
1242
+ # Build period order strings for filter alignment (months only for baseline chart)
1243
+ month_period_order = build_full_period_order(start_date, end_date, "month")
1244
+
1245
+ # --- Subcategory section ---
1246
+ df_sub = df_cat.copy()
1247
+ cutoff = None
1248
+ if sub_time != "Overall":
1249
+ days = 30 if sub_time == "Last 30 days" else 60
1250
+ cutoff = (pd.to_datetime(end_date) - pd.Timedelta(days=days)).date()
1251
+ df_sub = df_sub[df_sub[DATE_COL].dt.date >= cutoff]
1252
+
1253
+ emerging_map = {}
1254
+ if cutoff is not None and safe_col(df_cat, CAT_SUB):
1255
+ before = df_cat[df_cat[DATE_COL].dt.date < cutoff]
1256
+ before_set = set(before[CAT_SUB].dropna().astype(str).unique().tolist())
1257
+ window_set = set(df_sub[CAT_SUB].dropna().astype(str).unique().tolist())
1258
+ for s in window_set:
1259
+ emerging_map[s] = (s not in before_set)
1260
+
1261
+ sub_ct = pd.DataFrame()
1262
+ if CAT_SUB in df_sub.columns and not df_sub[CAT_SUB].isna().all():
1263
+ sub_ct = (
1264
+ df_sub.dropna(subset=[CAT_SUB])
1265
+ .groupby(CAT_SUB)[BILL_ID_COL].nunique()
1266
+ .reset_index(name="Bills")
1267
+ .sort_values("Bills", ascending=False)
1268
+ .head(12)
1269
+ )
1270
+
1271
+ hover_dir = []
1272
+ hover_impact = []
1273
+ for sub in sub_ct[CAT_SUB].astype(str).tolist():
1274
+ sdf = df_sub[df_sub[CAT_SUB].astype(str) == str(sub)].copy()
1275
+ direction, _slope = classify_direction(sdf, period_col, period_order, BILL_ID_COL)
1276
+ if cutoff is not None and emerging_map.get(sub, False):
1277
+ direction = "Emerging"
1278
+ if safe_col(sdf, IMPACT_SCORE_COL):
1279
+ bmax = sdf[[BILL_ID_COL, IMPACT_SCORE_COL]].dropna().groupby(BILL_ID_COL)[IMPACT_SCORE_COL].max()
1280
+ avg_imp = float(bmax.mean()) if len(bmax) else float("nan")
1281
+ else:
1282
+ avg_imp = float("nan")
1283
+ hover_dir.append(direction)
1284
+ hover_impact.append(None if np.isnan(avg_imp) else round(avg_imp, 2))
1285
+
1286
+ sub_ct2 = sub_ct.copy()
1287
+ sub_ct2["Direction"] = hover_dir
1288
+ sub_ct2["AvgImpact"] = hover_impact
1289
+
1290
+ r1a, r1b = st.columns(2)
1291
+
1292
+ with r1a:
1293
+ st.markdown("### Category Share Over Time")
1294
+ fig_share = plot_category_share_with_baseline(
1295
+ analysis_df, month_period_order, reg_stats["significant_growth"]
1296
+ )
1297
+ if fig_share.data:
1298
+ st.plotly_chart(fig_share, use_container_width=True, config={"displayModeBar": False})
1299
+ else:
1300
+ st.info("No share data available for this selection.")
1301
+
1302
+ with r1b:
1303
+ st.markdown("### Subcategory Drivers")
1304
+ if sub_ct2.empty:
1305
+ st.info("No subcategory data available for this selection/window.")
1306
+ else:
1307
+ show = sub_ct2.sort_values("Bills", ascending=True)
1308
+ fig = px.bar(
1309
+ show,
1310
+ x="Bills",
1311
+ y=CAT_SUB,
1312
+ orientation="h",
1313
+ template=PLOTLY_TEMPLATE,
1314
+ labels={"Bills": "Bills", CAT_SUB: ""},
1315
+ custom_data=["Direction", "AvgImpact"],
1316
+ )
1317
+ fig.update_traces(
1318
+ marker_color=C_PUMPING_SPICE,
1319
+ hovertemplate=(
1320
+ "<b>%{y}</b><br>"
1321
+ "Bills: %{x}<br>"
1322
+ "Direction: %{customdata[0]}<br>"
1323
+ "Avg Political Impact: %{customdata[1]}<extra></extra>"
1324
+ ),
1325
+ )
1326
+ fig = tight_layout(fig, height=420)
1327
+ fig.update_yaxes(showgrid=False)
1328
+ st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False})
1329
+
1330
+ # -----------------------------
1331
+ # Row 2: Policy Direction + TF-IDF
1332
+ # -----------------------------
1333
+ r2a, r2b = st.columns([1.55, 1.0])
1334
+
1335
+ inc_terms = explode_terms(df_f, INC_COL, stopwords=STOPWORDS, min_len=3)
1336
+ dec_terms = explode_terms(df_f, DEC_COL, stopwords=STOPWORDS, min_len=3)
1337
+
1338
+ if not inc_terms.empty:
1339
+ inc_terms["_period"] = inc_terms[period_col]
1340
+ if not dec_terms.empty:
1341
+ dec_terms["_period"] = dec_terms[period_col]
1342
+
1343
+ inc_ts = inc_terms.groupby([period_col])["mentions"].sum().reindex(period_order, fill_value=0).reset_index()
1344
+ inc_ts.columns = [period_col, "inc"]
1345
+ dec_ts = dec_terms.groupby([period_col])["mentions"].sum().reindex(period_order, fill_value=0).reset_index()
1346
+ dec_ts.columns = [period_col, "dec"]
1347
+
1348
+ net_ts = pd.merge(inc_ts, dec_ts, on=period_col, how="left").fillna(0)
1349
+ net_ts["net"] = net_ts["inc"] - net_ts["dec"]
1350
+
1351
+ df_inc_rows = df_f[df_f[INC_COL].notna()].copy() if INC_COL in df_f.columns else pd.DataFrame(columns=df_f.columns)
1352
+ df_dec_rows = df_f[df_f[DEC_COL].notna()].copy() if DEC_COL in df_f.columns else pd.DataFrame(columns=df_f.columns)
1353
+ df_inc_rows["_period"] = df_inc_rows[period_col] if not df_inc_rows.empty else []
1354
+ df_dec_rows["_period"] = df_dec_rows[period_col] if not df_dec_rows.empty else []
1355
+
1356
+
1357
def top_keywords_for_period(term_df: pd.DataFrame, period_value, k=6) -> pd.DataFrame:
    """Top-k most frequent terms within a single time period.

    Counts rows per term via value_counts — i.e. one count per extracted
    term occurrence; the frame's `mentions` column is not aggregated here.
    Returns an empty frame when there is no data for the period, otherwise
    a two-column frame: Keyword, Mentions.
    """
    if term_df is None or term_df.empty:
        return pd.DataFrame()
    in_period = term_df.loc[term_df["_period"] == period_value]
    if in_period.empty:
        return pd.DataFrame()
    ranked = in_period["term"].value_counts().head(k).reset_index()
    ranked.columns = ["Keyword", "Mentions"]
    return ranked
1366
+
1367
+
1368
def top_beneficiaries_for_period(df_rows: pd.DataFrame, period_value, benef_col: str, k=6) -> pd.DataFrame:
    """Top-k beneficiaries mentioned within one time period.

    Splits the list-like beneficiary column with `_split_listlike`, explodes
    to one row per beneficiary, strips whitespace, drops blanks, and counts
    occurrences. Returns an empty frame when the column is missing or the
    period has no usable values; otherwise columns are Beneficiary, Mentions.
    """
    if benef_col not in df_rows.columns or df_rows.empty:
        return pd.DataFrame()
    in_period = df_rows.loc[df_rows["_period"] == period_value].copy()
    if in_period.empty or in_period[benef_col].dropna().empty:
        return pd.DataFrame()
    in_period["_b"] = in_period[benef_col].apply(_split_listlike)
    exploded = in_period.explode("_b").dropna(subset=["_b"])
    exploded["_b"] = exploded["_b"].astype(str).str.strip()
    exploded = exploded[exploded["_b"].str.len() > 0]
    if exploded.empty:
        return pd.DataFrame()
    ranked = exploded["_b"].value_counts().head(k).reset_index()
    ranked.columns = ["Beneficiary", "Mentions"]
    return ranked
1383
+
1384
+
1385
+ inc_kw_short, dec_kw_short, inc_b_short, dec_b_short = [], [], [], []
1386
+ for p in net_ts[period_col].tolist():
1387
+ inc_kw = top_keywords_for_period(inc_terms, p, k=6)
1388
+ dec_kw = top_keywords_for_period(dec_terms, p, k=6)
1389
+ inc_b = top_beneficiaries_for_period(df_inc_rows, p, BENEF_COL, k=6) if safe_col(df_f, BENEF_COL) else pd.DataFrame()
1390
+ dec_b = top_beneficiaries_for_period(df_dec_rows, p, BENEF_COL, k=6) if safe_col(df_f, BENEF_COL) else pd.DataFrame()
1391
+ inc_kw_short.append(short_list(inc_kw["Keyword"].tolist() if not inc_kw.empty else [], 3))
1392
+ dec_kw_short.append(short_list(dec_kw["Keyword"].tolist() if not dec_kw.empty else [], 3))
1393
+ inc_b_short.append(short_list(inc_b["Beneficiary"].tolist() if not inc_b.empty else [], 2))
1394
+ dec_b_short.append(short_list(dec_b["Beneficiary"].tolist() if not dec_b.empty else [], 2))
1395
+
1396
+ net_ts["inc_kw_short"] = inc_kw_short
1397
+ net_ts["dec_kw_short"] = dec_kw_short
1398
+ net_ts["inc_b_short"] = inc_b_short
1399
+ net_ts["dec_b_short"] = dec_b_short
1400
+
1401
+ with r2a:
1402
+ st.markdown("### Policy Direction Over Time")
1403
+ if net_ts.empty or (net_ts["inc"].sum() == 0 and net_ts["dec"].sum() == 0):
1404
+ st.info("No increasing/decreasing aspects available under current filters.")
1405
+ else:
1406
+ custom = np.stack(
1407
+ [
1408
+ net_ts["inc_kw_short"].astype(str),
1409
+ net_ts["dec_kw_short"].astype(str),
1410
+ net_ts["inc_b_short"].astype(str),
1411
+ net_ts["dec_b_short"].astype(str),
1412
+ ],
1413
+ axis=1,
1414
+ )
1415
+ fig = go.Figure()
1416
+ fig.add_trace(go.Bar(
1417
+ x=net_ts[period_col],
1418
+ y=net_ts["inc"],
1419
+ name="Increasing",
1420
+ marker_color=C_POSITIVE,
1421
+ customdata=custom,
1422
+ hovertemplate="<b>%{x}</b><br>Increasing: %{y}<br>Keywords: %{customdata[0]}<br>Beneficiaries: %{customdata[2]}<extra></extra>",
1423
+ ))
1424
+ fig.add_trace(go.Bar(
1425
+ x=net_ts[period_col],
1426
+ y=-net_ts["dec"],
1427
+ name="Decreasing",
1428
+ marker_color=C_NEGATIVE,
1429
+ customdata=custom,
1430
+ hovertemplate="<b>%{x}</b><br>Decreasing: %{y:.0f}<br>Keywords: %{customdata[1]}<br>Beneficiaries: %{customdata[3]}<extra></extra>",
1431
+ ))
1432
+ fig.add_trace(go.Scatter(
1433
+ x=net_ts[period_col],
1434
+ y=net_ts["net"],
1435
+ mode="lines+markers",
1436
+ name="Net",
1437
+ line=dict(color=C_TRAPPED_DARKNESS, width=2),
1438
+ hovertemplate="<b>%{x}</b><br>Net: %{y}<extra></extra>",
1439
+ ))
1440
+ fig.update_layout(
1441
+ template=PLOTLY_TEMPLATE,
1442
+ barmode="relative",
1443
+ height=420,
1444
+ margin=dict(l=8, r=8, t=8, b=8),
1445
+ hovermode="x unified",
1446
+ legend=dict(orientation="h", yanchor="bottom", y=-0.22, xanchor="center", x=0.5),
1447
+ yaxis_title="Mentions",
1448
+ xaxis_title="",
1449
+ plot_bgcolor="white",
1450
+ paper_bgcolor="white",
1451
+ )
1452
+ fig.update_xaxes(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0")
1453
+ fig.update_yaxes(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0",
1454
+ zeroline=True, zerolinecolor="#C9D3D6")
1455
+ st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False})
1456
+
1457
+ with r2b:
1458
+ st.markdown("### Top Keywords")
1459
+ tfidf_mode = st.selectbox(
1460
+ "TF-IDF Source",
1461
+ ["Motivation", "Intent", "Legislative Strategy"],
1462
+ index=0
1463
+ )
1464
+ tfidf_text_col = KW_SOURCES[tfidf_mode]
1465
+ if not safe_col(df_f, tfidf_text_col):
1466
+ st.info(f"Column `{tfidf_text_col}` not available.")
1467
+ else:
1468
+ df_rest = df_f[df_f[CAT_MAIN].astype(str) != str(selected_cat)].copy()
1469
+ tf_phrases = build_contrastive_tfidf(df_cat, df_rest, BILL_ID_COL, tfidf_text_col, top_k=15)
1470
+ if not tf_phrases:
1471
+ st.info("TF-IDF returned no meaningful category-distinct phrases for this slice.")
1472
+ else:
1473
+ kw_tbl = pd.DataFrame(tf_phrases, columns=["Keyword", "Distinctiveness Score"])
1474
+ kw_tbl.index = np.arange(1, len(kw_tbl) + 1)
1475
+ st.dataframe(kw_tbl, use_container_width=True, height=300)
1476
+
1477
+ # =====================================================
1478
+ # Row 3: Subcategory Momentum (STEP 9) + Heatmap (STEP 10)
1479
+ # =====================================================
1480
+ st.markdown("")
1481
+ r3a, r3b = st.columns(2)
1482
+
1483
+ with r3a:
1484
+ st.markdown("### Subcategory Momentum")
1485
+
1486
+ # df_full_cat: full historical data for selected category (for 5-yr slope)
1487
+ df_full_cat = df_full[df_full[CAT_MAIN].astype(str) == str(selected_cat)].copy()
1488
+
1489
+ # STEP 9: pass both filtered df_cat (short-term pct change) and df_full_cat (5-yr slope)
1490
+ sub_momentum = compute_subcategory_momentum(
1491
+ df_cat, df_full_cat, CAT_SUB, BILL_ID_COL, period_col, period_order
1492
+ )
1493
+
1494
+ if sub_momentum.empty:
1495
+ st.info("Not enough data to compute momentum.")
1496
+ else:
1497
+ # SlopeScaled = 5-year regression slope * 100 (momentum strength)
1498
+ sub_momentum["SlopeScaled"] = sub_momentum["Slope"] * 100.0
1499
+ sub_momentum["SlopeScaled"] = pd.to_numeric(sub_momentum["SlopeScaled"], errors="coerce").fillna(0.0)
1500
+
1501
+ eps = 1e-4
1502
+ sub_momentum["Direction"] = np.where(
1503
+ sub_momentum["SlopeScaled"] > eps, "Rising",
1504
+ np.where(sub_momentum["SlopeScaled"] < -eps, "Falling", "Stable")
1505
+ )
1506
+
1507
+ # Show top movers by absolute slope + top stable
1508
+ movers = sub_momentum[sub_momentum["Direction"] != "Stable"].copy()
1509
+ stable = sub_momentum[sub_momentum["Direction"] == "Stable"].copy()
1510
+ movers = movers.reindex(movers["SlopeScaled"].abs().sort_values(ascending=False).index).head(10)
1511
+ stable = stable.head(5)
1512
+ show_df = pd.concat([movers, stable], axis=0).drop_duplicates("Subcategory")
1513
+ if show_df.empty:
1514
+ show_df = sub_momentum.head(12).copy()
1515
+
1516
+ # Ensure bars are always visible (min visible length)
1517
+ min_visible = 0.20
1518
+ show_df = show_df.copy()
1519
+ show_df["DisplaySlope"] = show_df["SlopeScaled"]
1520
+ near_zero = show_df["DisplaySlope"].abs() < min_visible
1521
+ show_df.loc[near_zero & (show_df["Direction"] == "Rising"), "DisplaySlope"] = min_visible
1522
+ show_df.loc[near_zero & (show_df["Direction"] == "Falling"), "DisplaySlope"] = -min_visible
1523
+ show_df.loc[near_zero & (show_df["Direction"] == "Stable"), "DisplaySlope"] = min_visible * 0.6
1524
+ show_df = show_df.sort_values("DisplaySlope", ascending=True)
1525
+
1526
+ show_df["AvgPctChange"] = pd.to_numeric(show_df["AvgPctChange"], errors="coerce").fillna(0.0)
1527
+ show_df["SlopeScaled"] = pd.to_numeric(show_df["SlopeScaled"], errors="coerce").fillna(0.0)
1528
+
1529
+
1530
+ color_map = {"Rising": C_POSITIVE, "Falling": C_NEGATIVE, "Stable": C_STABLE}
1531
+ fig = px.bar(
1532
+ show_df,
1533
+ x="DisplaySlope",
1534
+ y="Subcategory",
1535
+ color="Direction",
1536
+ orientation="h",
1537
+ color_discrete_map=color_map,
1538
+ template=PLOTLY_TEMPLATE,
1539
+ custom_data=["SlopeScaled", "Direction", "AvgPctChange"],
1540
+ labels={"DisplaySlope": "5-Yr Momentum Slope (x100)", "Subcategory": ""},
1541
+ )
1542
+ fig.update_traces(
1543
+ hovertemplate=(
1544
+ "<b>%{y}</b><br>"
1545
+ "Direction: %{customdata[1]}<br>"
1546
+ "5-Yr Regression Slope (x100): %{customdata[0]:.3f}<br>"
1547
+ "Short-Term Avg % Change: %{customdata[2]:.1f}%<extra></extra>"
1548
+ )
1549
+ )
1550
+ max_abs = float(np.nanmax(np.abs(show_df["DisplaySlope"].to_numpy()))) if len(show_df) else 1.0
1551
+ max_abs = max(max_abs, 1.0)
1552
+ fig.update_layout(
1553
+ template=PLOTLY_TEMPLATE,
1554
+ height=520,
1555
+ margin=dict(l=8, r=8, t=8, b=8),
1556
+ xaxis_title="5-Yr Momentum Slope (x100)",
1557
+ yaxis_title="",
1558
+ plot_bgcolor="white",
1559
+ paper_bgcolor="white",
1560
+ legend=dict(orientation="h", yanchor="bottom", y=-0.22, xanchor="center", x=0.5),
1561
+ barmode="relative",
1562
+ )
1563
+ fig.update_xaxes(
1564
+ range=[-max_abs * 1.15, max_abs * 1.15],
1565
+ showgrid=True, gridcolor="#EDF2F4",
1566
+ showline=True, linecolor="#D6DEE0",
1567
+ zeroline=True, zerolinecolor="#C9D3D6",
1568
+ )
1569
+ fig.update_yaxes(showgrid=False)
1570
+ st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False})
1571
+
1572
+ with r3b:
1573
+ # -------------------------------------------------------
1574
+ # STEP 10: Heatmap — Intended Beneficiaries × Increasing Aspects
1575
+ # Filter: top 10 categories with bill count > K threshold
1576
+ # Remove empty/very low-density cells
1577
+ # Conditional formatting: annotate each cell with actual bill count
1578
+ # -------------------------------------------------------
1579
+ st.markdown("### Beneficiaries × Increasing Aspects")
1580
+
1581
+ hc1, hc2 = st.columns(2)
1582
+ with hc1:
1583
+ cat_bill_thresh = st.slider(
1584
+ "Min Bills per Category", min_value=5, max_value=100, value=20, step=5,
1585
+ help="Only include categories with at least this many unique bills"
1586
+ )
1587
+ with hc2:
1588
+ min_cell_thresh = st.slider(
1589
+ "Min Bills per Cell", min_value=1, max_value=15, value=2, step=1,
1590
+ help="Remove cells with fewer than this many bills (sparse cell filter)"
1591
+ )
1592
+ topN_benef = st.slider("Top N Beneficiaries", min_value=5, max_value=25, value=10, step=1)
1593
+
1594
+ if not safe_col(df_f, BENEF_COL) or not safe_col(df_f, INC_COL):
1595
+ st.caption("Beneficiary or Increasing Aspects data not available.")
1596
+ else:
1597
+ # STEP 10a: Top 10 categories, filtered by bill count > K threshold
1598
+ cat_counts = df_f.groupby(CAT_MAIN)[BILL_ID_COL].nunique()
1599
+ eligible_cats = cat_counts[cat_counts >= cat_bill_thresh].sort_values(ascending=False)
1600
+
1601
+ if eligible_cats.empty:
1602
+ st.caption(f"No categories have ≥ {cat_bill_thresh} bills. Try lowering the threshold.")
1603
+ else:
1604
+ top10_cats = eligible_cats.head(10).index.tolist()
1605
+ df_heat = df_f[df_f[CAT_MAIN].isin(top10_cats)].copy()
1606
+
1607
+ # Explode beneficiaries
1608
+ df_heat["_benef"] = df_heat[BENEF_COL].apply(_split_listlike)
1609
+ df_heat = df_heat.explode("_benef").dropna(subset=["_benef"])
1610
+ df_heat["_benef"] = df_heat["_benef"].astype(str).str.strip()
1611
+ df_heat = df_heat[df_heat["_benef"].str.len() > 0]
1612
+
1613
+ # Explode increasing aspects and clean
1614
+ df_heat["_inc"] = df_heat[INC_COL].apply(_split_listlike)
1615
+ df_heat = df_heat.explode("_inc").dropna(subset=["_inc"])
1616
+ df_heat["_inc"] = df_heat["_inc"].astype(str).str.strip().str.lower()
1617
+ df_heat["_inc"] = df_heat["_inc"].str.replace(r"[^a-z0-9\s\-]", "", regex=True).str.strip()
1618
+ df_heat = df_heat[df_heat["_inc"].str.len() >= 3]
1619
+ df_heat = df_heat[~df_heat["_inc"].isin(STOPWORDS)]
1620
+
1621
+ if df_heat.empty:
1622
+ st.caption("No usable beneficiary × increasing aspects data.")
1623
+ else:
1624
+ # Keep top N beneficiaries and top 15 increasing aspect terms
1625
+ top_benef = df_heat["_benef"].value_counts().head(topN_benef).index.tolist()
1626
+ top_inc = df_heat["_inc"].value_counts().head(15).index.tolist()
1627
+
1628
+ df_heat = df_heat[
1629
+ df_heat["_benef"].isin(top_benef) &
1630
+ df_heat["_inc"].isin(top_inc)
1631
+ ].copy()
1632
+
1633
+ benef_heat = (
1634
+ df_heat.groupby(["_benef", "_inc"])[BILL_ID_COL].nunique()
1635
+ .reset_index(name="bills")
1636
+ )
1637
+
1638
+ # STEP 10b: Remove empty / very low-density cells (< min_cell_thresh)
1639
+ benef_heat = benef_heat[benef_heat["bills"] >= min_cell_thresh]
1640
+
1641
+ if benef_heat.empty:
1642
+ st.caption(f"No cells with ≥ {min_cell_thresh} bills. Try lowering the threshold.")
1643
+ else:
1644
+ pivot = benef_heat.pivot(index="_benef", columns="_inc", values="bills").fillna(0)
1645
+
1646
+ # Sort rows and columns by total density (highest at top/left)
1647
+ pivot = pivot.loc[
1648
+ pivot.sum(axis=1).sort_values(ascending=False).index,
1649
+ pivot.sum(axis=0).sort_values(ascending=False).index
1650
+ ]
1651
+
1652
+ z_actual = pivot.values.astype(float)
1653
+ # STEP 10c: log-scale for color (handles outliers gracefully)
1654
+ z_scaled = np.log1p(z_actual)
1655
+
1656
+ # STEP 10d: Conditional formatting — annotate each cell with actual count
1657
+ # Only show text for cells above the sparse threshold (already filtered)
1658
+ annotations = []
1659
+ for i, row_label in enumerate(pivot.index):
1660
+ for j, col_label in enumerate(pivot.columns):
1661
+ val = int(z_actual[i, j])
1662
+ if val > 0:
1663
+ # White text for dark cells, dark for light cells
1664
+ max_val = z_scaled.max() if z_scaled.max() > 0 else 1
1665
+ brightness = z_scaled[i, j] / max_val
1666
+ font_color = "white" if brightness > 0.55 else C_TRAPPED_DARKNESS
1667
+ annotations.append(
1668
+ dict(
1669
+ x=col_label,
1670
+ y=row_label,
1671
+ text=str(val),
1672
+ showarrow=False,
1673
+ font=dict(color=font_color, size=9),
1674
+ xref="x", yref="y",
1675
+ )
1676
+ )
1677
+
1678
+ fig = go.Figure(data=go.Heatmap(
1679
+ z=z_scaled,
1680
+ x=pivot.columns.astype(str).tolist(),
1681
+ y=pivot.index.astype(str).tolist(),
1682
+ colorscale=[
1683
+ [0.0, "#F2F5F6"],
1684
+ [0.25, "#C8D9DE"],
1685
+ [0.5, "#7FAAB7"],
1686
+ [0.75, "#3D7285"],
1687
+ [1.0, C_LAZURITE_BLUE],
1688
+ ],
1689
+ colorbar=dict(
1690
+ title="log(1+bills)",
1691
+ tickfont=dict(size=10),
1692
+ thickness=12,
1693
+ len=0.8,
1694
+ ),
1695
+ customdata=z_actual,
1696
+ hovertemplate=(
1697
+ "Beneficiary: %{y}<br>"
1698
+ "Aspect: %{x}<br>"
1699
+ "Unique Bills: %{customdata:.0f}<extra></extra>"
1700
+ ),
1701
+ xgap=1,
1702
+ ygap=1,
1703
+ ))
1704
+ fig.update_layout(
1705
+ template=PLOTLY_TEMPLATE,
1706
+ height=max(520, len(pivot.index) * 30 + 120),
1707
+ margin=dict(l=8, r=8, t=8, b=80),
1708
+ xaxis_title="Increasing Aspect",
1709
+ yaxis_title="",
1710
+ plot_bgcolor="white",
1711
+ paper_bgcolor="white",
1712
+ xaxis=dict(tickangle=-40, tickfont=dict(size=10)),
1713
+ yaxis=dict(tickfont=dict(size=10)),
1714
+ annotations=annotations,
1715
+ )
1716
+ st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False})
dockerignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **/.git
2
+ **/__pycache__
3
+ *.pyc
4
+ *.pkl
5
+ *.joblib
6
+ *.pt
7
+ *.bin
8
+ *.zip
9
+ *.tar
10
+ *.gz
11
+ notebooks/
12
+ outputs/
13
+ data/
features_standardized_11_renamed.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edc651cba51bc217650a48a8c4d66d9e329f19711779ad69e851816d057852c2
3
+ size 538177112
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+
2
+ streamlit==1.37.1
3
+ pandas==2.2.2
4
+ numpy==1.26.4
5
+ pyarrow==17.0.0
6
+ plotly==5.23.0
7
+ scikit-learn==1.4.2
8
+
utils.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
import ast
import numpy as np
import pandas as pd

# Column holding each bill's status timestamp; the derived year/month/week
# buckets in load_dataset are all computed from it.
DATE_COL = "status_date"

# Only the columns the dashboard actually uses — loading this subset instead
# of the full file cuts memory a LOT.
NEEDED_COLS = [
    # identity / provenance
    "bill_id",
    "session",
    "chamber",
    "bill_number",
    "status_date",
    # categorization
    "policy_domain_standardized",
    "category_main_label",
    "category_sub_label",
    # intent / direction
    "intent_standardized",
    "policy_direction_classifications",
    # list-like keyword payloads (see KEYWORD_COLS below)
    "category_main_keywords",
    "category_sub_keywords",
    "category_main_llama_summary_keywords",
    "category_sub_llama_summary_keywords",
    # goals / impact
    "legislative_goal_standardized",
    "impact_rating_standardized",
    "impact_rating_score",
]

# Columns whose cells hold list-like keyword data; these are the valid
# `keyword_col` arguments for explode_keywords.
KEYWORD_COLS = [
    "category_main_keywords",
    "category_sub_keywords",
    "category_main_llama_summary_keywords",
    "category_sub_llama_summary_keywords",
]
39
+
40
+
41
+ def _safe_listify(x):
42
+ """Turn list-like cells or strings into list[str]."""
43
+ if x is None:
44
+ return []
45
+ if isinstance(x, float) and np.isnan(x):
46
+ return []
47
+ if isinstance(x, list):
48
+ return [str(i).strip() for i in x if str(i).strip()]
49
+
50
+ s = str(x).strip()
51
+ if not s or s.lower() in {"nan", "none", "null"}:
52
+ return []
53
+
54
+ if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
55
+ try:
56
+ parsed = ast.literal_eval(s)
57
+ if isinstance(parsed, (list, tuple, set)):
58
+ return [str(i).strip() for i in parsed if str(i).strip()]
59
+ except Exception:
60
+ pass
61
+
62
+ parts = re.split(r"[,\|;]\s*", s)
63
+ return [p.strip() for p in parts if p.strip()]
64
+
65
+
66
def load_dataset(path: str) -> pd.DataFrame:
    """Load the bills dataset from a .parquet or .csv file.

    Keeps only NEEDED_COLS (whatever subset exists), parses DATE_COL,
    drops rows with unparseable dates and adds derived 'year', 'month'
    (month-start timestamp) and 'week' (week-start timestamp) columns.

    Raises ValueError for unsupported extensions or when DATE_COL is
    missing from the file.
    """
    lower = path.lower()
    if lower.endswith(".parquet"):
        try:
            # Fast path: read only the needed columns. The previous code
            # materialized the ENTIRE parquet just to list its columns,
            # then read it a second time — very costly on a ~500 MB file.
            df = pd.read_parquet(path, columns=NEEDED_COLS, engine="pyarrow")
        except Exception:
            # Some NEEDED_COLS are absent from this file: fall back to a
            # full read, then keep whatever subset exists. A genuinely
            # unreadable file re-raises from this second read.
            df = pd.read_parquet(path, engine="pyarrow")
            df = df[[c for c in NEEDED_COLS if c in df.columns]]
    elif lower.endswith(".csv"):
        # For CSV we can't cheaply inspect the header first; try usecols
        # and fall back to a full read + subset when columns are missing.
        try:
            df = pd.read_csv(path, usecols=NEEDED_COLS)
        except Exception:
            df = pd.read_csv(path)
            df = df[[c for c in NEEDED_COLS if c in df.columns]]
    else:
        raise ValueError("Supported formats: .parquet or .csv")

    if DATE_COL not in df.columns:
        raise ValueError(f"Expected a date column named '{DATE_COL}'")

    # Coerce bad dates to NaT and drop those rows before deriving buckets.
    df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
    df = df[df[DATE_COL].notna()].copy()

    df["year"] = df[DATE_COL].dt.year
    df["month"] = df[DATE_COL].dt.to_period("M").dt.to_timestamp()
    df["week"] = df[DATE_COL].dt.to_period("W").dt.start_time

    return df
92
+
93
+
94
+
95
def apply_filters(
    df: pd.DataFrame,
    date_min=None,
    date_max=None,
    sessions=None,
    chambers=None,
    policy_domains=None,
    category_main=None,
    category_sub=None,
    intents=None,
    policy_dirs=None,
):
    """Return a filtered copy of `df`.

    Date bounds are inclusive and compared against 'status_date'. Each
    membership filter is skipped when it is falsy, contains "All", or its
    column is absent from `df` — previously a missing column (possible via
    load_dataset's CSV fallback) raised KeyError.
    """
    out = df.copy()

    if date_min is not None:
        out = out[out["status_date"] >= pd.to_datetime(date_min)]
    if date_max is not None:
        out = out[out["status_date"] <= pd.to_datetime(date_max)]

    # (column, selected values) pairs; applied uniformly below.
    membership_filters = [
        ("session", sessions),
        ("chamber", chambers),
        ("policy_domain_standardized", policy_domains),
        ("category_main_label", category_main),
        ("category_sub_label", category_sub),
        ("intent_standardized", intents),
        ("policy_direction_classifications", policy_dirs),
    ]
    for col, values in membership_filters:
        # "All" acts as a sentinel meaning "no restriction".
        if values and "All" not in values and col in out.columns:
            out = out[out[col].isin(values)]

    return out
128
+
129
+
130
def explode_keywords(df: pd.DataFrame, keyword_col: str) -> pd.DataFrame:
    """Expand `keyword_col` into a long-format frame, one row per keyword.

    Output keeps the available context columns plus:
      - 'keyword': the raw keyword, stripped;
      - 'keyword_norm': lowercased, whitespace-collapsed, restricted to
        [a-z0-9 -_/], minimum length 3.
    """
    context_cols = [
        "bill_id",
        "status_date",
        "month",
        "week",
        "session",
        "chamber",
        "policy_domain_standardized",
        "category_main_label",
        "category_sub_label",
        "intent_standardized",
        "policy_direction_classifications",
        keyword_col,
    ]
    present = [c for c in context_cols if c in df.columns]

    long_df = df[present].copy()
    # One row per keyword; cells are parsed into lists first.
    long_df["keyword"] = long_df[keyword_col].apply(_safe_listify)
    long_df = long_df.explode("keyword", ignore_index=True)

    long_df["keyword"] = long_df["keyword"].astype(str).str.strip()
    # Drop empties and the literal "nan" that astype(str) produces for NaN.
    keep = (long_df["keyword"] != "") & (long_df["keyword"].str.lower() != "nan")
    long_df = long_df[keep]

    normalized = (
        long_df["keyword"]
        .str.lower()
        .str.replace(r"\s+", " ", regex=True)
        .str.replace(r"[^a-z0-9 \-_/]", "", regex=True)
        .str.strip()
    )
    long_df["keyword_norm"] = normalized

    # Very short normalized tokens carry no signal; require length >= 3.
    return long_df[normalized.str.len() >= 3]
165
+
166
+
167
def keyword_trends(df_long: pd.DataFrame, time_grain="month", top_n=15):
    """Compute overall top keywords and their time series.

    Returns (top, ts):
      - top: top-N keywords by total mention count;
      - ts: per-bucket mention counts for those keywords, sorted by bucket
        ascending then mentions descending. The bucket is 'month' when
        time_grain == "month", otherwise 'week'.
    """
    bucket = "week" if time_grain != "month" else "month"

    # Overall counts, largest first, truncated to the requested top-N.
    overall = df_long.groupby("keyword_norm").size().reset_index(name="count")
    top = overall.sort_values("count", ascending=False).head(top_n)

    selected_keywords = set(top["keyword_norm"])
    subset = df_long[df_long["keyword_norm"].isin(selected_keywords)]

    per_bucket = subset.groupby([bucket, "keyword_norm"]).size()
    ts = (
        per_bucket.reset_index(name="mentions")
        .sort_values([bucket, "mentions"], ascending=[True, False])
    )

    return top, ts