ralate2 commited on
Commit
febe6ca
·
verified ·
1 Parent(s): 8939bec

Upload 7 files

Browse files
Files changed (8) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +15 -0
  3. README.md +12 -0
  4. app.py +1716 -0
  5. dockerignore +13 -0
  6. features_standardized_11_renamed.parquet +3 -0
  7. requirements.txt +8 -0
  8. utils.py +188 -0
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ features_standardized_11_renamed.parquet filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# curl is installed for container healthchecks; apt lists are removed to keep the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so source-only edits do not invalidate the pip cache layer.
COPY requirements.txt .
RUN pip install --no-cache-dir --prefer-binary -r requirements.txt

COPY . .

# Default Streamlit port; the CMD below honors $PORT when the platform injects one.
EXPOSE 7860

# STREAMLIT_SERVER_PORT is cleared so only the --server.port flag controls the bind port.
CMD ["sh", "-c", "export STREAMLIT_SERVER_PORT='' && PORT_TO_USE=${PORT:-7860} && echo PORT_TO_USE=$PORT_TO_USE && streamlit run app.py --server.address=0.0.0.0 --server.port=$PORT_TO_USE --server.headless=true --server.enableCORS=false --server.enableXsrfProtection=false --server.runOnSave=false --server.fileWatcherType=none"]
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Legislation Dashboard
3
+ emoji: 📈
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 7860
8
+ ---
9
+
10
+ # Legislative Trends Dashboard
11
+
12
+ Upload your parquet or CSV file to visualize legislative trends.
app.py ADDED
@@ -0,0 +1,1716 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import numpy as np
4
+ import pandas as pd
5
+ import streamlit as st
6
+ import plotly.express as px
7
+ import plotly.graph_objects as go
8
+ from scipy import stats
9
+
10
+ # Optional (legacy TF-IDF import kept harmlessly)
11
+ try:
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ _HAS_SK = True
14
+ except Exception:
15
+ _HAS_SK = False
16
+
17
+ # -----------------------------
18
+ # Page config
19
+ # -----------------------------
20
+ st.set_page_config(
21
+ page_title="Legislative Trends Dashboard",
22
+ layout="wide",
23
+ initial_sidebar_state="collapsed",
24
+ )
25
+
26
+ # -----------------------------
27
+ # Palette
28
+ # -----------------------------
29
+ C_TRAPPED_DARKNESS = "#0F1F38"
30
+ C_CEDAR_PLANK = "#8E7970"
31
+ C_PUMPING_SPICE = "#F55449" # negative
32
+ C_LAZURITE_BLUE = "#1B4B5A" # positive
33
+ C_POSITIVE = C_LAZURITE_BLUE
34
+ C_NEGATIVE = C_PUMPING_SPICE
35
+ C_STABLE = C_CEDAR_PLANK
36
+
37
+ PLOTLY_TEMPLATE = "plotly_white"
38
+
39
+ DEFAULT_CANDIDATES = [
40
+ "features_standardized_11_renamed.parquet",
41
+ ]
42
+
43
+ # Full data range (all years available for baseline)
44
+ DATA_START_FULL = pd.to_datetime("2019-01-09").date()
45
+ DATA_END_FULL = pd.to_datetime("2026-02-06").date()
46
+
47
+ # Display/filter range
48
+ DATA_START = pd.to_datetime("2019-01-09").date()
49
+ DATA_END = pd.to_datetime("2026-02-06").date()
50
+
51
+ _SPLIT_RE = re.compile(r"[,\|;/\n\t]+")
52
+
53
+ STOPWORDS = {
54
+ "bill", "bills", "act", "acts", "amend", "amends", "amended", "amendment", "amendments",
55
+ "illinois", "state", "code", "section", "sections", "law", "laws", "new", "provide", "provides",
56
+ "making", "make", "made", "relating", "regarding", "including", "include", "includes", "within",
57
+ "existing", "technical", "resolution", "resolutions", "effective", "date", "public",
58
+ "department", "agency", "program", "programs", "general", "shall", "may", "must", "also",
59
+ "one", "two", "three", "per", "use", "used", "using", "would", "could", "can", "like",
60
+ "not", "no", "yes", "etc", "among", "upon", "require", "requires", "required", "requirement",
61
+ "establish", "establishes", "established", "create", "creates", "created", "implementation",
62
+ "board", "boards", "commission", "commissions", "report", "reports", "reporting",
63
+ "information", "data", "system", "systems", "process", "processes", "administration",
64
+ "student", "students", "education", "educational", "school", "schools",
65
+ "support", "and", "the", "for", "with", "that", "this", "from", "have", "has", "had",
66
+ "be", "been", "being", "are", "is", "was", "were", "will", "would", "should", "could",
67
+ "may", "might", "must", "can", "shall", "need", "needs", "needed", "such", "other",
68
+ "any", "all", "each", "some", "more", "most", "than", "into", "through", "between",
69
+ "under", "over", "about", "against", "during", "after", "before", "above", "below",
70
+ "up", "down", "in", "out", "on", "off", "to", "at", "by", "of", "as", "or", "but", "if",
71
+ "when", "where", "why", "how", "which", "who", "whom", "whose", "what", "whether",
72
+ "there", "their", "they", "them", "these", "those", "then", "than", "only", "just",
73
+ "both", "either", "neither", "nor", "so", "too", "very", "even", "also", "however",
74
+ "therefore", "thus", "hence", "accordingly", "consequently", "furthermore", "moreover",
75
+ "nevertheless", "nonetheless", "otherwise", "rather", "instead", "yet", "still",
76
+ "already", "always", "never", "ever", "often", "sometimes", "usually", "generally",
77
+ "specifically", "particularly", "especially", "mainly", "mostly", "largely",
78
+ "context", "establishment", "legislative", "promoting", "justice", "human", "rights", "protections"
79
+ }
80
+
81
+ GENERIC_PHRASES = {
82
+ "effective date", "public act", "existing law", "state code", "general assembly",
83
+ "relating to", "regarding", "provide that", "provides that", "amend the", "amends the",
84
+ "this act", "the act", "state agency", "support and", "and context", "context establishment",
85
+ "legislative support", "promoting justice", "justice and", "and human", "human rights",
86
+ "rights protections", "and human rights", "justice and human", "human rights protections",
87
+ "support and context", "and context establishment", "legislative support and"
88
+ }
89
+
90
+ TFIDF_BLOCK_WORDS = {
91
+ "likely", "promote", "promotes", "promoting", "desire", "desires",
92
+ "aim", "aims", "without", "specific", "etc", "mentions", "mention",
93
+ "mentioned", "provided", "provides", "appears", "suggests", "suggest",
94
+ "driven", "purpose", "express", "referred", "uses", "use", "introduce",
95
+ "introduced", "unclear", "behind", "text", "motivation", "intent", "strategy"
96
+ }
97
+
98
+ TFIDF_BLOCK_PHRASES = {
99
+ "does specific", "provided text", "mentioned provided text", "appears procedural"
100
+ }
101
+
102
+ # -----------------------------
103
+ # CSS
104
+ # -----------------------------
105
+ st.markdown(
106
+ f"""
107
+ <style>
108
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap');
109
+
110
+ .block-container {{
111
+ padding-top: 1.2rem !important;
112
+ padding-bottom: 0.35rem !important;
113
+ padding-left: 0.8rem !important;
114
+ padding-right: 0.8rem !important;
115
+ }}
116
+
117
+ /* Hide Streamlit top decoration bar that clips content */
118
+ header[data-testid="stHeader"] {{
119
+ background: transparent !important;
120
+ height: 0rem !important;
121
+ }}
122
+ [data-testid="stToolbar"] {{
123
+ display: none !important;
124
+ }}
125
+ .main {{ background-color: #EEF2F3; }}
126
+ html, body, [class*="css"] {{ font-family: 'Inter', sans-serif; }}
127
+
128
+ .header-wrap {{
129
+ background: linear-gradient(90deg, {C_TRAPPED_DARKNESS} 0%, {C_LAZURITE_BLUE} 60%, {C_CEDAR_PLANK} 100%);
130
+ padding: 10px 14px;
131
+ border-radius: 12px;
132
+ margin: 6px 0 10px 0;
133
+ box-shadow: 0 2px 8px rgba(0,0,0,0.10);
134
+ }}
135
+ .header-title {{
136
+ color: #ffffff;
137
+ font-weight: 800;
138
+ font-size: 20px;
139
+ margin: 0;
140
+ line-height: 1.1;
141
+ }}
142
+ .header-sub {{
143
+ color: rgba(255,255,255,0.88);
144
+ font-size: 12px;
145
+ margin-top: 2px;
146
+ line-height: 1.2;
147
+ }}
148
+
149
+ .kpi-grid {{
150
+ display: grid;
151
+ grid-template-columns: 1.0fr 1.0fr 1.0fr 1.0fr 1.0fr;
152
+ gap: 10px;
153
+ margin-bottom: 10px;
154
+ }}
155
+ .kpi-card {{
156
+ background: #ffffff;
157
+ border: 1px solid #D6DEE0;
158
+ border-radius: 12px;
159
+ padding: 10px 12px;
160
+ box-shadow: 0 1px 6px rgba(0,0,0,0.06);
161
+ }}
162
+ .kpi-label {{
163
+ font-size: 11.5px;
164
+ font-weight: 650;
165
+ color: #5b6b71;
166
+ margin-bottom: 6px;
167
+ text-transform: uppercase;
168
+ letter-spacing: 0.2px;
169
+ }}
170
+ .kpi-value {{
171
+ font-size: 24px;
172
+ font-weight: 800;
173
+ color: {C_TRAPPED_DARKNESS};
174
+ line-height: 1.05;
175
+ }}
176
+
177
+ .filter-row {{
178
+ background:#ffffff;
179
+ border: 1px solid #D6DEE0;
180
+ border-radius: 12px;
181
+ padding: 8px 10px;
182
+ box-shadow: 0 1px 6px rgba(0,0,0,0.08);
183
+ margin-bottom: 10px;
184
+ }}
185
+
186
+ div[data-testid="stVerticalBlock"] > div {{ gap: 0.35rem; }}
187
+ </style>
188
+ """,
189
+ unsafe_allow_html=True,
190
+ )
191
+
192
+ # -----------------------------
193
+ # Helpers
194
+ # -----------------------------
195
+ def _find_first_existing(paths):
196
+ for p in paths:
197
+ if os.path.exists(p):
198
+ return p
199
+ return None
200
+
201
+
202
def load_dataset(path: str) -> pd.DataFrame:
    """Load a parquet or CSV file into a DataFrame.

    Parameters
    ----------
    path : str
        File path; the extension (case-insensitive) selects the reader.

    Raises
    ------
    ValueError
        If the extension is neither .parquet nor .csv.  The message now
        includes the offending path so upload failures are actionable
        (the original message was just "Unsupported file type").
    """
    lower = path.lower()
    if lower.endswith(".parquet"):
        return pd.read_parquet(path)
    if lower.endswith(".csv"):
        return pd.read_csv(path)
    raise ValueError(f"Unsupported file type: {path!r} (expected .parquet or .csv)")
208
+
209
+
210
def ensure_datetime(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Coerce `col` to datetime (invalid values become NaT) and drop unparseable rows.

    Works on a copy; the caller's frame is left untouched.
    """
    out = df.copy()
    out[col] = pd.to_datetime(out[col], errors="coerce")
    return out.dropna(subset=[col])
214
+
215
+
216
def add_time_grains(df: pd.DataFrame, date_col: str) -> pd.DataFrame:
    """Attach derived period columns: month ("YYYY-MM"), ISO week ("YYYY-Www"),
    calendar_month (1-12, used for the seasonal baseline), and year.

    Returns a copy; the input frame is not modified.
    """
    out = df.copy()
    dates = out[date_col]
    iso = dates.dt.isocalendar()
    out["month"] = dates.dt.to_period("M").astype(str)
    out["week"] = iso["year"].astype(str) + "-W" + iso["week"].astype(str).str.zfill(2)
    out["calendar_month"] = dates.dt.month
    out["year"] = dates.dt.year
    return out
225
+
226
+
227
def pct(n, d):
    """Percentage n/d rounded to one decimal place; 0.0 when the denominator is zero."""
    if d == 0:
        return 0.0
    return round(n / d * 100.0, 1)
229
+
230
+
231
+ def _split_listlike(x):
232
+ if pd.isna(x):
233
+ return []
234
+ if isinstance(x, list):
235
+ parts = [str(i) for i in x]
236
+ elif isinstance(x, str):
237
+ parts = [p.strip() for p in _SPLIT_RE.split(x) if p.strip()]
238
+ else:
239
+ parts = [str(x).strip()]
240
+ return [p for p in parts if p]
241
+
242
+
243
def safe_col(df, col):
    """True when `col` exists in `df` and holds at least one non-null value."""
    if col not in df.columns:
        return False
    return bool(df[col].notna().any())
245
+
246
+
247
def tight_layout(fig, height=360):
    """Apply the dashboard's compact white layout and shared grid styling to `fig`.

    Mutates and returns the same figure for call chaining.
    """
    axis_style = dict(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0")
    fig.update_layout(
        template=PLOTLY_TEMPLATE,
        height=height,
        margin=dict(l=8, r=8, t=8, b=8),
        plot_bgcolor="white",
        paper_bgcolor="white",
    )
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)
    return fig
258
+
259
+
260
def build_full_period_order(start_date, end_date, grain: str):
    """Ordered list of every period label between two dates.

    grain == "month" -> ["YYYY-MM", ...]; any other grain -> ISO week labels
    ("YYYY-Www") sampled on Mondays, falling back to the range's first day
    when no Monday falls inside it.
    """
    start, end = pd.to_datetime(start_date), pd.to_datetime(end_date)
    if grain == "month":
        return [str(p) for p in pd.period_range(start=start, end=end, freq="M")]
    mondays = pd.date_range(start=start, end=end, freq="W-MON")
    if len(mondays) == 0:
        # Degenerate range without a Monday: label the first day's ISO week.
        mondays = pd.date_range(start=start, end=end, freq="D")[:1]
    iso = mondays.isocalendar()
    labels = iso["year"].astype(str) + "-W" + iso["week"].astype(str).str.zfill(2)
    return labels.tolist()
270
+
271
+
272
def explode_terms(df: pd.DataFrame, col: str, stopwords=None, min_len=3):
    """Explode a list-like text column into one normalized term per row.

    Terms are lowercased, stripped of characters outside [a-z0-9 space -],
    filtered by `min_len` and `stopwords`, and tagged with a unit
    `mentions` column for later aggregation.  Returns an empty frame when
    `col` is absent.
    """
    if col not in df.columns:
        return pd.DataFrame()
    blocked = stopwords or set()
    exploded = df.copy()
    exploded["_term"] = exploded[col].apply(_split_listlike)
    exploded = exploded.explode("_term").dropna(subset=["_term"])
    cleaned = (
        exploded["_term"].astype(str).str.strip().str.lower()
        .str.replace(r"[^a-z0-9\s\-]", "", regex=True)
        .str.strip()
    )
    exploded["term"] = cleaned
    exploded = exploded[exploded["term"].str.len() >= min_len]
    exploded = exploded[~exploded["term"].isin(blocked)]
    exploded["mentions"] = 1
    return exploded.drop(columns=["_term"], errors="ignore")
286
+
287
+
288
+ # ---------- TF-IDF (contrastive) ----------
289
+ def _clean_text_for_tfidf(t: str) -> str:
290
+ t = str(t).lower()
291
+ t = re.sub(r'http\S+|www\.\S+', '', t)
292
+ t = re.sub(r"[^a-z0-9\s\-]", " ", t)
293
+ t = re.sub(r"\s+", " ", t).strip()
294
+ return t
295
+
296
+
297
def _term_is_bad(term: str) -> bool:
    """Heuristic filter for TF-IDF n-grams: True means 'drop this term'.

    Rejects empty terms, known generic/blocked phrases, n-grams outside
    2-3 tokens, terms containing blocked or boilerplate words, and terms
    dominated by stopwords.
    """
    term = term.strip().lower()
    if not term:
        return True
    if term in GENERIC_PHRASES:
        return True
    if term in TFIDF_BLOCK_PHRASES:
        return True
    words = term.split()
    # Only bigrams and trigrams are meaningful here.
    if not 2 <= len(words) <= 3:
        return True
    if any(w in TFIDF_BLOCK_WORDS for w in words):
        return True
    if all((w in STOPWORDS or len(w) < 3) for w in words):
        return True
    generic_words = {"relating", "regarding", "provide", "provides", "amend", "amends", "section", "subsection"}
    if any(w in generic_words for w in words):
        return True
    # Mostly-stopword phrases carry no signal.
    stop_hits = sum(1 for w in words if w in STOPWORDS)
    if len(words) > 1 and stop_hits / len(words) > 0.5:
        return True
    if len(words) == 2 and any(w in ["state", "bill", "act", "law"] for w in words):
        return True
    return False
321
+
322
+
323
def _bill_docs(df_slice: pd.DataFrame, bill_id_col: str, text_col: str):
    """One cleaned text document per bill (all of a bill's rows concatenated).

    Rows whose cleaned text is 15 characters or shorter are discarded;
    returns [] when nothing usable remains.
    """
    subset = df_slice[[bill_id_col, text_col]].dropna().copy()
    subset[text_col] = subset[text_col].astype(str).map(_clean_text_for_tfidf)
    subset = subset[subset[text_col].str.len() > 15]
    if subset.empty:
        return []
    grouped = subset.groupby(bill_id_col)[text_col].apply(lambda s: " ".join(s.tolist()))
    return [doc for doc in grouped.tolist() if doc.strip()]
332
+
333
+
334
def build_contrastive_tfidf(df_cat: pd.DataFrame, df_rest: pd.DataFrame, bill_id_col: str, text_col: str, top_k=15):
    """Distinctive 2-3 word phrases for one category versus everything else.

    Fits a single TF-IDF space over per-bill documents from both slices,
    ranks terms by (mean TF-IDF in category) - (mean TF-IDF in rest), then
    drops low-contrast, blocked, and near-duplicate phrases.  Returns up to
    `top_k` (term, contrast) pairs; [] when sklearn is unavailable or either
    side has fewer than two documents.
    """
    if not _HAS_SK:
        return []
    docs_cat = _bill_docs(df_cat, bill_id_col, text_col)
    docs_rest = _bill_docs(df_rest, bill_id_col, text_col)
    if len(docs_cat) < 2 or len(docs_rest) < 2:
        return []
    vec = TfidfVectorizer(
        stop_words=list(STOPWORDS),
        ngram_range=(2, 3),
        min_df=2,
        max_df=0.35,
        sublinear_tf=True,
        norm="l2",
        token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z\-]{3,}\b",
        max_features=2000
    )
    try:
        X = vec.fit_transform(docs_cat + docs_rest)
    except Exception:
        return []
    terms = np.array(vec.get_feature_names_out())
    if len(terms) == 0:
        return []
    # Mean TF-IDF per term on each side; positive contrast = category-specific.
    mean_cat = np.asarray(X[:len(docs_cat)].mean(axis=0)).ravel()
    mean_rest = np.asarray(X[len(docs_cat):].mean(axis=0)).ravel()
    contrast = mean_cat - mean_rest

    picked = []
    picked_word_sets = []
    min_contrast = 0.01
    for i in np.argsort(contrast)[::-1]:
        if len(picked) >= top_k:
            break
        if contrast[i] <= min_contrast:
            break  # terms are in descending order: everything after is weaker
        candidate = terms[i]
        if _term_is_bad(candidate):
            continue
        words = set(candidate.split())
        # Skip phrases sharing nearly all their words with one already picked.
        if any(len(words & prior) >= max(2, len(words) - 1) for prior in picked_word_sets):
            continue
        picked_word_sets.append(words)
        picked.append((candidate, round(float(contrast[i]), 4)))
    return picked
387
+
388
+
389
+ # ---------- Direction of change ----------
390
def classify_direction(sdf: pd.DataFrame, period_col: str, period_order: list, bill_col: str):
    """Classify a unique-bill time series as Rising / Declining / Stable.

    Returns (label, slope) where slope is the least-squares linear slope of
    bill counts across `period_order`; |slope| <= 0.10 counts as Stable.
    Empty or all-zero series are Stable with slope 0.0.
    """
    if sdf.empty:
        return ("Stable", 0.0)
    counts = (
        sdf.groupby(period_col)[bill_col].nunique()
        .reindex(period_order, fill_value=0)
        .to_numpy(dtype=float)
    )
    if counts.sum() == 0 or len(counts) < 2:
        return ("Stable", 0.0)
    slope = float(np.polyfit(np.arange(len(counts), dtype=float), counts, 1)[0])
    threshold = 0.10
    if slope > threshold:
        return ("Rising", slope)
    if slope < -threshold:
        return ("Declining", slope)
    return ("Stable", slope)
406
+
407
+
408
def short_list(items, n=3):
    """Comma-join the first `n` truthy items, appending '...' if truncated; '-' when empty."""
    kept = [item for item in items if item]
    if not kept:
        return "-"
    suffix = "..." if len(kept) > n else ""
    return ", ".join(kept[:n]) + suffix
413
+
414
+
415
+ # =====================================================
416
+ # STEP 1-8: Category Share Baseline & Z-Score Engine
417
+ # =====================================================
418
+
419
def compute_monthly_share_series(df_all: pd.DataFrame, category: str,
                                 cat_col: str, bill_id_col: str, date_col: str) -> pd.DataFrame:
    """STEP 1: monthly category share = unique bills in category / unique bills total.

    Computed over the full dataset (all years) so the seasonal baseline has
    maximum history.  Returns a frame sorted by month with columns
    [_ym, total_bills, cat_bills, share, year, calendar_month, period_str];
    share is a percentage in [0, 100].
    """
    work = df_all.copy()
    work["_ym"] = work[date_col].dt.to_period("M")
    work["_year"] = work[date_col].dt.year
    work["_cal_month"] = work[date_col].dt.month

    total_by_month = work.groupby("_ym")[bill_id_col].nunique().reset_index(name="total_bills")
    in_category = work[work[cat_col].astype(str) == str(category)]
    cat_by_month = in_category.groupby("_ym")[bill_id_col].nunique().reset_index(name="cat_bills")

    merged = pd.merge(total_by_month, cat_by_month, on="_ym", how="left").fillna(0)
    # Guard against 0-total months producing inf shares.
    merged["share"] = (merged["cat_bills"] / merged["total_bills"] * 100.0).replace([np.inf, -np.inf], 0).fillna(0)
    merged["year"] = merged["_ym"].dt.year
    merged["calendar_month"] = merged["_ym"].dt.month
    merged["period_str"] = merged["_ym"].astype(str)
    return merged.sort_values("_ym").reset_index(drop=True)
446
+
447
+
448
def compute_seasonal_baseline(share_series: pd.DataFrame, lookback_years: int = 5) -> pd.DataFrame:
    """STEP 2: per-calendar-month mean and std of share over the trailing window.

    The window keeps years strictly greater than (latest year - lookback_years),
    i.e. the most recent `lookback_years` years inclusive of the latest.
    std is 0.0 where only a single observation exists.
    """
    newest = share_series["year"].max()
    window = share_series[share_series["year"] > newest - lookback_years].copy()
    baseline = (
        window.groupby("calendar_month")["share"]
        .agg(mean_share="mean", std_share="std")
        .reset_index()
    )
    baseline["std_share"] = baseline["std_share"].fillna(0.0)
    return baseline
463
+
464
+
465
def compute_regression_on_share(share_series: pd.DataFrame,
                                total_monthly: pd.DataFrame) -> dict:
    """STEP 4 & 5: OLS trend of the category share and of total bill volume.

    Returns a dict with cat_slope / cat_pvalue / cat_intercept, total_slope,
    and significant_growth (True when p < 0.05 AND the category slope
    exceeds the total-volume slope).  Fewer than 3 share points yields the
    all-zero defaults with significant_growth False.
    """
    results = {"cat_slope": 0.0, "cat_pvalue": 1.0, "cat_intercept": 0.0,
               "total_slope": 0.0, "significant_growth": False}
    shares = share_series["share"].values
    if len(shares) < 3:
        return results
    try:
        fit = stats.linregress(np.arange(len(shares), dtype=float), shares)
        results["cat_slope"] = float(fit.slope)
        results["cat_intercept"] = float(fit.intercept)
        results["cat_pvalue"] = float(fit.pvalue)
    except Exception:
        pass  # keep neutral defaults on numerical failure
    # Trend of overall volume, used as the growth benchmark.
    if total_monthly is not None and len(total_monthly) >= 3:
        try:
            totals = total_monthly["total_bills"].values.astype(float)
            total_fit = stats.linregress(np.arange(len(totals), dtype=float), totals)
            results["total_slope"] = float(total_fit.slope)
        except Exception:
            pass
    # STEP 5: structural growth = statistically significant AND outpacing totals.
    results["significant_growth"] = (
        results["cat_pvalue"] < 0.05 and results["cat_slope"] > results["total_slope"]
    )
    return results
498
+
499
+
500
def compute_zscore_and_residuals(share_series: pd.DataFrame,
                                 baseline: pd.DataFrame,
                                 reg_stats: dict) -> pd.DataFrame:
    """STEP 3, 6 & 7: seasonal Z-scores, regression residuals, anomaly flags.

    Joins the seasonal baseline on calendar_month, then adds: z_score
    (share vs. seasonal mean/std; 0 where std is 0), predicted_share and
    residual (from the fitted regression), a ±2-residual-SD band
    (upper_2sd / lower_2sd), seasonal ±2 SD thresholds, and an `anomaly`
    label in {"High", "Low", "Normal"}.
    """
    merged = share_series.merge(baseline, on="calendar_month", how="left")

    # STEP 3: standardized deviation from the seasonal mean.
    merged["z_score"] = np.where(
        merged["std_share"] > 0,
        (merged["share"] - merged["mean_share"]) / merged["std_share"],
        0.0
    )

    # STEP 6: regression prediction and residual at each period index.
    idx = np.arange(len(merged), dtype=float)
    merged["predicted_share"] = reg_stats["cat_intercept"] + reg_stats["cat_slope"] * idx
    merged["residual"] = merged["share"] - merged["predicted_share"]

    # Residual spread drives the plotted band; needs >2 points for a std.
    res_std = merged["residual"].std() if len(merged) > 2 else 0.0
    merged["upper_2sd"] = merged["predicted_share"] + 2 * res_std
    merged["lower_2sd"] = merged["predicted_share"] - 2 * res_std

    # STEP 7: anomaly flags against the seasonal baseline ±2 SD.
    merged["upper_thresh"] = merged["mean_share"] + 2 * merged["std_share"]
    merged["lower_thresh"] = merged["mean_share"] - 2 * merged["std_share"]
    merged["anomaly"] = np.where(
        merged["share"] > merged["upper_thresh"], "High",
        np.where(merged["share"] < merged["lower_thresh"], "Low", "Normal")
    )
    return merged
530
+
531
+
532
+
533
+ # =====================================================
534
+ # STEP 9: Subcategory Momentum
535
+ # - Short-term: iterative % change over filtered period
536
+ # - Long-term: 5-year monthly regression slope from df_full
537
+ # =====================================================
538
+
539
def compute_subcategory_momentum(df_cat_filtered: pd.DataFrame,
                                 df_full_cat: pd.DataFrame,
                                 cat_sub_col: str,
                                 bill_id_col: str,
                                 period_col: str,
                                 period_order: list,
                                 date_col: str = "status_date") -> pd.DataFrame:
    """STEP 9: short- and long-term momentum per subcategory.

    - Short-term: mean of period-over-period % change in unique-bill counts
      over the user-filtered window (`period_order`).
    - Long-term: monthly OLS slope of unique-bill counts over the full
      history in `df_full_cat` (the primary "momentum strength" value).

    Fix: the date column used for the long-term monthly series was
    hard-coded to "status_date" even though every other column is a
    parameter; it is now the `date_col` parameter with the same default,
    so existing callers are unaffected.

    Returns a DataFrame [Subcategory, Slope, AvgPctChange] sorted by Slope
    ascending, or an empty DataFrame when inputs are unusable.
    """
    if not safe_col(df_cat_filtered, cat_sub_col):
        return pd.DataFrame()

    # --- Short-term: iterative pct change over the filtered period ---
    df_m = df_cat_filtered.copy()
    df_m[cat_sub_col] = df_m[cat_sub_col].astype(str).str.strip()
    sub_period = (
        df_m.dropna(subset=[cat_sub_col])
        .groupby([period_col, cat_sub_col])[bill_id_col].nunique()
        .reset_index(name="bills")
    )
    subs = sorted(df_m[cat_sub_col].dropna().unique().tolist())
    if len(period_order) < 2 or not subs or sub_period.empty:
        return pd.DataFrame()

    panel_short = (
        sub_period.pivot_table(index=cat_sub_col, columns=period_col, values="bills", aggfunc="sum")
        .reindex(index=subs, columns=period_order, fill_value=0)
    )

    short_term = {}
    for sub in panel_short.index:
        y = panel_short.loc[sub].to_numpy(dtype=float)
        pct_changes = []
        for i in range(1, len(y)):
            prev, curr = y[i - 1], y[i]
            # A zero previous period contributes 0 rather than dividing by zero.
            pct_changes.append((curr - prev) / prev * 100.0 if prev > 0 else 0.0)
        short_term[sub] = float(np.mean(pct_changes)) if pct_changes else 0.0

    # --- Long-term: monthly regression slope from the full dataset ---
    long_term = {}
    if df_full_cat is not None and not df_full_cat.empty and cat_sub_col in df_full_cat.columns:
        df_fl = df_full_cat.copy()
        df_fl[cat_sub_col] = df_fl[cat_sub_col].astype(str).str.strip()
        df_fl["_ym"] = df_fl[date_col].dt.to_period("M")
        full_sub_monthly = (
            df_fl.dropna(subset=[cat_sub_col])
            .groupby(["_ym", cat_sub_col])[bill_id_col].nunique()
            .reset_index(name="bills")
        )
        for sub in subs:
            sub_ts = (
                full_sub_monthly[full_sub_monthly[cat_sub_col] == sub]
                .sort_values("_ym")
            )
            y_full = sub_ts["bills"].to_numpy(dtype=float)
            if len(y_full) >= 3:
                x_full = np.arange(len(y_full), dtype=float)
                try:
                    slope_5yr = float(np.polyfit(x_full, y_full, 1)[0])
                except Exception:
                    slope_5yr = 0.0
            else:
                slope_5yr = 0.0  # too little history to fit a trend
            long_term[sub] = slope_5yr

    mom_rows = [(sub, long_term.get(sub, 0.0), short_term.get(sub, 0.0)) for sub in subs]
    return (
        pd.DataFrame(mom_rows, columns=["Subcategory", "Slope", "AvgPctChange"])
        .sort_values("Slope", ascending=True)
        .reset_index(drop=True)
    )
619
+
620
+
621
+ # =====================================================
622
+ # STEP 8: Category Share chart with baseline band
623
+ # =====================================================
624
+
625
def plot_category_share_with_baseline(analysis_df: pd.DataFrame,
                                      period_order_filter: list,
                                      significant_growth: bool) -> go.Figure:
    """STEP 8: observed share vs. seasonal baseline and regression trend.

    Draws the ±2 SD regression band, the 5-year seasonal mean (dashed), the
    regression trend (dotted), the observed share line, and anomaly markers
    (red = above +2 SD, blue = below -2 SD), restricted to the periods in
    `period_order_filter`.  Returns an empty figure when nothing matches.
    """
    plot_df = analysis_df[analysis_df["period_str"].isin(period_order_filter)].copy()
    if plot_df.empty:
        return go.Figure()

    fig = go.Figure()
    periods = plot_df["period_str"]

    # Shaded ±2 SD band around the regression line (drawn as a closed polygon).
    fig.add_trace(go.Scatter(
        x=list(periods) + list(periods)[::-1],
        y=list(plot_df["upper_2sd"]) + list(plot_df["lower_2sd"])[::-1],
        fill="toself",
        fillcolor="rgba(27,75,90,0.10)",
        line=dict(color="rgba(255,255,255,0)"),
        hoverinfo="skip",
        name="±2 SD Band",
        showlegend=True,
    ))

    # Seasonal 5-year mean baseline (dashed).
    fig.add_trace(go.Scatter(
        x=periods,
        y=plot_df["mean_share"],
        mode="lines",
        name="Seasonal Baseline (5yr mean)",
        line=dict(color="#90B4BE", dash="dash", width=2),
        hovertemplate="Seasonal Mean: %{y:.2f}%<extra></extra>",
    ))

    # Fitted regression trend (dotted).
    fig.add_trace(go.Scatter(
        x=periods,
        y=plot_df["predicted_share"],
        mode="lines",
        name="Regression Trend",
        line=dict(color=C_CEDAR_PLANK, dash="dot", width=1.5),
        hovertemplate="Predicted: %{y:.2f}%<extra></extra>",
    ))

    # Observed monthly share (solid dark line with markers).
    fig.add_trace(go.Scatter(
        x=periods,
        y=plot_df["share"],
        mode="lines+markers",
        name="Observed Share",
        line=dict(color=C_TRAPPED_DARKNESS, width=3),
        marker=dict(color=C_TRAPPED_DARKNESS, size=6),
        hovertemplate="<b>%{x}</b><br>Share: %{y:.2f}%<extra></extra>",
    ))

    # Anomaly markers: one trace per side, added only when that side has points.
    anomaly_specs = [
        ("High", "High Anomaly (>+2 SD)", C_PUMPING_SPICE, "HIGH"),
        ("Low", "Low Anomaly (<-2 SD)", C_LAZURITE_BLUE, "LOW"),
    ]
    for flag, trace_name, color, word in anomaly_specs:
        side = plot_df[plot_df["anomaly"] == flag]
        if side.empty:
            continue
        fig.add_trace(go.Scatter(
            x=side["period_str"],
            y=side["share"],
            mode="markers",
            name=trace_name,
            marker=dict(color=color, size=12, symbol="circle",
                        line=dict(color="white", width=1.5)),
            # Per-point hovertemplate: Z-score text varies per marker.
            hovertemplate="<b>%{x}</b><br>" + word + " anomaly: %{y:.2f}%<br>Z: " +
                          side["z_score"].round(2).astype(str) + "<extra></extra>",
        ))

    badge = " ★ Significant Structural Growth" if significant_growth else ""
    fig.update_layout(
        template=PLOTLY_TEMPLATE,
        height=420,
        margin=dict(l=8, r=8, t=28, b=8),
        hovermode="x unified",
        yaxis_title="Share (%)",
        xaxis_title="",
        plot_bgcolor="white",
        paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=-0.30, xanchor="center", x=0.5, font=dict(size=10)),
        title=dict(text=badge, font=dict(size=11, color=C_PUMPING_SPICE), x=0.5) if badge else {},
    )
    fig.update_xaxes(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0")
    fig.update_yaxes(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0",
                     range=[0, max(5, float(plot_df["share"].max()) * 1.25)])
    return fig
727
+
728
+
729
+ # -----------------------------
730
+ # Load data
731
+ # -----------------------------
732
+ default_path = _find_first_existing(DEFAULT_CANDIDATES)
733
+
734
+ if default_path is None:
735
+ uploaded = st.file_uploader("Upload Dataset", type=["parquet", "csv"])
736
+ else:
737
+ uploaded = None
738
+
739
+ data_path = None
740
+ data_sig = "default"
741
+
742
+ if uploaded:
743
+ tmp_path = f"/tmp/{uploaded.name}"
744
+ with open(tmp_path, "wb") as f:
745
+ f.write(uploaded.getbuffer())
746
+ data_path = tmp_path
747
+ data_sig = f"{uploaded.name}-{uploaded.size}"
748
+ elif default_path:
749
+ data_path = default_path
750
+ try:
751
+ data_sig = f"{default_path}-{os.path.getmtime(default_path)}"
752
+ except Exception:
753
+ data_sig = default_path
754
+
755
+
756
@st.cache_data(show_spinner=True)
def _load_cached(path: str, signature: str) -> pd.DataFrame:
    """Memoized dataset loader.

    `signature` is not read by the loader itself; it participates in the
    Streamlit cache key so a new upload (name/size) or a changed file
    mtime invalidates the cached frame.
    """
    frame = load_dataset(path)
    return frame
759
+
760
+
761
+ df_raw = _load_cached(data_path, data_sig) if data_path else None
762
+ if df_raw is None or df_raw.empty:
763
+ st.warning("Upload a dataset to begin.")
764
+ st.stop()
765
+
766
+ # -----------------------------
767
+ # Columns
768
+ # -----------------------------
769
+ DATE_COL = "status_date"
770
+ BILL_ID_COL = "bill_id"
771
+ CHAMBER_COL = "chamber"
772
+ CAT_MAIN = "category_main_label"
773
+ CAT_SUB = "category_sub_label"
774
+ INC_COL = "increasing_aspects_standardized"
775
+ DEC_COL = "decreasing_aspects_standardized"
776
+ BENEF_COL = "intended_beneficiaries_standardized"
777
+ IMPACT_SCORE_COL = "impact_rating_score"
778
+
779
+ KW_SOURCES = {
780
+ "Motivation": "motivation_raw",
781
+ "Intent": "intent_raw",
782
+ "Legislative Strategy": "legislative_strategy_raw",
783
+ }
784
+
785
+ required = [DATE_COL, BILL_ID_COL, CAT_MAIN]
786
+ missing = [c for c in required if c not in df_raw.columns]
787
+ if missing:
788
+ st.error(f"Missing required columns: {missing}")
789
+ st.stop()
790
+
791
+ # -----------------------------
792
+ # Engineer chamber from bill_number
793
+ # e.g. "HB1234" -> "HB", "SB5678" -> "SB"
794
+ # -----------------------------
795
+ BILL_NUM_COL = "bill_number"
796
+ df_raw = df_raw.copy()
797
+ if CHAMBER_COL not in df_raw.columns or df_raw[CHAMBER_COL].isna().all() or (df_raw[CHAMBER_COL].astype(str).str.strip() == "").all():
798
+ if BILL_NUM_COL in df_raw.columns:
799
+ df_raw[CHAMBER_COL] = (
800
+ df_raw[BILL_NUM_COL]
801
+ .astype(str)
802
+ .str.strip()
803
+ .str.extract(r"^([A-Za-z]+)", expand=False)
804
+ .str.upper()
805
+ .str[:2]
806
+ )
807
+ else:
808
+ df_raw[CHAMBER_COL] = "Unknown"
809
+
810
+ # Full dataset for baseline (all years) — use pd.Timestamp for reliable datetime comparison
811
+ df_full = ensure_datetime(df_raw, DATE_COL)
812
+ df_full = add_time_grains(df_full, DATE_COL)
813
+ df_full = df_full[
814
+ (df_full[DATE_COL] >= pd.Timestamp(DATA_START_FULL)) &
815
+ (df_full[DATE_COL] <= pd.Timestamp(DATA_END_FULL))
816
+ ].copy()
817
+
818
+ # Filtered display dataset — strictly clamped to display range
819
+ df = df_full[
820
+ (df_full[DATE_COL] >= pd.Timestamp(DATA_START)) &
821
+ (df_full[DATE_COL] <= pd.Timestamp(DATA_END))
822
+ ].copy()
823
+ df = df.dropna(subset=[CAT_MAIN]).copy()
824
+
825
+ if df.empty:
826
+ st.warning("No data in the display range (2025-01-08 to 2026-02-06).")
827
+ st.stop()
828
+
829
+ # -----------------------------
830
+ # Header
831
+ # -----------------------------
832
+ st.markdown(
833
+ """
834
+ <div class="header-wrap">
835
+ <div class="header-title">Legislative Trends Dashboard</div>
836
+ <div class="header-sub">Category share • Subcategory drivers • Policy direction • Subcategory momentum • Beneficiary × chamber distribution</div>
837
+ </div>
838
+ """,
839
+ unsafe_allow_html=True,
840
+ )
841
+
842
+ # -----------------------------
843
+ # Filters
844
+ # -----------------------------
845
+ if "clear_filters" not in st.session_state:
846
+ st.session_state.clear_filters = 0
847
+
848
+ min_date = DATA_START
849
+ max_date = DATA_END
850
+ cats_all = sorted(df[CAT_MAIN].dropna().astype(str).unique().tolist())
851
+
852
+ st.markdown("<div class='filter-row'>", unsafe_allow_html=True)
853
+ f1, f2, f3, f4, f5, f6 = st.columns([1.6, 0.9, 1.2, 2.4, 1.3, 0.7])
854
+
855
+ with f1:
856
+ date_range = st.date_input(
857
+ "Date Range",
858
+ value=(min_date, max_date),
859
+ min_value=min_date,
860
+ max_value=max_date,
861
+ key=f"date_{st.session_state.clear_filters}",
862
+ )
863
+
864
+ if isinstance(date_range, tuple) and len(date_range) == 2:
865
+ start_date, end_date = date_range
866
+ else:
867
+ start_date = date_range
868
+ end_date = date_range
869
+
870
+ with f2:
871
+ time_grain = st.radio(
872
+ "Time Grain",
873
+ ["month", "week"],
874
+ horizontal=True,
875
+ key=f"tg_{st.session_state.clear_filters}",
876
+ )
877
+ with f3:
878
+ chambers_all = ["All"] + sorted(df[CHAMBER_COL].dropna().astype(str).unique().tolist())
879
+ chambers = st.multiselect("Chamber", chambers_all, default=["All"], key=f"ch_{st.session_state.clear_filters}")
880
+ with f4:
881
+ selected_cat = st.selectbox(
882
+ "Category",
883
+ cats_all,
884
+ index=0 if cats_all else 0,
885
+ key=f"cat_{st.session_state.clear_filters}",
886
+ )
887
+ with f5:
888
+ sub_time = st.selectbox(
889
+ "Subcategory Window",
890
+ ["Overall", "Last 30 days", "Last 60 days"],
891
+ key=f"subwin_{st.session_state.clear_filters}",
892
+ )
893
+ with f6:
894
+ clear = st.button("CLEAR", use_container_width=True)
895
+
896
+ st.markdown("</div>", unsafe_allow_html=True)
897
+ if clear:
898
+ st.cache_data.clear()
899
+ st.session_state.clear_filters += 1
900
+ st.rerun()
901
+
902
+ # -----------------------------
903
+ # Apply filters
904
+ # -----------------------------
905
+ df_f = df.copy()
906
+ df_f = df_f[(df_f[DATE_COL].dt.date >= start_date) & (df_f[DATE_COL].dt.date <= end_date)]
907
+ if "All" not in chambers:
908
+ df_f = df_f[df_f[CHAMBER_COL].astype(str).isin([str(x) for x in chambers])]
909
+ df_f = df_f.dropna(subset=[CAT_MAIN])
910
+
911
+ if df_f.empty:
912
+ st.warning("No rows match your filters.")
913
+ st.stop()
914
+
915
+ tg = time_grain
916
+ period_col = tg
917
+ period_order = build_full_period_order(start_date, end_date, tg)
918
+
919
+ # -----------------------------
920
+ # KPI row
921
+ # -----------------------------
922
+ total_bills = int(df_f[BILL_ID_COL].nunique())
923
+ num_main = int(df_f[CAT_MAIN].nunique())
924
+
925
+ high_impact_bills = "-"
926
+ impact_tooltip = "High Impact = bills in the top quartile of impact_rating_score, when available."
927
+ if safe_col(df_f, IMPACT_SCORE_COL):
928
+ tmp = df_f[[BILL_ID_COL, IMPACT_SCORE_COL]].dropna()
929
+ if not tmp.empty:
930
+ bill_score = tmp.groupby(BILL_ID_COL)[IMPACT_SCORE_COL].max()
931
+ q75 = float(bill_score.quantile(0.75))
932
+ high_impact_bills = int((bill_score >= q75).sum())
933
+ else:
934
+ high_impact_bills = 0
935
+
936
+ ch_bill_counts = df_f.groupby(df_f[CHAMBER_COL].astype(str))[BILL_ID_COL].nunique()
937
+ total_ch_bills = int(ch_bill_counts.sum()) if len(ch_bill_counts) else 0
938
+ house_label = "HB" if "HB" in ch_bill_counts.index else (ch_bill_counts.index[0] if len(ch_bill_counts) else "HB")
939
+ senate_label = "SB" if "SB" in ch_bill_counts.index else (ch_bill_counts.index[1] if len(ch_bill_counts) > 1 else "SB")
940
+ house_pct = pct(int(ch_bill_counts.get(house_label, 0)), total_ch_bills) if total_ch_bills else 0.0
941
+ senate_pct = pct(int(ch_bill_counts.get(senate_label, 0)), total_ch_bills) if total_ch_bills else 0.0
942
+
943
+ st.markdown(
944
+ f"""
945
+ <div class="kpi-grid">
946
+ <div class="kpi-card">
947
+ <div class="kpi-label">Bills</div>
948
+ <div class="kpi-value">{total_bills:,}</div>
949
+ </div>
950
+ <div class="kpi-card" title="{impact_tooltip}">
951
+ <div class="kpi-label">High Impact Bills</div>
952
+ <div class="kpi-value">{high_impact_bills if high_impact_bills != "-" else "-"}</div>
953
+ </div>
954
+ <div class="kpi-card">
955
+ <div class="kpi-label">Categories</div>
956
+ <div class="kpi-value">{num_main:,}</div>
957
+ </div>
958
+ <div class="kpi-card">
959
+ <div class="kpi-label">{str(house_label)} Bills %</div>
960
+ <div class="kpi-value">{house_pct:.1f}%</div>
961
+ </div>
962
+ <div class="kpi-card">
963
+ <div class="kpi-label">{str(senate_label)} Bills %</div>
964
+ <div class="kpi-value">{senate_pct:.1f}%</div>
965
+ </div>
966
+ </div>
967
+ """,
968
+ unsafe_allow_html=True,
969
+ )
970
+
971
+ # =====================================================
972
+ # Manager Visual: Significant Category Shifts (Z-score)
973
+ # Left: ranked bar chart of categories beyond ±2σ
974
+ # Right: ranked interpretation table (directional)
975
+ # =====================================================
976
+
977
+ st.markdown("### Significant Category Shifts (vs Seasonal Baseline)")
978
+
979
def compute_all_category_monthly_shares(df_all: pd.DataFrame, cat_col: str, bill_id_col: str, date_col: str) -> pd.DataFrame:
    """
    Monthly share (%) of unique bills for every category.

    For each calendar month:
        share = (unique bills tagged with the category) / (unique bills overall) * 100

    Returns a chronologically sorted frame with columns:
        [_ym, Category, cat_bills, total_bills, share, period_str, year, calendar_month]
    """
    work = df_all.copy()
    work["_ym"] = work[date_col].dt.to_period("M")

    # Denominator: distinct bills seen in each month (rows with NaN category count here).
    monthly_totals = (
        work.groupby("_ym")[bill_id_col]
        .nunique()
        .rename("total_bills")
        .reset_index()
    )

    # Numerator: distinct bills per (month, category); uncategorized rows are excluded.
    per_cat = (
        work.loc[work[cat_col].notna()]
        .groupby(["_ym", cat_col])[bill_id_col]
        .nunique()
        .rename("cat_bills")
        .reset_index()
    )

    out = per_cat.merge(monthly_totals, on="_ym", how="left")
    ratio = out["cat_bills"] / out["total_bills"] * 100.0
    out["share"] = ratio.replace([np.inf, -np.inf], 0).fillna(0)
    out["period_str"] = out["_ym"].astype(str)

    # Join keys used downstream by the seasonal-baseline merge.
    out["year"] = out["_ym"].dt.year
    out["calendar_month"] = out["_ym"].dt.month

    return (
        out.rename(columns={cat_col: "Category"})
        .sort_values("_ym")
        .reset_index(drop=True)
    )
1009
+
1010
+
1011
def compute_category_seasonal_baseline_all(share_all: pd.DataFrame, lookback_years: int = 5) -> pd.DataFrame:
    """
    Seasonal baseline per (Category, calendar_month): mean and std of the
    monthly share over the most recent `lookback_years` years of data.

    Accepts frames carrying either a `_ym` monthly Period column or a
    `period_str` column ("YYYY-MM"); missing keys are derived so the
    groupby never raises a KeyError.

    Returns columns: Category, calendar_month, mean_share, std_share
    (std_share is 0.0 where only a single observation exists).
    """
    empty = pd.DataFrame(columns=["Category", "calendar_month", "mean_share", "std_share"])
    if share_all is None or share_all.empty:
        return empty

    work = share_all.copy()

    # Reconstruct the monthly period key if it was dropped upstream.
    if "_ym" not in work.columns:
        if "period_str" not in work.columns:
            return empty
        work["_ym"] = pd.PeriodIndex(work["period_str"], freq="M")

    if "calendar_month" not in work.columns:
        work["calendar_month"] = work["_ym"].dt.month

    # Always rebuild the year from _ym so a stale column can't skew the cutoff.
    work["year"] = work["_ym"].dt.year

    newest = int(work["year"].max()) if not work.empty else 0
    recent = work[work["year"] > newest - lookback_years]

    stats = (
        recent.groupby(["Category", "calendar_month"])["share"]
        .agg(mean_share="mean", std_share="std")
        .reset_index()
    )
    stats["std_share"] = stats["std_share"].fillna(0.0)
    return stats
1046
+
1047
+
1048
+ # --- Build monthly shares across FULL data for baseline ---
1049
+ share_all = compute_all_category_monthly_shares(df_full, CAT_MAIN, BILL_ID_COL, DATE_COL)
1050
+ baseline_all = compute_category_seasonal_baseline_all(share_all, lookback_years=5)
1051
+
1052
+ # --- Choose the "current" month for the selected time window (end_date month) ---
1053
+ target_ym = pd.to_datetime(end_date).to_period("M")
1054
+ target_period_str = str(target_ym)
1055
+
1056
+ current_month = share_all[share_all["_ym"] == target_ym].copy()
1057
+ if current_month.empty:
1058
+ st.info("No monthly data available for the selected end date month to compute Z-scores.")
1059
+ else:
1060
+ # Ensure calendar_month exists for the merge key
1061
+ if "calendar_month" not in current_month.columns:
1062
+ if "_ym" in current_month.columns:
1063
+ current_month["calendar_month"] = current_month["_ym"].dt.month
1064
+ elif "period_str" in current_month.columns:
1065
+ current_month["_ym"] = pd.PeriodIndex(current_month["period_str"], freq="M")
1066
+ current_month["calendar_month"] = current_month["_ym"].dt.month
1067
+ else:
1068
+ current_month["calendar_month"] = int(pd.to_datetime(end_date).month)
1069
+
1070
+ # Join baseline (Category x calendar_month) and compute Z-score
1071
+ current_month = current_month.merge(
1072
+ baseline_all,
1073
+ on=["Category", "calendar_month"],
1074
+ how="left"
1075
+ )
1076
+
1077
+ current_month["z_score"] = np.where(
1078
+ current_month["std_share"] > 0,
1079
+ (current_month["share"] - current_month["mean_share"]) / current_month["std_share"],
1080
+ 0.0
1081
+ )
1082
+
1083
+ # Optional % change vs baseline mean (relative)
1084
+ current_month["pct_change_vs_mean"] = np.where(
1085
+ current_month["mean_share"] > 0,
1086
+ (current_month["share"] - current_month["mean_share"]) / current_month["mean_share"] * 100.0,
1087
+ 0.0
1088
+ )
1089
+
1090
+ # Only include categories beyond ±2σ threshold
1091
+ shifts_sig = current_month[current_month["z_score"].abs() >= 2.0].copy()
1092
+
1093
+ # --- Fallback: if none exceed ±2σ, show the largest movers by |Z| (clearly labeled) ---
1094
+ show_fallback = False
1095
+ if shifts_sig.empty:
1096
+ show_fallback = True
1097
+ st.info(f"No categories exceeded ±2σ in {target_period_str}. Showing largest movers instead (not statistically significant).")
1098
+ shifts = current_month.copy()
1099
+ shifts["abs_z"] = shifts["z_score"].abs()
1100
+ shifts = shifts.sort_values("abs_z", ascending=False).head(12)
1101
+ else:
1102
+ shifts = shifts_sig.copy()
1103
+ shifts["abs_z"] = shifts["z_score"].abs()
1104
+ shifts = shifts.sort_values("abs_z", ascending=False)
1105
+
1106
+ # Color-coded: Blue above baseline, Red below baseline
1107
+ shifts["Color"] = np.where(shifts["z_score"] >= 0, "Above baseline", "Below baseline")
1108
+ color_map_shift = {"Above baseline": C_POSITIVE, "Below baseline": C_NEGATIVE}
1109
+
1110
+ left_col, right_col = st.columns([1.55, 1.0])
1111
+
1112
+ with left_col:
1113
+ st.markdown("**What’s moving the most?**")
1114
+ fig_shift = px.bar(
1115
+ shifts.iloc[::-1], # reverse so biggest appears at top in horizontal bar
1116
+ x="z_score",
1117
+ y="Category",
1118
+ orientation="h",
1119
+ color="Color",
1120
+ color_discrete_map=color_map_shift,
1121
+ template=PLOTLY_TEMPLATE,
1122
+ custom_data=["share", "mean_share", "std_share", "pct_change_vs_mean", "period_str"],
1123
+ labels={"z_score": "Z-score (σ from baseline)", "Category": ""},
1124
+ )
1125
+ fig_shift.update_traces(
1126
+ hovertemplate=(
1127
+ "<b>%{y}</b><br>"
1128
+ "Z-score: %{x:.2f}<br>"
1129
+ "Current share: %{customdata[0]:.2f}%<br>"
1130
+ "Baseline mean: %{customdata[1]:.2f}%<br>"
1131
+ "Baseline std: %{customdata[2]:.2f}<br>"
1132
+ "% change vs mean: %{customdata[3]:.1f}%<br>"
1133
+ "Month: %{customdata[4]}<extra></extra>"
1134
+ )
1135
+ )
1136
+ fig_shift = tight_layout(fig_shift, height=max(420, len(shifts) * 28 + 180))
1137
+ fig_shift.update_yaxes(showgrid=False)
1138
+ fig_shift.update_xaxes(zeroline=True, zerolinecolor="#C9D3D6")
1139
+ st.plotly_chart(fig_shift, use_container_width=True, config={"displayModeBar": False})
1140
+
1141
+ if show_fallback:
1142
+ st.caption("Largest movers shown because none crossed the ±2σ significance threshold.")
1143
+
1144
+ with right_col:
1145
+ st.markdown("**Current Significant Shifts**" if not show_fallback else "**Largest Movers (Below ±2σ)**")
1146
+
1147
+ # Directional ranking (NOT absolute):
1148
+ # - Top: largest positive deviations
1149
+ # - Bottom: largest negative deviations
1150
+ pos = shifts[shifts["z_score"] > 0].sort_values("z_score", ascending=False).copy()
1151
+ neg = shifts[shifts["z_score"] < 0].sort_values("z_score", ascending=True).copy()
1152
+
1153
+ def _mk_panel(df_part: pd.DataFrame, arrow: str):
1154
+ if df_part.empty:
1155
+ return pd.DataFrame(columns=["Category", "Direction", "Z-Score", "% Change", "Time Window"])
1156
+ out = df_part[["Category", "z_score", "pct_change_vs_mean"]].copy()
1157
+ out["Direction"] = arrow
1158
+ out["Z-Score"] = out["z_score"].round(2)
1159
+ out["% Change"] = out["pct_change_vs_mean"].round(1)
1160
+ out["Time Window"] = f"{target_period_str}"
1161
+ out = out.drop(columns=["z_score", "pct_change_vs_mean"])
1162
+ return out
1163
+
1164
+ panel_pos = _mk_panel(pos, "↑")
1165
+ panel_neg = _mk_panel(neg, "↓")
1166
+
1167
+ panel = pd.concat([panel_pos, panel_neg], axis=0).reset_index(drop=True)
1168
+ panel.insert(0, "Rank", np.arange(1, len(panel) + 1))
1169
+
1170
+ st.dataframe(panel, use_container_width=True, height=380)
1171
+ st.caption("Directional ranking: accelerators (↑) first, contractions (↓) last. Threshold: |Z| ≥ 2 (fallback shows top movers if none qualify).")
1172
+
1173
+ # -----------------------------
1174
+ # Category ranking
1175
+ # -----------------------------
1176
+ st.markdown("### Category Ranking")
1177
+ cat_rank = (
1178
+ df_f.groupby(CAT_MAIN)[BILL_ID_COL].nunique()
1179
+ .sort_values(ascending=False)
1180
+ .reset_index(name="Bills")
1181
+ .head(20)
1182
+ )
1183
+
1184
+ cat_hover_dir = []
1185
+ cat_hover_impact = []
1186
+ for cat in cat_rank[CAT_MAIN].astype(str).tolist():
1187
+ sdf = df_f[df_f[CAT_MAIN].astype(str) == str(cat)].copy()
1188
+ direction, _slope = classify_direction(sdf, period_col, period_order, BILL_ID_COL)
1189
+ if safe_col(sdf, IMPACT_SCORE_COL):
1190
+ bmax = sdf[[BILL_ID_COL, IMPACT_SCORE_COL]].dropna().groupby(BILL_ID_COL)[IMPACT_SCORE_COL].max()
1191
+ avg_imp = float(bmax.mean()) if len(bmax) else float("nan")
1192
+ else:
1193
+ avg_imp = float("nan")
1194
+ cat_hover_dir.append(direction)
1195
+ cat_hover_impact.append(None if np.isnan(avg_imp) else round(avg_imp, 2))
1196
+
1197
+ cat_rank2 = cat_rank.copy()
1198
+ cat_rank2["Direction"] = cat_hover_dir
1199
+ cat_rank2["AvgImpact"] = cat_hover_impact
1200
+
1201
+ fig_rank = px.bar(
1202
+ cat_rank2.iloc[::-1],
1203
+ x="Bills",
1204
+ y=CAT_MAIN,
1205
+ orientation="h",
1206
+ labels={"Bills": "Bills", CAT_MAIN: ""},
1207
+ template=PLOTLY_TEMPLATE,
1208
+ custom_data=["Direction", "AvgImpact"],
1209
+ )
1210
+ fig_rank.update_traces(
1211
+ marker_color=C_LAZURITE_BLUE,
1212
+ hovertemplate=(
1213
+ "<b>%{y}</b><br>"
1214
+ "Bills: %{x}<br>"
1215
+ "Direction: %{customdata[0]}<br>"
1216
+ "Avg Political Impact: %{customdata[1]}<extra></extra>"
1217
+ )
1218
+ )
1219
+ fig_rank = tight_layout(fig_rank, height=420)
1220
+ fig_rank.update_yaxes(showgrid=False)
1221
+ st.plotly_chart(fig_rank, use_container_width=True, config={"displayModeBar": False})
1222
+
1223
+ # =====================================================
1224
+ # Row 1: Category Share (with Baseline) + Subcategory Drivers
1225
+ # =====================================================
1226
+ df_cat = df_f[df_f[CAT_MAIN].astype(str) == str(selected_cat)].copy()
1227
+
1228
+ # --- STEP 1-8: Compute full share series on entire df_full for baseline ---
1229
+ share_series_full = compute_monthly_share_series(
1230
+ df_full, selected_cat, CAT_MAIN, BILL_ID_COL, DATE_COL
1231
+ )
1232
+ seasonal_baseline = compute_seasonal_baseline(share_series_full, lookback_years=5)
1233
+
1234
+ total_monthly_full = (
1235
+ df_full.groupby(df_full[DATE_COL].dt.to_period("M"))[BILL_ID_COL]
1236
+ .nunique()
1237
+ .reset_index(name="total_bills")
1238
+ )
1239
+ reg_stats = compute_regression_on_share(share_series_full, total_monthly_full)
1240
+ analysis_df = compute_zscore_and_residuals(share_series_full, seasonal_baseline, reg_stats)
1241
+
1242
+ # Build period order strings for filter alignment (months only for baseline chart)
1243
+ month_period_order = build_full_period_order(start_date, end_date, "month")
1244
+
1245
+ # --- Subcategory section ---
1246
+ df_sub = df_cat.copy()
1247
+ cutoff = None
1248
+ if sub_time != "Overall":
1249
+ days = 30 if sub_time == "Last 30 days" else 60
1250
+ cutoff = (pd.to_datetime(end_date) - pd.Timedelta(days=days)).date()
1251
+ df_sub = df_sub[df_sub[DATE_COL].dt.date >= cutoff]
1252
+
1253
+ emerging_map = {}
1254
+ if cutoff is not None and safe_col(df_cat, CAT_SUB):
1255
+ before = df_cat[df_cat[DATE_COL].dt.date < cutoff]
1256
+ before_set = set(before[CAT_SUB].dropna().astype(str).unique().tolist())
1257
+ window_set = set(df_sub[CAT_SUB].dropna().astype(str).unique().tolist())
1258
+ for s in window_set:
1259
+ emerging_map[s] = (s not in before_set)
1260
+
1261
+ sub_ct = pd.DataFrame()
1262
+ if CAT_SUB in df_sub.columns and not df_sub[CAT_SUB].isna().all():
1263
+ sub_ct = (
1264
+ df_sub.dropna(subset=[CAT_SUB])
1265
+ .groupby(CAT_SUB)[BILL_ID_COL].nunique()
1266
+ .reset_index(name="Bills")
1267
+ .sort_values("Bills", ascending=False)
1268
+ .head(12)
1269
+ )
1270
+
1271
+ hover_dir = []
1272
+ hover_impact = []
1273
+ for sub in sub_ct[CAT_SUB].astype(str).tolist():
1274
+ sdf = df_sub[df_sub[CAT_SUB].astype(str) == str(sub)].copy()
1275
+ direction, _slope = classify_direction(sdf, period_col, period_order, BILL_ID_COL)
1276
+ if cutoff is not None and emerging_map.get(sub, False):
1277
+ direction = "Emerging"
1278
+ if safe_col(sdf, IMPACT_SCORE_COL):
1279
+ bmax = sdf[[BILL_ID_COL, IMPACT_SCORE_COL]].dropna().groupby(BILL_ID_COL)[IMPACT_SCORE_COL].max()
1280
+ avg_imp = float(bmax.mean()) if len(bmax) else float("nan")
1281
+ else:
1282
+ avg_imp = float("nan")
1283
+ hover_dir.append(direction)
1284
+ hover_impact.append(None if np.isnan(avg_imp) else round(avg_imp, 2))
1285
+
1286
+ sub_ct2 = sub_ct.copy()
1287
+ sub_ct2["Direction"] = hover_dir
1288
+ sub_ct2["AvgImpact"] = hover_impact
1289
+
1290
+ r1a, r1b = st.columns(2)
1291
+
1292
+ with r1a:
1293
+ st.markdown("### Category Share Over Time")
1294
+ fig_share = plot_category_share_with_baseline(
1295
+ analysis_df, month_period_order, reg_stats["significant_growth"]
1296
+ )
1297
+ if fig_share.data:
1298
+ st.plotly_chart(fig_share, use_container_width=True, config={"displayModeBar": False})
1299
+ else:
1300
+ st.info("No share data available for this selection.")
1301
+
1302
+ with r1b:
1303
+ st.markdown("### Subcategory Drivers")
1304
+ if sub_ct2.empty:
1305
+ st.info("No subcategory data available for this selection/window.")
1306
+ else:
1307
+ show = sub_ct2.sort_values("Bills", ascending=True)
1308
+ fig = px.bar(
1309
+ show,
1310
+ x="Bills",
1311
+ y=CAT_SUB,
1312
+ orientation="h",
1313
+ template=PLOTLY_TEMPLATE,
1314
+ labels={"Bills": "Bills", CAT_SUB: ""},
1315
+ custom_data=["Direction", "AvgImpact"],
1316
+ )
1317
+ fig.update_traces(
1318
+ marker_color=C_PUMPING_SPICE,
1319
+ hovertemplate=(
1320
+ "<b>%{y}</b><br>"
1321
+ "Bills: %{x}<br>"
1322
+ "Direction: %{customdata[0]}<br>"
1323
+ "Avg Political Impact: %{customdata[1]}<extra></extra>"
1324
+ ),
1325
+ )
1326
+ fig = tight_layout(fig, height=420)
1327
+ fig.update_yaxes(showgrid=False)
1328
+ st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False})
1329
+
1330
+ # -----------------------------
1331
+ # Row 2: Policy Direction + TF-IDF
1332
+ # -----------------------------
1333
+ r2a, r2b = st.columns([1.55, 1.0])
1334
+
1335
+ inc_terms = explode_terms(df_f, INC_COL, stopwords=STOPWORDS, min_len=3)
1336
+ dec_terms = explode_terms(df_f, DEC_COL, stopwords=STOPWORDS, min_len=3)
1337
+
1338
+ if not inc_terms.empty:
1339
+ inc_terms["_period"] = inc_terms[period_col]
1340
+ if not dec_terms.empty:
1341
+ dec_terms["_period"] = dec_terms[period_col]
1342
+
1343
+ inc_ts = inc_terms.groupby([period_col])["mentions"].sum().reindex(period_order, fill_value=0).reset_index()
1344
+ inc_ts.columns = [period_col, "inc"]
1345
+ dec_ts = dec_terms.groupby([period_col])["mentions"].sum().reindex(period_order, fill_value=0).reset_index()
1346
+ dec_ts.columns = [period_col, "dec"]
1347
+
1348
+ net_ts = pd.merge(inc_ts, dec_ts, on=period_col, how="left").fillna(0)
1349
+ net_ts["net"] = net_ts["inc"] - net_ts["dec"]
1350
+
1351
+ df_inc_rows = df_f[df_f[INC_COL].notna()].copy() if INC_COL in df_f.columns else pd.DataFrame(columns=df_f.columns)
1352
+ df_dec_rows = df_f[df_f[DEC_COL].notna()].copy() if DEC_COL in df_f.columns else pd.DataFrame(columns=df_f.columns)
1353
+ df_inc_rows["_period"] = df_inc_rows[period_col] if not df_inc_rows.empty else []
1354
+ df_dec_rows["_period"] = df_dec_rows[period_col] if not df_dec_rows.empty else []
1355
+
1356
+
1357
def top_keywords_for_period(term_df: pd.DataFrame, period_value, k=6) -> pd.DataFrame:
    """Top-k most frequent terms within a single time period.

    Counts rows per term via value_counts — i.e. one count per extracted
    term occurrence; the frame's `mentions` column is not aggregated here.
    Returns an empty frame when there is no data for the period, otherwise
    a two-column frame: Keyword, Mentions.
    """
    if term_df is None or term_df.empty:
        return pd.DataFrame()
    in_period = term_df.loc[term_df["_period"] == period_value]
    if in_period.empty:
        return pd.DataFrame()
    ranked = in_period["term"].value_counts().head(k).reset_index()
    ranked.columns = ["Keyword", "Mentions"]
    return ranked
1366
+
1367
+
1368
def top_beneficiaries_for_period(df_rows: pd.DataFrame, period_value, benef_col: str, k=6) -> pd.DataFrame:
    """Top-k beneficiaries mentioned within one time period.

    Splits the list-like beneficiary column with `_split_listlike`, explodes
    to one row per beneficiary, strips whitespace, drops blanks, and counts
    occurrences. Returns an empty frame when the column is missing or the
    period has no usable values; otherwise columns are Beneficiary, Mentions.
    """
    if benef_col not in df_rows.columns or df_rows.empty:
        return pd.DataFrame()
    in_period = df_rows.loc[df_rows["_period"] == period_value].copy()
    if in_period.empty or in_period[benef_col].dropna().empty:
        return pd.DataFrame()
    in_period["_b"] = in_period[benef_col].apply(_split_listlike)
    exploded = in_period.explode("_b").dropna(subset=["_b"])
    exploded["_b"] = exploded["_b"].astype(str).str.strip()
    exploded = exploded[exploded["_b"].str.len() > 0]
    if exploded.empty:
        return pd.DataFrame()
    ranked = exploded["_b"].value_counts().head(k).reset_index()
    ranked.columns = ["Beneficiary", "Mentions"]
    return ranked
1383
+
1384
+
1385
+ inc_kw_short, dec_kw_short, inc_b_short, dec_b_short = [], [], [], []
1386
+ for p in net_ts[period_col].tolist():
1387
+ inc_kw = top_keywords_for_period(inc_terms, p, k=6)
1388
+ dec_kw = top_keywords_for_period(dec_terms, p, k=6)
1389
+ inc_b = top_beneficiaries_for_period(df_inc_rows, p, BENEF_COL, k=6) if safe_col(df_f, BENEF_COL) else pd.DataFrame()
1390
+ dec_b = top_beneficiaries_for_period(df_dec_rows, p, BENEF_COL, k=6) if safe_col(df_f, BENEF_COL) else pd.DataFrame()
1391
+ inc_kw_short.append(short_list(inc_kw["Keyword"].tolist() if not inc_kw.empty else [], 3))
1392
+ dec_kw_short.append(short_list(dec_kw["Keyword"].tolist() if not dec_kw.empty else [], 3))
1393
+ inc_b_short.append(short_list(inc_b["Beneficiary"].tolist() if not inc_b.empty else [], 2))
1394
+ dec_b_short.append(short_list(dec_b["Beneficiary"].tolist() if not dec_b.empty else [], 2))
1395
+
1396
+ net_ts["inc_kw_short"] = inc_kw_short
1397
+ net_ts["dec_kw_short"] = dec_kw_short
1398
+ net_ts["inc_b_short"] = inc_b_short
1399
+ net_ts["dec_b_short"] = dec_b_short
1400
+
1401
+ with r2a:
1402
+ st.markdown("### Policy Direction Over Time")
1403
+ if net_ts.empty or (net_ts["inc"].sum() == 0 and net_ts["dec"].sum() == 0):
1404
+ st.info("No increasing/decreasing aspects available under current filters.")
1405
+ else:
1406
+ custom = np.stack(
1407
+ [
1408
+ net_ts["inc_kw_short"].astype(str),
1409
+ net_ts["dec_kw_short"].astype(str),
1410
+ net_ts["inc_b_short"].astype(str),
1411
+ net_ts["dec_b_short"].astype(str),
1412
+ ],
1413
+ axis=1,
1414
+ )
1415
+ fig = go.Figure()
1416
+ fig.add_trace(go.Bar(
1417
+ x=net_ts[period_col],
1418
+ y=net_ts["inc"],
1419
+ name="Increasing",
1420
+ marker_color=C_POSITIVE,
1421
+ customdata=custom,
1422
+ hovertemplate="<b>%{x}</b><br>Increasing: %{y}<br>Keywords: %{customdata[0]}<br>Beneficiaries: %{customdata[2]}<extra></extra>",
1423
+ ))
1424
+ fig.add_trace(go.Bar(
1425
+ x=net_ts[period_col],
1426
+ y=-net_ts["dec"],
1427
+ name="Decreasing",
1428
+ marker_color=C_NEGATIVE,
1429
+ customdata=custom,
1430
+ hovertemplate="<b>%{x}</b><br>Decreasing: %{y:.0f}<br>Keywords: %{customdata[1]}<br>Beneficiaries: %{customdata[3]}<extra></extra>",
1431
+ ))
1432
+ fig.add_trace(go.Scatter(
1433
+ x=net_ts[period_col],
1434
+ y=net_ts["net"],
1435
+ mode="lines+markers",
1436
+ name="Net",
1437
+ line=dict(color=C_TRAPPED_DARKNESS, width=2),
1438
+ hovertemplate="<b>%{x}</b><br>Net: %{y}<extra></extra>",
1439
+ ))
1440
+ fig.update_layout(
1441
+ template=PLOTLY_TEMPLATE,
1442
+ barmode="relative",
1443
+ height=420,
1444
+ margin=dict(l=8, r=8, t=8, b=8),
1445
+ hovermode="x unified",
1446
+ legend=dict(orientation="h", yanchor="bottom", y=-0.22, xanchor="center", x=0.5),
1447
+ yaxis_title="Mentions",
1448
+ xaxis_title="",
1449
+ plot_bgcolor="white",
1450
+ paper_bgcolor="white",
1451
+ )
1452
+ fig.update_xaxes(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0")
1453
+ fig.update_yaxes(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0",
1454
+ zeroline=True, zerolinecolor="#C9D3D6")
1455
+ st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False})
1456
+
1457
+ with r2b:
1458
+ st.markdown("### Top Keywords")
1459
+ tfidf_mode = st.selectbox(
1460
+ "TF-IDF Source",
1461
+ ["Motivation", "Intent", "Legislative Strategy"],
1462
+ index=0
1463
+ )
1464
+ tfidf_text_col = KW_SOURCES[tfidf_mode]
1465
+ if not safe_col(df_f, tfidf_text_col):
1466
+ st.info(f"Column `{tfidf_text_col}` not available.")
1467
+ else:
1468
+ df_rest = df_f[df_f[CAT_MAIN].astype(str) != str(selected_cat)].copy()
1469
+ tf_phrases = build_contrastive_tfidf(df_cat, df_rest, BILL_ID_COL, tfidf_text_col, top_k=15)
1470
+ if not tf_phrases:
1471
+ st.info("TF-IDF returned no meaningful category-distinct phrases for this slice.")
1472
+ else:
1473
+ kw_tbl = pd.DataFrame(tf_phrases, columns=["Keyword", "Distinctiveness Score"])
1474
+ kw_tbl.index = np.arange(1, len(kw_tbl) + 1)
1475
+ st.dataframe(kw_tbl, use_container_width=True, height=300)
1476
+
1477
+ # =====================================================
1478
+ # Row 3: Subcategory Momentum (STEP 9) + Heatmap (STEP 10)
1479
+ # =====================================================
1480
+ st.markdown("")
1481
+ r3a, r3b = st.columns(2)
1482
+
1483
+ with r3a:
1484
+ st.markdown("### Subcategory Momentum")
1485
+
1486
+ # df_full_cat: full historical data for selected category (for 5-yr slope)
1487
+ df_full_cat = df_full[df_full[CAT_MAIN].astype(str) == str(selected_cat)].copy()
1488
+
1489
+ # STEP 9: pass both filtered df_cat (short-term pct change) and df_full_cat (5-yr slope)
1490
+ sub_momentum = compute_subcategory_momentum(
1491
+ df_cat, df_full_cat, CAT_SUB, BILL_ID_COL, period_col, period_order
1492
+ )
1493
+
1494
+ if sub_momentum.empty:
1495
+ st.info("Not enough data to compute momentum.")
1496
+ else:
1497
+ # SlopeScaled = 5-year regression slope * 100 (momentum strength)
1498
+ sub_momentum["SlopeScaled"] = sub_momentum["Slope"] * 100.0
1499
+ sub_momentum["SlopeScaled"] = pd.to_numeric(sub_momentum["SlopeScaled"], errors="coerce").fillna(0.0)
1500
+
1501
+ eps = 1e-4
1502
+ sub_momentum["Direction"] = np.where(
1503
+ sub_momentum["SlopeScaled"] > eps, "Rising",
1504
+ np.where(sub_momentum["SlopeScaled"] < -eps, "Falling", "Stable")
1505
+ )
1506
+
1507
+ # Show top movers by absolute slope + top stable
1508
+ movers = sub_momentum[sub_momentum["Direction"] != "Stable"].copy()
1509
+ stable = sub_momentum[sub_momentum["Direction"] == "Stable"].copy()
1510
+ movers = movers.reindex(movers["SlopeScaled"].abs().sort_values(ascending=False).index).head(10)
1511
+ stable = stable.head(5)
1512
+ show_df = pd.concat([movers, stable], axis=0).drop_duplicates("Subcategory")
1513
+ if show_df.empty:
1514
+ show_df = sub_momentum.head(12).copy()
1515
+
1516
+ # Ensure bars are always visible (min visible length)
1517
+ min_visible = 0.20
1518
+ show_df = show_df.copy()
1519
+ show_df["DisplaySlope"] = show_df["SlopeScaled"]
1520
+ near_zero = show_df["DisplaySlope"].abs() < min_visible
1521
+ show_df.loc[near_zero & (show_df["Direction"] == "Rising"), "DisplaySlope"] = min_visible
1522
+ show_df.loc[near_zero & (show_df["Direction"] == "Falling"), "DisplaySlope"] = -min_visible
1523
+ show_df.loc[near_zero & (show_df["Direction"] == "Stable"), "DisplaySlope"] = min_visible * 0.6
1524
+ show_df = show_df.sort_values("DisplaySlope", ascending=True)
1525
+
1526
+ show_df["AvgPctChange"] = pd.to_numeric(show_df["AvgPctChange"], errors="coerce").fillna(0.0)
1527
+ show_df["SlopeScaled"] = pd.to_numeric(show_df["SlopeScaled"], errors="coerce").fillna(0.0)
1528
+
1529
+
1530
+ color_map = {"Rising": C_POSITIVE, "Falling": C_NEGATIVE, "Stable": C_STABLE}
1531
+ fig = px.bar(
1532
+ show_df,
1533
+ x="DisplaySlope",
1534
+ y="Subcategory",
1535
+ color="Direction",
1536
+ orientation="h",
1537
+ color_discrete_map=color_map,
1538
+ template=PLOTLY_TEMPLATE,
1539
+ custom_data=["SlopeScaled", "Direction", "AvgPctChange"],
1540
+ labels={"DisplaySlope": "5-Yr Momentum Slope (x100)", "Subcategory": ""},
1541
+ )
1542
+ fig.update_traces(
1543
+ hovertemplate=(
1544
+ "<b>%{y}</b><br>"
1545
+ "Direction: %{customdata[1]}<br>"
1546
+ "5-Yr Regression Slope (x100): %{customdata[0]:.3f}<br>"
1547
+ "Short-Term Avg % Change: %{customdata[2]:.1f}%<extra></extra>"
1548
+ )
1549
+ )
1550
+ max_abs = float(np.nanmax(np.abs(show_df["DisplaySlope"].to_numpy()))) if len(show_df) else 1.0
1551
+ max_abs = max(max_abs, 1.0)
1552
+ fig.update_layout(
1553
+ template=PLOTLY_TEMPLATE,
1554
+ height=520,
1555
+ margin=dict(l=8, r=8, t=8, b=8),
1556
+ xaxis_title="5-Yr Momentum Slope (x100)",
1557
+ yaxis_title="",
1558
+ plot_bgcolor="white",
1559
+ paper_bgcolor="white",
1560
+ legend=dict(orientation="h", yanchor="bottom", y=-0.22, xanchor="center", x=0.5),
1561
+ barmode="relative",
1562
+ )
1563
+ fig.update_xaxes(
1564
+ range=[-max_abs * 1.15, max_abs * 1.15],
1565
+ showgrid=True, gridcolor="#EDF2F4",
1566
+ showline=True, linecolor="#D6DEE0",
1567
+ zeroline=True, zerolinecolor="#C9D3D6",
1568
+ )
1569
+ fig.update_yaxes(showgrid=False)
1570
+ st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False})
1571
+
1572
+ with r3b:
1573
+ # -------------------------------------------------------
1574
+ # STEP 10: Heatmap — Intended Beneficiaries × Increasing Aspects
1575
+ # Filter: top 10 categories with bill count > K threshold
1576
+ # Remove empty/very low-density cells
1577
+ # Conditional formatting: annotate each cell with actual bill count
1578
+ # -------------------------------------------------------
1579
+ st.markdown("### Beneficiaries × Increasing Aspects")
1580
+
1581
+ hc1, hc2 = st.columns(2)
1582
+ with hc1:
1583
+ cat_bill_thresh = st.slider(
1584
+ "Min Bills per Category", min_value=5, max_value=100, value=20, step=5,
1585
+ help="Only include categories with at least this many unique bills"
1586
+ )
1587
+ with hc2:
1588
+ min_cell_thresh = st.slider(
1589
+ "Min Bills per Cell", min_value=1, max_value=15, value=2, step=1,
1590
+ help="Remove cells with fewer than this many bills (sparse cell filter)"
1591
+ )
1592
+ topN_benef = st.slider("Top N Beneficiaries", min_value=5, max_value=25, value=10, step=1)
1593
+
1594
+ if not safe_col(df_f, BENEF_COL) or not safe_col(df_f, INC_COL):
1595
+ st.caption("Beneficiary or Increasing Aspects data not available.")
1596
+ else:
1597
+ # STEP 10a: Top 10 categories, filtered by bill count > K threshold
1598
+ cat_counts = df_f.groupby(CAT_MAIN)[BILL_ID_COL].nunique()
1599
+ eligible_cats = cat_counts[cat_counts >= cat_bill_thresh].sort_values(ascending=False)
1600
+
1601
+ if eligible_cats.empty:
1602
+ st.caption(f"No categories have ≥ {cat_bill_thresh} bills. Try lowering the threshold.")
1603
+ else:
1604
+ top10_cats = eligible_cats.head(10).index.tolist()
1605
+ df_heat = df_f[df_f[CAT_MAIN].isin(top10_cats)].copy()
1606
+
1607
+ # Explode beneficiaries
1608
+ df_heat["_benef"] = df_heat[BENEF_COL].apply(_split_listlike)
1609
+ df_heat = df_heat.explode("_benef").dropna(subset=["_benef"])
1610
+ df_heat["_benef"] = df_heat["_benef"].astype(str).str.strip()
1611
+ df_heat = df_heat[df_heat["_benef"].str.len() > 0]
1612
+
1613
+ # Explode increasing aspects and clean
1614
+ df_heat["_inc"] = df_heat[INC_COL].apply(_split_listlike)
1615
+ df_heat = df_heat.explode("_inc").dropna(subset=["_inc"])
1616
+ df_heat["_inc"] = df_heat["_inc"].astype(str).str.strip().str.lower()
1617
+ df_heat["_inc"] = df_heat["_inc"].str.replace(r"[^a-z0-9\s\-]", "", regex=True).str.strip()
1618
+ df_heat = df_heat[df_heat["_inc"].str.len() >= 3]
1619
+ df_heat = df_heat[~df_heat["_inc"].isin(STOPWORDS)]
1620
+
1621
+ if df_heat.empty:
1622
+ st.caption("No usable beneficiary × increasing aspects data.")
1623
+ else:
1624
+ # Keep top N beneficiaries and top 15 increasing aspect terms
1625
+ top_benef = df_heat["_benef"].value_counts().head(topN_benef).index.tolist()
1626
+ top_inc = df_heat["_inc"].value_counts().head(15).index.tolist()
1627
+
1628
+ df_heat = df_heat[
1629
+ df_heat["_benef"].isin(top_benef) &
1630
+ df_heat["_inc"].isin(top_inc)
1631
+ ].copy()
1632
+
1633
+ benef_heat = (
1634
+ df_heat.groupby(["_benef", "_inc"])[BILL_ID_COL].nunique()
1635
+ .reset_index(name="bills")
1636
+ )
1637
+
1638
+ # STEP 10b: Remove empty / very low-density cells (< min_cell_thresh)
1639
+ benef_heat = benef_heat[benef_heat["bills"] >= min_cell_thresh]
1640
+
1641
+ if benef_heat.empty:
1642
+ st.caption(f"No cells with ≥ {min_cell_thresh} bills. Try lowering the threshold.")
1643
+ else:
1644
+ pivot = benef_heat.pivot(index="_benef", columns="_inc", values="bills").fillna(0)
1645
+
1646
+ # Sort rows and columns by total density (highest at top/left)
1647
+ pivot = pivot.loc[
1648
+ pivot.sum(axis=1).sort_values(ascending=False).index,
1649
+ pivot.sum(axis=0).sort_values(ascending=False).index
1650
+ ]
1651
+
1652
+ z_actual = pivot.values.astype(float)
1653
+ # STEP 10c: log-scale for color (handles outliers gracefully)
1654
+ z_scaled = np.log1p(z_actual)
1655
+
1656
+ # STEP 10d: Conditional formatting — annotate each cell with actual count
1657
+ # Only show text for cells above the sparse threshold (already filtered)
1658
+ annotations = []
1659
+ for i, row_label in enumerate(pivot.index):
1660
+ for j, col_label in enumerate(pivot.columns):
1661
+ val = int(z_actual[i, j])
1662
+ if val > 0:
1663
+ # White text for dark cells, dark for light cells
1664
+ max_val = z_scaled.max() if z_scaled.max() > 0 else 1
1665
+ brightness = z_scaled[i, j] / max_val
1666
+ font_color = "white" if brightness > 0.55 else C_TRAPPED_DARKNESS
1667
+ annotations.append(
1668
+ dict(
1669
+ x=col_label,
1670
+ y=row_label,
1671
+ text=str(val),
1672
+ showarrow=False,
1673
+ font=dict(color=font_color, size=9),
1674
+ xref="x", yref="y",
1675
+ )
1676
+ )
1677
+
1678
+ fig = go.Figure(data=go.Heatmap(
1679
+ z=z_scaled,
1680
+ x=pivot.columns.astype(str).tolist(),
1681
+ y=pivot.index.astype(str).tolist(),
1682
+ colorscale=[
1683
+ [0.0, "#F2F5F6"],
1684
+ [0.25, "#C8D9DE"],
1685
+ [0.5, "#7FAAB7"],
1686
+ [0.75, "#3D7285"],
1687
+ [1.0, C_LAZURITE_BLUE],
1688
+ ],
1689
+ colorbar=dict(
1690
+ title="log(1+bills)",
1691
+ tickfont=dict(size=10),
1692
+ thickness=12,
1693
+ len=0.8,
1694
+ ),
1695
+ customdata=z_actual,
1696
+ hovertemplate=(
1697
+ "Beneficiary: %{y}<br>"
1698
+ "Aspect: %{x}<br>"
1699
+ "Unique Bills: %{customdata:.0f}<extra></extra>"
1700
+ ),
1701
+ xgap=1,
1702
+ ygap=1,
1703
+ ))
1704
+ fig.update_layout(
1705
+ template=PLOTLY_TEMPLATE,
1706
+ height=max(520, len(pivot.index) * 30 + 120),
1707
+ margin=dict(l=8, r=8, t=8, b=80),
1708
+ xaxis_title="Increasing Aspect",
1709
+ yaxis_title="",
1710
+ plot_bgcolor="white",
1711
+ paper_bgcolor="white",
1712
+ xaxis=dict(tickangle=-40, tickfont=dict(size=10)),
1713
+ yaxis=dict(tickfont=dict(size=10)),
1714
+ annotations=annotations,
1715
+ )
1716
+ st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False})
dockerignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **/.git
2
+ **/__pycache__
3
+ *.pyc
4
+ *.pkl
5
+ *.joblib
6
+ *.pt
7
+ *.bin
8
+ *.zip
9
+ *.tar
10
+ *.gz
11
+ notebooks/
12
+ outputs/
13
+ data/
features_standardized_11_renamed.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edc651cba51bc217650a48a8c4d66d9e329f19711779ad69e851816d057852c2
3
+ size 538177112
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+
2
+ streamlit==1.37.1
3
+ pandas==2.2.2
4
+ numpy==1.26.4
5
+ pyarrow==17.0.0
6
+ plotly==5.23.0
7
+ scikit-learn==1.4.2
8
+
utils.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
import ast
import numpy as np
import pandas as pd

# Column holding each bill's status timestamp; the derived year/month/week
# buckets in load_dataset are all computed from it.
DATE_COL = "status_date"

# Only the columns the dashboard actually uses — loading this subset instead
# of the full file cuts memory a LOT.
NEEDED_COLS = [
    # identity / provenance
    "bill_id",
    "session",
    "chamber",
    "bill_number",
    "status_date",
    # categorization
    "policy_domain_standardized",
    "category_main_label",
    "category_sub_label",
    # intent / direction
    "intent_standardized",
    "policy_direction_classifications",
    # list-like keyword payloads (see KEYWORD_COLS below)
    "category_main_keywords",
    "category_sub_keywords",
    "category_main_llama_summary_keywords",
    "category_sub_llama_summary_keywords",
    # goals / impact
    "legislative_goal_standardized",
    "impact_rating_standardized",
    "impact_rating_score",
]

# Columns whose cells hold list-like keyword data; these are the valid
# `keyword_col` arguments for explode_keywords.
KEYWORD_COLS = [
    "category_main_keywords",
    "category_sub_keywords",
    "category_main_llama_summary_keywords",
    "category_sub_llama_summary_keywords",
]
39
+
40
+
41
+ def _safe_listify(x):
42
+ """Turn list-like cells or strings into list[str]."""
43
+ if x is None:
44
+ return []
45
+ if isinstance(x, float) and np.isnan(x):
46
+ return []
47
+ if isinstance(x, list):
48
+ return [str(i).strip() for i in x if str(i).strip()]
49
+
50
+ s = str(x).strip()
51
+ if not s or s.lower() in {"nan", "none", "null"}:
52
+ return []
53
+
54
+ if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
55
+ try:
56
+ parsed = ast.literal_eval(s)
57
+ if isinstance(parsed, (list, tuple, set)):
58
+ return [str(i).strip() for i in parsed if str(i).strip()]
59
+ except Exception:
60
+ pass
61
+
62
+ parts = re.split(r"[,\|;]\s*", s)
63
+ return [p.strip() for p in parts if p.strip()]
64
+
65
+
66
def load_dataset(path: str) -> pd.DataFrame:
    """Load the bills dataset from a .parquet or .csv file.

    Keeps only NEEDED_COLS (whatever subset exists), parses DATE_COL,
    drops rows with unparseable dates and adds derived 'year', 'month'
    (month-start timestamp) and 'week' (week-start timestamp) columns.

    Raises ValueError for unsupported extensions or when DATE_COL is
    missing from the file.
    """
    lower = path.lower()
    if lower.endswith(".parquet"):
        try:
            # Fast path: read only the needed columns. The previous code
            # materialized the ENTIRE parquet just to list its columns,
            # then read it a second time — very costly on a ~500 MB file.
            df = pd.read_parquet(path, columns=NEEDED_COLS, engine="pyarrow")
        except Exception:
            # Some NEEDED_COLS are absent from this file: fall back to a
            # full read, then keep whatever subset exists. A genuinely
            # unreadable file re-raises from this second read.
            df = pd.read_parquet(path, engine="pyarrow")
            df = df[[c for c in NEEDED_COLS if c in df.columns]]
    elif lower.endswith(".csv"):
        # For CSV we can't cheaply inspect the header first; try usecols
        # and fall back to a full read + subset when columns are missing.
        try:
            df = pd.read_csv(path, usecols=NEEDED_COLS)
        except Exception:
            df = pd.read_csv(path)
            df = df[[c for c in NEEDED_COLS if c in df.columns]]
    else:
        raise ValueError("Supported formats: .parquet or .csv")

    if DATE_COL not in df.columns:
        raise ValueError(f"Expected a date column named '{DATE_COL}'")

    # Coerce bad dates to NaT and drop those rows before deriving buckets.
    df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
    df = df[df[DATE_COL].notna()].copy()

    df["year"] = df[DATE_COL].dt.year
    df["month"] = df[DATE_COL].dt.to_period("M").dt.to_timestamp()
    df["week"] = df[DATE_COL].dt.to_period("W").dt.start_time

    return df
92
+
93
+
94
+
95
def apply_filters(
    df: pd.DataFrame,
    date_min=None,
    date_max=None,
    sessions=None,
    chambers=None,
    policy_domains=None,
    category_main=None,
    category_sub=None,
    intents=None,
    policy_dirs=None,
):
    """Return a filtered copy of `df`.

    Date bounds are inclusive and compared against 'status_date'. Each
    membership filter is skipped when it is falsy, contains "All", or its
    column is absent from `df` — previously a missing column (possible via
    load_dataset's CSV fallback) raised KeyError.
    """
    out = df.copy()

    if date_min is not None:
        out = out[out["status_date"] >= pd.to_datetime(date_min)]
    if date_max is not None:
        out = out[out["status_date"] <= pd.to_datetime(date_max)]

    # (column, selected values) pairs; applied uniformly below.
    membership_filters = [
        ("session", sessions),
        ("chamber", chambers),
        ("policy_domain_standardized", policy_domains),
        ("category_main_label", category_main),
        ("category_sub_label", category_sub),
        ("intent_standardized", intents),
        ("policy_direction_classifications", policy_dirs),
    ]
    for col, values in membership_filters:
        # "All" acts as a sentinel meaning "no restriction".
        if values and "All" not in values and col in out.columns:
            out = out[out[col].isin(values)]

    return out
128
+
129
+
130
def explode_keywords(df: pd.DataFrame, keyword_col: str) -> pd.DataFrame:
    """Expand `keyword_col` into a long-format frame, one row per keyword.

    Output keeps the available context columns plus:
      - 'keyword': the raw keyword, stripped;
      - 'keyword_norm': lowercased, whitespace-collapsed, restricted to
        [a-z0-9 -_/], minimum length 3.
    """
    context_cols = [
        "bill_id",
        "status_date",
        "month",
        "week",
        "session",
        "chamber",
        "policy_domain_standardized",
        "category_main_label",
        "category_sub_label",
        "intent_standardized",
        "policy_direction_classifications",
        keyword_col,
    ]
    present = [c for c in context_cols if c in df.columns]

    long_df = df[present].copy()
    # One row per keyword; cells are parsed into lists first.
    long_df["keyword"] = long_df[keyword_col].apply(_safe_listify)
    long_df = long_df.explode("keyword", ignore_index=True)

    long_df["keyword"] = long_df["keyword"].astype(str).str.strip()
    # Drop empties and the literal "nan" that astype(str) produces for NaN.
    keep = (long_df["keyword"] != "") & (long_df["keyword"].str.lower() != "nan")
    long_df = long_df[keep]

    normalized = (
        long_df["keyword"]
        .str.lower()
        .str.replace(r"\s+", " ", regex=True)
        .str.replace(r"[^a-z0-9 \-_/]", "", regex=True)
        .str.strip()
    )
    long_df["keyword_norm"] = normalized

    # Very short normalized tokens carry no signal; require length >= 3.
    return long_df[normalized.str.len() >= 3]
165
+
166
+
167
def keyword_trends(df_long: pd.DataFrame, time_grain="month", top_n=15):
    """Compute overall top keywords and their time series.

    Returns (top, ts):
      - top: top-N keywords by total mention count;
      - ts: per-bucket mention counts for those keywords, sorted by bucket
        ascending then mentions descending. The bucket is 'month' when
        time_grain == "month", otherwise 'week'.
    """
    bucket = "week" if time_grain != "month" else "month"

    # Overall counts, largest first, truncated to the requested top-N.
    overall = df_long.groupby("keyword_norm").size().reset_index(name="count")
    top = overall.sort_values("count", ascending=False).head(top_n)

    selected_keywords = set(top["keyword_norm"])
    subset = df_long[df_long["keyword_norm"].isin(selected_keywords)]

    per_bucket = subset.groupby([bucket, "keyword_norm"]).size()
    ts = (
        per_bucket.reset_index(name="mentions")
        .sort_values([bucket, "mentions"], ascending=[True, False])
    )

    return top, ts