taylerErbe committed on
Commit
bb9abee
·
verified ·
1 Parent(s): fca8511

Upload 8 files

Browse files
.gitattributes ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ faiss_index.bin filter=lfs diff=lfs merge=lfs -text
2
+ features_with_allbilldata.parquet filter=lfs diff=lfs merge=lfs -text
3
+ metadata.parquet filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Igpa Legislation Explorer
3
+ emoji: 🚀
4
+ colorFrom: red
5
+ colorTo: red
6
+ sdk: streamlit
7
+ app_file: app.py
8
+ app_port: 8501
9
+ tags:
10
+ - streamlit
11
+ - faiss
12
+ - semantic-search
13
+ pinned: false
14
+ short_description: IGPA semantic search and exploration of legislation
15
+ ---
16
+
17
+ # IGPA Legislation Explorer
18
+
19
+ This Streamlit app lets you perform semantic search over a corpus of legislation using a FAISS index and sentence-transformers embeddings. It supports:
20
+
21
+ - Free-text queries over bill summaries
22
+ - Filtering by intended beneficiaries, policy domain, and impact rating
23
+ - Viewing bill summaries, key provisions, and similarity scores
24
+ - Downloading search results as CSV for further analysis
25
+
26
+ To customize or extend the app, edit `app.py` at the root of this Space.
app.py ADDED
@@ -0,0 +1,803 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import numpy as np
4
+ import pandas as pd
5
+ import faiss
6
+ import streamlit as st
7
+ import altair as alt
8
+ from sentence_transformers import SentenceTransformer
9
+ import csv
10
+ from datetime import datetime
11
+
12
+ #Config
13
+ DB_DIR = "."
14
+ FEEDBACK_CSV = os.path.join(DB_DIR, "impact_feedback.csv")
15
+ DEFAULT_TOP_K = 10
16
+
17
+ IMPACT_ORDER = [
18
+ "Not Impactful",
19
+ "Slightly Impactful",
20
+ "Moderately Impactful",
21
+ "Very Impactful"
22
+ ]
23
+
24
+ st.set_page_config(
25
+ page_title="IGPA Legislation Explorer",
26
+ layout="wide",
27
+ initial_sidebar_state="expanded"
28
+ )
29
+
30
+ #Loading vector database
31
@st.cache_resource
def load_vector_db(db_dir: str = DB_DIR):
    """Load the FAISS index, bill metadata and embedding model from ``db_dir``.

    Decorated with ``st.cache_resource`` so Streamlit can reuse the returned
    objects instead of reloading them.

    Args:
        db_dir: Directory containing ``config.json``, ``faiss_index.bin`` and
            ``metadata.parquet``.

    Returns:
        A tuple ``(index, meta, model, cfg)``: the FAISS index, the metadata
        DataFrame (with a ``vec_id`` column when the index was unnamed or
        the column already existed), the ``SentenceTransformer`` named by
        ``cfg["embedding_model_name"]``, and the parsed config dict.

    Raises:
        OSError: If ``config.json`` cannot be opened.
        KeyError: If the config has no ``embedding_model_name`` entry.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default text encoding when reading the config.
    with open(os.path.join(db_dir, "config.json"), "r", encoding="utf-8") as f:
        cfg = json.load(f)

    index = faiss.read_index(os.path.join(db_dir, "faiss_index.bin"))
    meta = pd.read_parquet(os.path.join(db_dir, "metadata.parquet"))

    if "vec_id" not in meta.columns:
        # reset_index() turns an unnamed index into an "index" column, which
        # is then exposed under the name "vec_id".
        meta = meta.reset_index().rename(columns={"index": "vec_id"})

    model = SentenceTransformer(cfg["embedding_model_name"])
    return index, meta, model, cfg
44
+
45
+ index, meta_df, embed_model, cfg = load_vector_db()
46
+
47
+ DATE_COL = "status_date_y"
48
+ meta_df[DATE_COL] = pd.to_datetime(
49
+ meta_df[DATE_COL],
50
+ errors="coerce"
51
+ )
52
+
53
+ DEFAULT_FILTERS = {
54
+ "intended_beneficiary": "All",
55
+ "policy_domain": "All",
56
+ "impact_selected": "All",
57
+ "category_main": "All",
58
+ "category_sub": "All",
59
+ "status_desc": "All",
60
+ "date_range": (
61
+ meta_df[DATE_COL].min().date(),
62
+ meta_df[DATE_COL].max().date()
63
+ )
64
+ }
65
+
66
+ for key, value in DEFAULT_FILTERS.items():
67
+ if key not in st.session_state:
68
+ st.session_state[key] = value
69
+
70
+ if "search_results" not in st.session_state:
71
+ st.session_state.search_results = None
72
+ if "current_query" not in st.session_state:
73
+ st.session_state.current_query = ""
74
+
75
def embed_query(query: str):
    """Encode ``query`` with the module-level ``embed_model``.

    The query is passed to the model as a one-element list, with normalized
    embeddings requested as a NumPy array; the result is cast to ``float32``
    before being returned.
    """
    encoded = embed_model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return encoded.astype("float32")
81
+
82
def impact_threshold(level):
    """Return ``level`` plus every rating listed after it in ``IMPACT_ORDER``.

    An unknown ``level`` yields an empty list.
    """
    try:
        first = IMPACT_ORDER.index(level)
    except ValueError:
        # level is not one of the known ratings
        return []
    return IMPACT_ORDER[first:]
86
+
87
def append_feedback_row(
    bill_id,
    predicted_impact,
    user_response,
    corrected_impact=None,
    path=FEEDBACK_CSV,
):
    """Append one impact-rating feedback record to the CSV at ``path``.

    Each record is ``timestamp, bill_id, predicted_impact, user_response,
    corrected_impact``. The column header is written first whenever the file
    does not exist yet or exists but is empty.

    Args:
        bill_id: Identifier of the bill the feedback refers to.
        predicted_impact: Impact rating that was shown to the user.
        user_response: The user's verdict on that rating.
        corrected_impact: Rating the user chose instead; stored as an empty
            string when falsy.
        path: Destination CSV file.

    Returns:
        None. Success is reported through ``st.sidebar.success`` and any
        failure through ``st.error``; exceptions are not propagated.
    """
    # Function-scope import: the module header only imports ``datetime``.
    from datetime import timezone

    header = [
        "timestamp",
        "bill_id",
        "predicted_impact",
        "user_response",
        "corrected_impact",
    ]

    try:
        # A zero-byte file needs the header just as much as a missing one;
        # otherwise the first data row would later be parsed as column names.
        needs_header = not os.path.isfile(path) or os.path.getsize(path) == 0

        # datetime.utcnow() is deprecated since Python 3.12. Dropping tzinfo
        # keeps the stored text identical to the previous naive-UTC format.
        timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

        with open(path, "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(header)
            writer.writerow(
                [
                    timestamp,
                    bill_id,
                    predicted_impact,
                    user_response,
                    corrected_impact or "",
                ]
            )

        st.sidebar.success(f"Feedback saved to: `{path}`")
    except Exception as e:
        # UI boundary: surface the problem to the user instead of crashing
        # the Streamlit script.
        st.error(f"Failed to save feedback: {str(e)}")
121
+
122
def build_filter_mask(df, intended_beneficiary, policy_domain, impact_selected):
    """Build a boolean row mask for ``df`` from the active filters.

    The beneficiary, policy-domain and impact filters come from the
    arguments; the category, sub-category, bill-status and date-range
    filters are read from ``st.session_state``. A filter set to ``"All"``
    is skipped.

    Args:
        df: Metadata DataFrame to filter.
        intended_beneficiary: Value to match in
            ``intended_beneficiaries_standardized``, or ``"All"``.
        policy_domain: Value to match in ``policy_domain_standardized``,
            or ``"All"``.
        impact_selected: Minimum impact rating, or ``"All"``. Rows are kept
            when their ``impact_rating_standardized`` is one of the values
            returned by ``impact_threshold``.

    Returns:
        A boolean ``pd.Series`` indexed like ``df``.
    """
    mask = pd.Series(True, index=df.index)

    if intended_beneficiary != "All":
        mask &= df["intended_beneficiaries_standardized"] == intended_beneficiary
    if policy_domain != "All":
        mask &= df["policy_domain_standardized"] == policy_domain
    if impact_selected != "All":
        allowed = impact_threshold(impact_selected)
        mask &= df["impact_rating_standardized"].isin(allowed)
    if st.session_state.category_main != "All":
        mask &= df["category_main_label"] == st.session_state.category_main
    if st.session_state.category_sub != "All":
        mask &= df["category_sub_label"] == st.session_state.category_sub
    if "status_desc" in st.session_state and st.session_state.status_desc != "All":
        mask &= df["status_desc"] == st.session_state.status_desc
    if "date_range" in st.session_state and st.session_state.date_range:
        dr = st.session_state.date_range

        if isinstance(dr, (tuple, list)):
            # A range date input can hold a single date while the selection
            # is still incomplete; unpack by position so a one-element
            # tuple is never passed on as if it were a date.
            start, end = dr[0], dr[-1]
        else:
            start = end = dr

        # NOTE(review): a range collapsed to one day is widened to the
        # latest date in df, i.e. treated as "from this day onwards".
        if end == start:
            end = df[DATE_COL].max().date()

        start = pd.to_datetime(start)
        end = pd.to_datetime(end)

        mask &= df[DATE_COL].between(start, end)
    return mask
153
+
154
def get_sorted_filter_options(df, col_name):
    """Return the select-box options for column ``col_name`` of ``df``.

    The list starts with ``"All"`` and continues with the column's non-null
    values in the order produced by ``value_counts()``.
    """
    frequencies = df[col_name].dropna().value_counts()
    options = ["All"]
    options.extend(frequencies.index.tolist())
    return options
158
+
159
def reset_filters():
    """Restore every filter in ``st.session_state`` to its default and rerun.

    Each key of ``DEFAULT_FILTERS`` is written back to the session state,
    then ``st.rerun()`` is called.
    """
    for name in DEFAULT_FILTERS:
        st.session_state[name] = DEFAULT_FILTERS[name]
    st.rerun()
163
+
164
+ #Filters
165
+ with st.sidebar:
166
+ st.header("Filters")
167
+ if "history" not in st.session_state:
168
+ st.session_state.history = []
169
+ if st.button("Reset Filters"):
170
+ reset_filters()
171
+
172
+ intended_beneficiary = st.selectbox(
173
+ "Intended Beneficiary",
174
+ get_sorted_filter_options(meta_df, "intended_beneficiaries_standardized"),
175
+ key="intended_beneficiary"
176
+ )
177
+
178
+ policy_domain = st.selectbox(
179
+ "Policy Area",
180
+ get_sorted_filter_options(meta_df, "policy_domain_standardized"),
181
+ key="policy_domain"
182
+ )
183
+
184
+ impact_selected = st.selectbox(
185
+ "Impact Rating (≥ Selected Level)",
186
+ ["All"] + IMPACT_ORDER,
187
+ key="impact_selected"
188
+ )
189
+
190
+ category_main = st.selectbox(
191
+ "Category",
192
+ get_sorted_filter_options(meta_df, "category_main_label"),
193
+ key="category_main"
194
+ )
195
+
196
+ category_sub = st.selectbox(
197
+ "Sub Category",
198
+ get_sorted_filter_options(meta_df, "category_sub_label"),
199
+ key="category_sub"
200
+ )
201
+
202
+ top_k = st.slider("Number of results", 5, 50, DEFAULT_TOP_K, 5)
203
+
204
+ status_desc = st.selectbox(
205
+ "Bill Status",
206
+ ["All"] + sorted(meta_df["status_desc"].dropna().unique().tolist()),
207
+ key="status_desc"
208
+ )
209
+
210
+ st.subheader("Time Filter")
211
+
212
+ min_date = meta_df[DATE_COL].min().date()
213
+ max_date = meta_df[DATE_COL].max().date()
214
+
215
+ default_value = st.session_state.get("date_range", (min_date, max_date))
216
+
217
+ if isinstance(default_value, (tuple, list)):
218
+ if len(default_value) == 2:
219
+ start, end = default_value
220
+ else:
221
+ start = end = default_value[0]
222
+ else:
223
+ start = end = default_value
224
+
225
+ st.date_input(
226
+ "Status Date Range",
227
+ value=(start, end),
228
+ min_value=min_date,
229
+ max_value=max_date,
230
+ key="date_range"
231
+ )
232
+
233
+ if os.path.exists(FEEDBACK_CSV):
234
+ try:
235
+ df_feedback = pd.read_csv(FEEDBACK_CSV)
236
+ st.info(f" Feedback records: {len(df_feedback)}")
237
+ if st.button(" Download Feedback CSV"):
238
+ st.download_button(
239
+ label="Download impact_feedback.csv",
240
+ data=open(FEEDBACK_CSV, 'rb').read(),
241
+ file_name="impact_feedback.csv",
242
+ mime="text/csv"
243
+ )
244
+ except:
245
+ st.info("Feedback CSV ready (empty)")
246
+
247
+ filtered_df = meta_df[
248
+ build_filter_mask(
249
+ meta_df,
250
+ st.session_state.intended_beneficiary,
251
+ st.session_state.policy_domain,
252
+ st.session_state.impact_selected
253
+ )
254
+ ]
255
+
256
+ tab_search, tab_trends = st.tabs(["Search & Results", "Trends & Insights"])
257
+
258
+ #Search Tab
259
+ with tab_search:
260
+ st.title("IGPA Legislation Explorer")
261
+
262
+ #Overview
263
+ col1, col2, col3, col4 = st.columns(4)
264
+
265
+ with col1:
266
+ st.metric("Total Bills", len(filtered_df))
267
+
268
+ with col2:
269
+ st.metric(
270
+ "Policy Domains",
271
+ filtered_df["policy_domain_standardized"].nunique()
272
+ )
273
+
274
+ with col3:
275
+ st.metric(
276
+ "Beneficiary Groups",
277
+ filtered_df["intended_beneficiaries_standardized"].nunique()
278
+ )
279
+
280
+ with col4:
281
+ impact_counts = (
282
+ filtered_df["impact_rating_standardized"]
283
+ .dropna()
284
+ .value_counts()
285
+ .reindex(IMPACT_ORDER, fill_value=0)
286
+ )
287
+ st.metric("Impact Breakdown", len(filtered_df))
288
+ st.markdown(
289
+ f"<div style='font-size:12px; color:#6b7280;'>"
290
+ f"Very Impactful: <b>{impact_counts['Very Impactful']}</b> | "
291
+ f"Moderately: <b>{impact_counts['Moderately Impactful']}</b> | "
292
+ f"Slightly: <b>{impact_counts['Slightly Impactful']}</b> | "
293
+ f"Not: <b>{impact_counts['Not Impactful']}</b>"
294
+ f"</div>",
295
+ unsafe_allow_html=True
296
+ )
297
+
298
+ #Most Impacted Beneficiary Categories
299
+ st.subheader("Most Impacted Beneficiary Categories")
300
+
301
+ impact_df = (
302
+ filtered_df.dropna(subset=["beneficiary_category", "impact_rating_score"])
303
+ .groupby("beneficiary_category")
304
+ .agg(
305
+ avg_impact=("impact_rating_score", "mean"),
306
+ bills=("bill_id","count"),
307
+ top_bills=("title", lambda x: "; ".join(x.head(5))),
308
+ top_beneficiaries=("intended_beneficiaries_standardized", lambda x: ", ".join(x.value_counts().head(3).index))
309
+ )
310
+ .reset_index()
311
+ .sort_values("avg_impact", ascending=False)
312
+ .head(10)
313
+ )
314
+
315
+ if not impact_df.empty:
316
+ st.altair_chart(
317
+ alt.Chart(impact_df)
318
+ .mark_bar()
319
+ .encode(
320
+ x=alt.X("beneficiary_category:N", sort="-y", title="Beneficiary Category"),
321
+ y=alt.Y("avg_impact:Q", title="Average Impact Score"),
322
+ color=alt.Color(
323
+ "avg_impact:Q",
324
+ scale=alt.Scale(domain=[0,4], range=["#FFF176","#E53935"]),
325
+ legend=alt.Legend(title="Impact Severity")
326
+ ),
327
+ tooltip=[
328
+ alt.Tooltip("beneficiary_category:N", title="Beneficiary"),
329
+ alt.Tooltip("avg_impact:Q", format=".2f", title="Average Impact"),
330
+ alt.Tooltip("bills:Q", title="Number of Bills"),
331
+ alt.Tooltip("top_bills:N", title="Top Bills"),
332
+ alt.Tooltip("top_beneficiaries:N", title="Top Beneficiaries")
333
+ ]
334
+ )
335
+ .properties(height=350),
336
+ use_container_width=True
337
+ )
338
+
339
+ # Bills from Filters
340
+ st.subheader("Bills Matching Selected Filters")
341
+
342
+ display_cols = {
343
+ "bill_number": "Bill Number",
344
+ "title": "Title",
345
+ "description": "Description",
346
+ "policy_domain_standardized": "Policy Domain",
347
+ "category_main_label": "Category",
348
+ "intent_standardized": "Intent",
349
+ "legislative_goal_standardized": "Legislative Goal",
350
+ "beneficiary_category": "Beneficiary Group",
351
+ "intended_beneficiaries_standardized": "Intended Beneficiaries",
352
+ "potential_impact_raw": "Potential Impact",
353
+ "impact_rating_standardized": "Impact Rating",
354
+ "status_desc": "Status",
355
+ "full_text_url": "Bill Link"
356
+ }
357
+
358
+ available_cols = {k: v for k, v in display_cols.items() if k in filtered_df.columns}
359
+
360
+ filter_bill_df = (
361
+ filtered_df[list(available_cols.keys())]
362
+ .rename(columns=available_cols)
363
+ .copy()
364
+ )
365
+
366
+ st.dataframe(
367
+ filter_bill_df,
368
+ use_container_width=True,
369
+ column_config={
370
+ "Bill Link": st.column_config.LinkColumn(
371
+ label="Bill Link",
372
+ display_text="Open Bill"
373
+ )
374
+ }
375
+ )
376
+
377
+ st.markdown("---")
378
+
379
+ #Search Bills
380
+ st.subheader("Search Bills")
381
+ query = st.text_area(
382
+ "Ask a question about legislation",
383
+ value=st.session_state.current_query,
384
+ height=80,
385
+ placeholder="Example: bills related to funding",
386
+ key="search_query_input"
387
+ )
388
+
389
+ search_clicked = st.button("Search", key="search_button")
390
+
391
+ if search_clicked and query.strip():
392
+ st.session_state.current_query = query
393
+ st.session_state.history.append({"query": query})
394
+
395
+ q_vec = embed_query(query)
396
+ n_search = min(len(meta_df), top_k*5)
397
+ scores, ids = index.search(q_vec, n_search)
398
+ ids, scores = ids[0], scores[0]
399
+
400
+ allowed = set(filtered_df.index)
401
+ kept = [(i,s) for i,s in zip(ids,scores) if i in allowed][:top_k]
402
+
403
+ if not kept:
404
+ st.warning("No results found.")
405
+ st.session_state.search_results = None
406
+ else:
407
+ results = meta_df.loc[[i for i,_ in kept]].copy()
408
+ results["similarity"] = [s for _,s in kept]
409
+ st.session_state.search_results = results
410
+
411
+ if st.session_state.search_results is not None:
412
+ results = st.session_state.search_results
413
+
414
+ #Filtered Results Table
415
+ st.subheader("Filtered Results Table")
416
+ review_cols = [
417
+ "bill_number",
418
+ "title",
419
+ "description",
420
+ "potential_impact_raw",
421
+ "increasing_aspects_standardized",
422
+ "decreasing_aspects_standardized",
423
+ "similarity",
424
+ "full_text_url"
425
+ ]
426
+
427
+ review_df = results[[c for c in review_cols if c in results.columns]].copy()
428
+
429
+ review_df.rename(
430
+ columns={
431
+ "bill_number": "Bill Number",
432
+ "title": "Title",
433
+ "description": "Description",
434
+ "potential_impact_raw": "Potential Impact",
435
+ "increasing_aspects_standardized": "Increasing Aspects",
436
+ "decreasing_aspects_standardized": "Decreasing Aspects",
437
+ "similarity": "Score",
438
+ "full_text_url": "Bill URL"
439
+ },
440
+ inplace=True
441
+ )
442
+
443
+ st.dataframe(
444
+ review_df,
445
+ use_container_width=True,
446
+ column_config={
447
+ "Bill URL": st.column_config.LinkColumn(
448
+ "ILGA URL",
449
+ display_text="Open bill"
450
+ )
451
+ }
452
+ )
453
+
454
+ st.markdown("---")
455
+
456
+ st.subheader("Filtered Results")
457
+ for idx, row in results.iterrows():
458
+ with st.container():
459
+ st.markdown(f"### Bill Number: {row['bill_number']}")
460
+ st.markdown(f"**Title:** {row['title']}")
461
+ st.write(row["description"])
462
+
463
+ if pd.notna(row.get("category_main_label")):
464
+ st.write(f"**Main Category**: {row['category_main_label']}")
465
+
466
+ if pd.notna(row.get("category_sub_label")):
467
+ st.write(f"**Sub Category**: {row['category_sub_label']}")
468
+
469
+ if pd.notna(row.get("llama_summary_raw")):
470
+ st.markdown(f"**LLaMA Summary:** {row['llama_summary_raw']}")
471
+
472
+ info_text = (
473
+ f"Session: {row.get('session','')} • "
474
+ f"Chamber: {row.get('chamber','')} • "
475
+ f"Impact: {row.get('impact_rating_standardized','')} • "
476
+ f"Beneficiaries: {row.get('intended_beneficiaries_standardized','')} • "
477
+ f"Domain: {row.get('policy_domain_standardized','')} • "
478
+ f"Similarity: {row.get('similarity'):.3f}"
479
+ )
480
+ st.caption(info_text)
481
+
482
+ if pd.notna(row.get("full_text_url")):
483
+ st.markdown(f"[🔗 View Full Bill]({row['full_text_url']})", unsafe_allow_html=True)
484
+
485
+ std_cols = [
486
+ c for c in results.columns
487
+ if c.endswith("_standardized") and c not in [
488
+ "impact_rating_standardized",
489
+ "increasing_aspects_standardized",
490
+ "decreasing_aspects_standardized",
491
+ "original_law_standardized"
492
+ ]
493
+ ]
494
+
495
+ with st.expander("More Details"):
496
+ for c in std_cols:
497
+ val = row.get(c)
498
+ if pd.notna(val) and str(val).strip():
499
+ label = c.replace("_standardized","").replace("_"," ").title()
500
+ st.write(f"**{label}**: {val}")
501
+
502
+ with st.expander("Similar Bills"):
503
+ sim_df = results.iloc[:5][
504
+ ["bill_number","title","description","full_text_url"]
505
+ ].copy()
506
+ st.dataframe(
507
+ sim_df,
508
+ use_container_width=True,
509
+ column_config={
510
+ "full_text_url": st.column_config.LinkColumn(
511
+ "Bill Link",
512
+ display_text="Open"
513
+ )
514
+ }
515
+ )
516
+
517
+ #Impact rating feedback
518
+ with st.expander("👍👎 Rate Impact Accuracy", expanded=False):
519
+ st.markdown("**Is this impact rating accurate?**")
520
+ predicted_impact = row.get("impact_rating_standardized", "")
521
+ bill_id_safe = str(row.get('bill_id', idx))
522
+
523
+ # Check if feedback was already submitted for this bill
524
+ feedback_submitted = st.session_state.get(f"feedback_done_{bill_id_safe}", False)
525
+
526
+ if feedback_submitted:
527
+ st.success("Thank you for your feedback!")
528
+ st.caption(f"Bill: {row.get('bill_number', 'N/A')} | Saved to impact_feedback.csv")
529
+ else:
530
+ col1, col2 = st.columns(2)
531
+ with col1:
532
+ if st.button("👍 **Yes - Accurate**", key=f"yes_{bill_id_safe}", use_container_width=True):
533
+ append_feedback_row(
534
+ bill_id=bill_id_safe,
535
+ predicted_impact=predicted_impact,
536
+ user_response="Yes",
537
+ corrected_impact=None,
538
+ )
539
+ st.session_state[f"feedback_done_{bill_id_safe}"] = True
540
+ st.sidebar.success(f"Feedback saved for {row.get('bill_number', bill_id_safe)}")
541
+ st.rerun()
542
+
543
+ with col2:
544
+ if st.button("👎 **No - Incorrect**", key=f"no_{bill_id_safe}", use_container_width=True):
545
+ st.session_state[f"show_corrected_{bill_id_safe}"] = True
546
+ st.rerun()
547
+
548
+ if st.session_state.get(f"show_corrected_{bill_id_safe}", False):
549
+ st.info(f"**What should the impact rating be instead?**")
550
+ corrected_value = st.selectbox(
551
+ "**Correct impact rating**",
552
+ IMPACT_ORDER,
553
+ key=f"corrected_{bill_id_safe}",
554
+ )
555
+
556
+ col_submit, col_cancel = st.columns([3, 1])
557
+ with col_submit:
558
+ if st.button("**Submit Feedback**", key=f"submit_{bill_id_safe}", type="primary"):
559
+ append_feedback_row(
560
+ bill_id=bill_id_safe,
561
+ predicted_impact=predicted_impact,
562
+ user_response="No",
563
+ corrected_impact=corrected_value,
564
+ )
565
+ st.session_state[f"feedback_done_{bill_id_safe}"] = True
566
+ st.session_state[f"show_corrected_{bill_id_safe}"] = False
567
+ st.sidebar.success(f"Feedback saved for {row.get('bill_number', bill_id_safe)}")
568
+ st.rerun()
569
+ with col_cancel:
570
+ if st.button("Cancel", key=f"cancel_{bill_id_safe}"):
571
+ st.session_state[f"show_corrected_{bill_id_safe}"] = False
572
+ st.rerun()
573
+
574
+ #Search History
575
+ with st.sidebar.expander("Search History"):
576
+ for i,item in enumerate(reversed(st.session_state.history[-5:]),1):
577
+ st.write(f"{i}. {item.get('query','')}")
578
+
579
+
580
+ # TRENDS TAB
581
+ with tab_trends:
582
+ st.subheader("Trends & Insights")
583
+
584
+ # Key Insights
585
+ top_policy = filtered_df["policy_domain_standardized"].value_counts().head(1)
586
+ top_beneficiaries = filtered_df["beneficiary_category"].value_counts().head(1)
587
+ strategy_impact = (
588
+ filtered_df[filtered_df["impact_rating_standardized"].notna()]
589
+ .groupby("legislative_strategy_standardized")["impact_rating_standardized"]
590
+ .apply(lambda x: (x=="Very Impactful").sum())
591
+ )
592
+ avg_impact_ben = (
593
+ filtered_df.dropna(subset=["impact_rating_score"])
594
+ .groupby("beneficiary_category")["impact_rating_score"]
595
+ .mean()
596
+ .sort_values(ascending=False)
597
+ )
598
+
599
+ total_bills = len(filtered_df)
600
+ total_high_impact = (filtered_df["impact_rating_standardized"]=="Very Impactful").sum()
601
+
602
+ st.markdown("### Key Insights")
603
+ st.write(f"**Total Bills Considered:** {total_bills}")
604
+ st.write(f"**Total Very Impactful Bills:** {total_high_impact}")
605
+ st.write(f"**Most Active Policy Domain:** {top_policy.index[0]} ({top_policy.iloc[0]} bills)" if not top_policy.empty else "No data")
606
+ st.write(f"**Most Benefited Group:** {top_beneficiaries.index[0]} ({top_beneficiaries.iloc[0]} bills)" if not top_beneficiaries.empty else "No data")
607
+ st.write(f"**Strategy Producing Most Very Impactful Bills:** {strategy_impact.idxmax() if not strategy_impact.empty else 'N/A'}")
608
+ st.write(f"**Highest Average Impact (Beneficiary):** {avg_impact_ben.index[0]} ({avg_impact_ben.iloc[0]:.2f})" if not avg_impact_ben.empty else "N/A")
609
+ st.markdown("---")
610
+
611
+ col1, col2 = st.columns(2)
612
+
613
+ # Policy Domain
614
+ with col1:
615
+ st.markdown("### Policy Domain Activity")
616
+ policy_agg = (
617
+ filtered_df.groupby("policy_domain_standardized")
618
+ .agg(
619
+ Count=("bill_id","count"),
620
+ avg_impact=("impact_rating_score","mean"),
621
+ top_bills=("title", lambda x: "; ".join(x.head(5))),
622
+ top_beneficiaries=("intended_beneficiaries_standardized", lambda x: ", ".join(x.value_counts().head(3).index)),
623
+ recent_date=("status_date_y", lambda x: x.max().strftime("%Y-%m-%d")),
624
+ bill_numbers=("bill_number", lambda x: ", ".join(map(str, x.head(5))))
625
+ )
626
+ .reset_index()
627
+ .rename(columns={"policy_domain_standardized":"Policy Domain"})
628
+ )
629
+ policy_chart = (
630
+ alt.Chart(policy_agg)
631
+ .mark_bar()
632
+ .encode(
633
+ x=alt.X("Policy Domain:N", sort="-y", title="Policy Domain"),
634
+ y=alt.Y("Count:Q", title="Number of Bills"),
635
+ color=alt.Color("Count:Q", scale=alt.Scale(scheme="reds"), legend=None),
636
+ tooltip=[
637
+ alt.Tooltip("Policy Domain:N"),
638
+ alt.Tooltip("Count:Q", title="Number of Bills"),
639
+ alt.Tooltip("avg_impact:Q", format=".2f", title="Average Impact"),
640
+ alt.Tooltip("top_bills:N", title="Top Bills"),
641
+ alt.Tooltip("top_beneficiaries:N", title="Top Beneficiaries"),
642
+ alt.Tooltip("recent_date:N", title="Most Recent Bill"),
643
+ alt.Tooltip("bill_numbers:N", title="Bill Numbers")
644
+ ]
645
+ )
646
+ .properties(height=400)
647
+ )
648
+ st.altair_chart(policy_chart, use_container_width=True)
649
+
650
+ # Impact Distribution
651
+ with col2:
652
+ st.markdown("### Impact Distribution")
653
+ impact_dist = (
654
+ filtered_df[filtered_df["impact_rating_standardized"].notna()]["impact_rating_standardized"]
655
+ .value_counts()
656
+ .reindex(IMPACT_ORDER, fill_value=0)
657
+ .reset_index()
658
+ )
659
+ impact_dist.columns = ["Impact Level", "Count"]
660
+
661
+ impact_chart = (
662
+ alt.Chart(impact_dist)
663
+ .mark_bar()
664
+ .encode(
665
+ x=alt.X("Impact Level:N", sort=IMPACT_ORDER),
666
+ y=alt.Y("Count:Q"),
667
+ color=alt.Color("Count:Q", scale=alt.Scale(scheme="reds")),
668
+ tooltip=[
669
+ alt.Tooltip("Impact Level:N"),
670
+ alt.Tooltip("Count:Q")
671
+ ]
672
+ )
673
+ .properties(height=300)
674
+ )
675
+ st.altair_chart(impact_chart, use_container_width=True)
676
+
677
+ # Strategy High Impact
678
+ st.markdown("### Legislative Strategy: Very Impactful Bills")
679
+ strategy_high_impact = (
680
+ filtered_df[filtered_df["impact_rating_standardized"].notna()]
681
+ .groupby("legislative_strategy_standardized")
682
+ .agg(
683
+ Very_Impactful_Bills=("impact_rating_standardized", lambda x: (x=="Very Impactful").sum()),
684
+ top_bills=("title", lambda x: "; ".join(x.head(5))),
685
+ top_beneficiaries=("intended_beneficiaries_standardized", lambda x: ", ".join(x.value_counts().head(3).index)),
686
+ recent_date=("status_date_y", lambda x: x.max().strftime("%Y-%m-%d"))
687
+ )
688
+ .reset_index()
689
+ .rename(columns={"legislative_strategy_standardized":"Strategy"})
690
+ )
691
+
692
+ strategy_chart = (
693
+ alt.Chart(strategy_high_impact)
694
+ .mark_bar()
695
+ .encode(
696
+ x=alt.X("Strategy:N", sort="-y", title="Strategy"),
697
+ y=alt.Y("Very_Impactful_Bills:Q", title="Very Impactful Bills"),
698
+ color=alt.Color("Very_Impactful_Bills:Q", scale=alt.Scale(scheme="orangered")),
699
+ tooltip=[
700
+ alt.Tooltip("Strategy:N"),
701
+ alt.Tooltip("Very_Impactful_Bills:Q"),
702
+ alt.Tooltip("top_bills:N", title="Top Bills"),
703
+ alt.Tooltip("top_beneficiaries:N", title="Top Beneficiaries"),
704
+ alt.Tooltip("recent_date:N", title="Most Recent Bill")
705
+ ]
706
+ )
707
+ .properties(height=400)
708
+ )
709
+
710
+ st.altair_chart(strategy_chart, use_container_width=True)
711
+
712
+ # Impact by Category
713
+ st.markdown("### Impact by Category")
714
+ impact_cat = (
715
+ filtered_df[
716
+ filtered_df["impact_rating_standardized"].notna() &
717
+ filtered_df["category_main_label"].notna()
718
+ ]
719
+ .groupby(["category_main_label", "impact_rating_standardized"])
720
+ .agg(
721
+ Count=("bill_id","count"),
722
+ avg_impact=("impact_rating_score","mean"),
723
+ top_bills=("title", lambda x: "; ".join(x.head(5))),
724
+ top_beneficiaries=("intended_beneficiaries_standardized", lambda x: ", ".join(x.value_counts().head(3).index)),
725
+ recent_date=("status_date_y", lambda x: x.max().strftime("%Y-%m-%d")),
726
+ bill_numbers=("bill_number", lambda x: ", ".join(map(str, x.head(5))))
727
+ )
728
+ .reset_index()
729
+ )
730
+
731
+ if impact_cat.empty:
732
+ st.write("No data available for impact by category.")
733
+ else:
734
+ top_categories = (
735
+ impact_cat.groupby("category_main_label")["Count"]
736
+ .sum()
737
+ .sort_values(ascending=False)
738
+ .head(15)
739
+ .index.tolist()
740
+ )
741
+ impact_cat_top = impact_cat[impact_cat["category_main_label"].isin(top_categories)]
742
+
743
+ impact_cat_chart = (
744
+ alt.Chart(impact_cat_top)
745
+ .mark_bar()
746
+ .encode(
747
+ y=alt.Y("category_main_label:N", sort=top_categories, title="Category"),
748
+ x=alt.X("Count:Q", stack="zero", title="Number of Bills"),
749
+ color=alt.Color("impact_rating_standardized:N", sort=IMPACT_ORDER, scale=alt.Scale(scheme="reds"), title="Impact Rating"),
750
+ tooltip=[
751
+ alt.Tooltip("category_main_label:N", title="Category"),
752
+ alt.Tooltip("impact_rating_standardized:N", title="Impact Rating"),
753
+ alt.Tooltip("Count:Q", title="Number of Bills"),
754
+ alt.Tooltip("avg_impact:Q", format=".2f", title="Average Impact"),
755
+ alt.Tooltip("top_bills:N", title="Top Bills"),
756
+ alt.Tooltip("top_beneficiaries:N", title="Top Beneficiaries"),
757
+ alt.Tooltip("recent_date:N", title="Most Recent Bill"),
758
+ alt.Tooltip("bill_numbers:N", title="Bill Numbers")
759
+ ]
760
+ )
761
+ .properties(height=400)
762
+ )
763
+
764
+ st.altair_chart(impact_cat_chart, use_container_width=True)
765
+
766
+ # Beneficiary Treemap
767
+ st.markdown("### Beneficiary Coverage & Average Impact")
768
+ ben_treemap_df = (
769
+ filtered_df.dropna(subset=["beneficiary_category", "impact_rating_score"])
770
+ .groupby("beneficiary_category")
771
+ .agg(
772
+ total_bills=("bill_id","count"),
773
+ avg_impact=("impact_rating_score","mean"),
774
+ top_bills=("title", lambda x: "; ".join(x.head(5))),
775
+ recent_date=("status_date_y", lambda x: x.max().strftime("%Y-%m-%d")),
776
+ bill_numbers=("bill_number", lambda x: ", ".join(map(str, x.head(5))))
777
+ )
778
+ .reset_index()
779
+ )
780
+
781
+ if not ben_treemap_df.empty:
782
+ treemap = (
783
+ alt.Chart(ben_treemap_df)
784
+ .mark_rect()
785
+ .encode(
786
+ x=alt.X("total_bills:Q", title="Number of Bills"),
787
+ y=alt.Y("beneficiary_category:N", sort="-x", title="Beneficiary Category"),
788
+ size="total_bills:Q",
789
+ color=alt.Color("avg_impact:Q", scale=alt.Scale(domain=[0,4], range=["#FFF176","#E53935"]), legend=alt.Legend(title="Average Impact Score")),
790
+ tooltip=[
791
+ alt.Tooltip("beneficiary_category:N", title="Beneficiary"),
792
+ alt.Tooltip("total_bills:Q", title="Number of Bills"),
793
+ alt.Tooltip("avg_impact:Q", format=".2f", title="Average Impact"),
794
+ alt.Tooltip("top_bills:N", title="Top Bills"),
795
+ alt.Tooltip("recent_date:N", title="Most Recent Bill"),
796
+ alt.Tooltip("bill_numbers:N", title="Bill Numbers")
797
+ ]
798
+ )
799
+ .properties(height=400)
800
+ )
801
+ st.altair_chart(treemap, use_container_width=True)
802
+ else:
803
+ st.write("No beneficiary impact data available for selected filters.")
config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "embedding_model_name": "all-MiniLM-L6-v2",
3
+ "text_column": "llama_summary_raw",
4
+ "id_column": "bill_id",
5
+ "embedding_dimension": 384,
6
+ "top_k_default": 10
7
+ }
faiss_index.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ada3c96d32ced3e164cd9492ac0c0173aee511d7145dad24d3ca17f2709e44c0
3
+ size 7475757
features_with_allbilldata.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06ae4c4e38fdc149676ee7504d9bdf100786b1583f68a267b56f9b616951550c
3
+ size 23351162
gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
metadata.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3f5a5bfeec0216df7642ccee3a4ec410277a554daa369cf11c58099490fdfe7
3
+ size 23370903
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ faiss-cpu
5
+ sentence-transformers
6
+ pyarrow