Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -148,6 +148,157 @@
|
|
| 148 |
# st.subheader("RAG-Generated Overall Summary")
|
| 149 |
# summary = rag_summarize(collected, summarizer)
|
| 150 |
# st.success(summary)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
import streamlit as st
|
| 152 |
import pandas as pd
|
| 153 |
import re
|
|
@@ -212,6 +363,27 @@ def extract_month_year(q):
|
|
| 212 |
yr = int(ym.group()) if ym else None
|
| 213 |
return mon, yr
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
def extract_topic_match(query, df):
|
| 216 |
query_lower = query.lower()
|
| 217 |
return df[
|
|
@@ -228,21 +400,27 @@ st.markdown("Ask about trends in topics like higher education, funding, etc.")
|
|
| 228 |
df = load_data()
|
| 229 |
embed_model, summarizer = load_models()
|
| 230 |
|
| 231 |
-
query = st.text_input("Ask a question (e.g., ‘Trends
|
| 232 |
|
| 233 |
if query:
|
| 234 |
-
|
| 235 |
df2 = extract_topic_match(query, df)
|
| 236 |
|
| 237 |
if df2.empty:
|
| 238 |
df2 = df
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
|
| 247 |
if df2.empty:
|
| 248 |
st.warning("No matching records found.")
|
|
@@ -257,15 +435,15 @@ if query:
|
|
| 257 |
st.subheader("Top Matching Insights")
|
| 258 |
collected = []
|
| 259 |
|
| 260 |
-
for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: #
|
| 261 |
row = df2.iloc[idx]
|
| 262 |
date = row['status_date'].date()
|
| 263 |
bill_number = row['bill_number']
|
| 264 |
full_url = row['url']
|
| 265 |
cat = row['Category & Subcategory']
|
| 266 |
cat_std = row['category_&_subcategory_standardized2']
|
| 267 |
-
bene= row['Intended Beneficiaries']
|
| 268 |
-
bene_std= row['intended_beneficiaries_standardized2']
|
| 269 |
goal = row['Legislative Goal']
|
| 270 |
impact = row['Policy Impact Areas']
|
| 271 |
provision = row['Key Provisions']
|
|
@@ -273,21 +451,20 @@ if query:
|
|
| 273 |
stance = row['Stance']
|
| 274 |
description = row['description']
|
| 275 |
summary = row['summary']
|
| 276 |
-
|
| 277 |
trend = clean_text(row['llama_trend_summary'])
|
| 278 |
insight = clean_text(row['llama_insight'])
|
| 279 |
|
| 280 |
st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
|
| 281 |
st.markdown(f"**Category:** {cat}")
|
| 282 |
-
|
| 283 |
st.markdown(f"**Intended Beneficiaries:** {bene}")
|
| 284 |
-
|
| 285 |
st.markdown(f"**Goal:** {goal}")
|
| 286 |
st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
|
| 287 |
st.markdown(f"**Policy Impact Area:** {impact}")
|
| 288 |
st.markdown(f"**Key Provision:** {provision}")
|
| 289 |
st.markdown(f"**Description:** {description}")
|
| 290 |
-
|
| 291 |
st.markdown(f"**Trend Summary:** {trend}")
|
| 292 |
st.markdown(f"**Actionable Insight:** {insight}")
|
| 293 |
st.markdown(f"[View Full Bill Text]({full_url})\n")
|
|
@@ -299,4 +476,3 @@ if query:
|
|
| 299 |
summary = rag_summarize(collected, summarizer)
|
| 300 |
st.success(summary)
|
| 301 |
|
| 302 |
-
|
|
|
|
| 148 |
# st.subheader("RAG-Generated Overall Summary")
|
| 149 |
# summary = rag_summarize(collected, summarizer)
|
| 150 |
# st.success(summary)
|
| 151 |
+
# import streamlit as st
|
| 152 |
+
# import pandas as pd
|
| 153 |
+
# import re
|
| 154 |
+
# from sentence_transformers import SentenceTransformer
|
| 155 |
+
# from transformers import pipeline
|
| 156 |
+
# from sklearn.metrics.pairwise import cosine_similarity
|
| 157 |
+
# from sklearn.feature_extraction.text import TfidfVectorizer
|
| 158 |
+
# from datetime import datetime
|
| 159 |
+
|
| 160 |
+
# def clean_text(text):
|
| 161 |
+
# text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
|
| 162 |
+
# text = re.sub(r"(?i)let me know if you'd like.*", "", text)
|
| 163 |
+
# text = re.sub(r"(?i)trend summary[:]*", "", text)
|
| 164 |
+
# text = re.sub(r"(?i)actionable insight[:]*", "", text)
|
| 165 |
+
# return text.strip()
|
| 166 |
+
|
| 167 |
+
# @st.cache_data
|
| 168 |
+
# def load_data():
|
| 169 |
+
# df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
|
| 170 |
+
# df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
|
| 171 |
+
# df = df.dropna(subset=['status_date'])
|
| 172 |
+
# df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
|
| 173 |
+
# df["llama_insight"] = df["llama_insight"].fillna("")
|
| 174 |
+
# df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
|
| 175 |
+
# return df
|
| 176 |
+
|
| 177 |
+
# @st.cache_resource
|
| 178 |
+
# def load_models():
|
| 179 |
+
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 180 |
+
# summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
|
| 181 |
+
# return embed_model, summarizer
|
| 182 |
+
|
| 183 |
+
# @st.cache_data
|
| 184 |
+
# def compute_embeddings(texts, _model):
|
| 185 |
+
# return _model.encode(texts, show_progress_bar=True)
|
| 186 |
+
|
| 187 |
+
# def semantic_search(query, embeddings, model, threshold=0.5):
|
| 188 |
+
# query_embedding = model.encode([query])
|
| 189 |
+
# sims = cosine_similarity(query_embedding, embeddings)[0]
|
| 190 |
+
# return [(i, s) for i, s in enumerate(sims) if s > threshold]
|
| 191 |
+
|
| 192 |
+
# def rag_summarize(texts, summarizer, top_k=10): # increased from 5 to 10
|
| 193 |
+
# if not texts:
|
| 194 |
+
# return "No relevant content to summarize."
|
| 195 |
+
# vect = TfidfVectorizer()
|
| 196 |
+
# m = vect.fit_transform(texts)
|
| 197 |
+
# mean_vec = m.mean(axis=0).A
|
| 198 |
+
# scores = cosine_similarity(mean_vec, m).flatten()
|
| 199 |
+
# top_indices = scores.argsort()[::-1][:top_k]
|
| 200 |
+
# ctx = "\n".join(texts[i] for i in top_indices)
|
| 201 |
+
# prompt = "summarize: " + ctx[:1024]
|
| 202 |
+
# out = summarizer(prompt, max_length=150, min_length=80, do_sample=False) # updated length
|
| 203 |
+
# return out[0]['summary_text']
|
| 204 |
+
|
| 205 |
+
# def extract_month_year(q):
|
| 206 |
+
# month_map = {m: i for i, m in enumerate(
|
| 207 |
+
# ["january", "february", "march", "april", "may", "june",
|
| 208 |
+
# "july", "august", "september", "october", "november", "december"], 1)}
|
| 209 |
+
# ql = q.lower()
|
| 210 |
+
# mon = next((v for k, v in month_map.items() if k in ql), None)
|
| 211 |
+
# ym = re.search(r"(19|20)\d{2}", q)
|
| 212 |
+
# yr = int(ym.group()) if ym else None
|
| 213 |
+
# return mon, yr
|
| 214 |
+
|
| 215 |
+
# def extract_topic_match(query, df):
|
| 216 |
+
# query_lower = query.lower()
|
| 217 |
+
# return df[
|
| 218 |
+
# df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
|
| 219 |
+
# df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
|
| 220 |
+
# df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
|
| 221 |
+
# df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
|
| 222 |
+
# ]
|
| 223 |
+
|
| 224 |
+
# st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
|
| 225 |
+
# st.title("Illinois Legislative Trends Q&A")
|
| 226 |
+
# st.markdown("Ask about trends in topics like higher education, funding, etc.")
|
| 227 |
+
|
| 228 |
+
# df = load_data()
|
| 229 |
+
# embed_model, summarizer = load_models()
|
| 230 |
+
|
| 231 |
+
# query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
|
| 232 |
+
|
| 233 |
+
# if query:
|
| 234 |
+
# mon, yr = extract_month_year(query)
|
| 235 |
+
# df2 = extract_topic_match(query, df)
|
| 236 |
+
|
| 237 |
+
# if df2.empty:
|
| 238 |
+
# df2 = df
|
| 239 |
+
# if yr:
|
| 240 |
+
# df2 = df2[df2['status_date'].dt.year == yr]
|
| 241 |
+
# if mon:
|
| 242 |
+
# df2 = df2[df2['status_date'].dt.month == mon]
|
| 243 |
+
# st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
|
| 244 |
+
# else:
|
| 245 |
+
# st.info(f" Filtering by year: **{yr}**")
|
| 246 |
+
|
| 247 |
+
# if df2.empty:
|
| 248 |
+
# st.warning("No matching records found.")
|
| 249 |
+
# else:
|
| 250 |
+
# texts = df2['summary_insight'].tolist()
|
| 251 |
+
# embs = compute_embeddings(texts, _model=embed_model)
|
| 252 |
+
# res = semantic_search(query, embs, embed_model, threshold=0.5)
|
| 253 |
+
|
| 254 |
+
# if not res:
|
| 255 |
+
# st.warning("No relevant insights found.")
|
| 256 |
+
# else:
|
| 257 |
+
# st.subheader("Top Matching Insights")
|
| 258 |
+
# collected = []
|
| 259 |
+
|
| 260 |
+
# for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: # increased to 10
|
| 261 |
+
# row = df2.iloc[idx]
|
| 262 |
+
# date = row['status_date'].date()
|
| 263 |
+
# bill_number = row['bill_number']
|
| 264 |
+
# full_url = row['url']
|
| 265 |
+
# cat = row['Category & Subcategory']
|
| 266 |
+
# cat_std = row['category_&_subcategory_standardized2']
|
| 267 |
+
# bene= row['Intended Beneficiaries']
|
| 268 |
+
# bene_std= row['intended_beneficiaries_standardized2']
|
| 269 |
+
# goal = row['Legislative Goal']
|
| 270 |
+
# impact = row['Policy Impact Areas']
|
| 271 |
+
# provision = row['Key Provisions']
|
| 272 |
+
# intent = row['Intent']
|
| 273 |
+
# stance = row['Stance']
|
| 274 |
+
# description = row['description']
|
| 275 |
+
# summary = row['summary']
|
| 276 |
+
|
| 277 |
+
# trend = clean_text(row['llama_trend_summary'])
|
| 278 |
+
# insight = clean_text(row['llama_insight'])
|
| 279 |
+
|
| 280 |
+
# st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
|
| 281 |
+
# st.markdown(f"**Category:** {cat}")
|
| 282 |
+
# # st.markdown(f"**Category Std:** {cat_std}")
|
| 283 |
+
# st.markdown(f"**Intended Beneficiaries:** {bene}")
|
| 284 |
+
# # st.markdown(f"**Intended Beneficiaries STD:** {bene_std}")
|
| 285 |
+
# st.markdown(f"**Goal:** {goal}")
|
| 286 |
+
# st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
|
| 287 |
+
# st.markdown(f"**Policy Impact Area:** {impact}")
|
| 288 |
+
# st.markdown(f"**Key Provision:** {provision}")
|
| 289 |
+
# st.markdown(f"**Description:** {description}")
|
| 290 |
+
# # st.markdown(f"**Summary:** {summary}")
|
| 291 |
+
# st.markdown(f"**Trend Summary:** {trend}")
|
| 292 |
+
# st.markdown(f"**Actionable Insight:** {insight}")
|
| 293 |
+
# st.markdown(f"[View Full Bill Text]({full_url})\n")
|
| 294 |
+
# st.divider()
|
| 295 |
+
|
| 296 |
+
# collected.append(row['summary_insight'])
|
| 297 |
+
|
| 298 |
+
# st.subheader("RAG-Generated Overall Summary")
|
| 299 |
+
# summary = rag_summarize(collected, summarizer)
|
| 300 |
+
# st.success(summary)
|
| 301 |
+
|
| 302 |
import streamlit as st
|
| 303 |
import pandas as pd
|
| 304 |
import re
|
|
|
|
| 363 |
yr = int(ym.group()) if ym else None
|
| 364 |
return mon, yr
|
| 365 |
|
| 366 |
+
def extract_date_range(query):
    """
    Extract a start and end date from a question like 'from Jan 2024 to May 2025'.

    The query must contain the phrase ``from <month> <year> to|until <month> <year>``
    (case-insensitive). Month names may be written in full ("January") or as any
    prefix of at least three letters ("Jan", "Sept"), optionally followed by a
    period ("Jan.").

    Args:
        query: Free-text question typed by the user.

    Returns:
        A ``(start_date, end_date)`` tuple of ``datetime`` objects, where
        ``start_date`` is the first day of the start month and ``end_date`` is
        the last calendar day of the end month (both at midnight), or
        ``(None, None)`` when no valid range can be parsed from the query.
    """
    # Local import: only this function needs calendar, so the file's import
    # block does not have to change.
    import calendar

    month_names = (
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
    )

    def _month_number(token):
        """Return the 1-based month number for a full or abbreviated name, else None."""
        token = token.lower()
        # Fewer than three letters is ambiguous ("ma" -> march/may, "ju" -> june/july).
        if len(token) < 3:
            return None
        for number, name in enumerate(month_names, 1):
            if name.startswith(token):
                return number
        return None

    pattern = (
        r"(?i)from\s+([a-zA-Z]+)\.?\s+(\d{4})\s+(?:to|until)\s+"
        r"([a-zA-Z]+)\.?\s+(\d{4})"
    )
    match = re.search(pattern, query)
    if not match:
        return None, None

    start_month = _month_number(match.group(1))
    end_month = _month_number(match.group(3))
    if start_month is None or end_month is None:
        return None, None

    start_year = int(match.group(2))
    end_year = int(match.group(4))
    try:
        start_date = datetime(start_year, start_month, 1)
        # Use the real last day of the month (handles 28/29/30/31) so records
        # dated after the 28th are not dropped by an inclusive <= comparison.
        last_day = calendar.monthrange(end_year, end_month)[1]
        end_date = datetime(end_year, end_month, last_day)
    except ValueError:
        # e.g. year "0000" matches \d{4} but is outside datetime's valid range.
        return None, None
    return start_date, end_date
|
| 386 |
+
|
| 387 |
def extract_topic_match(query, df):
|
| 388 |
query_lower = query.lower()
|
| 389 |
return df[
|
|
|
|
| 400 |
df = load_data()
|
| 401 |
embed_model, summarizer = load_models()
|
| 402 |
|
| 403 |
+
query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
|
| 404 |
|
| 405 |
if query:
|
| 406 |
+
start_date, end_date = extract_date_range(query)
|
| 407 |
df2 = extract_topic_match(query, df)
|
| 408 |
|
| 409 |
if df2.empty:
|
| 410 |
df2 = df
|
| 411 |
+
|
| 412 |
+
if start_date and end_date:
|
| 413 |
+
df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
|
| 414 |
+
st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
|
| 415 |
+
else:
|
| 416 |
+
mon, yr = extract_month_year(query)
|
| 417 |
+
if yr:
|
| 418 |
+
df2 = df2[df2['status_date'].dt.year == yr]
|
| 419 |
+
if mon:
|
| 420 |
+
df2 = df2[df2['status_date'].dt.month == mon]
|
| 421 |
+
st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
|
| 422 |
+
else:
|
| 423 |
+
st.info(f"Filtering by year: **{yr}**")
|
| 424 |
|
| 425 |
if df2.empty:
|
| 426 |
st.warning("No matching records found.")
|
|
|
|
| 435 |
st.subheader("Top Matching Insights")
|
| 436 |
collected = []
|
| 437 |
|
| 438 |
+
for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: # top 10
|
| 439 |
row = df2.iloc[idx]
|
| 440 |
date = row['status_date'].date()
|
| 441 |
bill_number = row['bill_number']
|
| 442 |
full_url = row['url']
|
| 443 |
cat = row['Category & Subcategory']
|
| 444 |
cat_std = row['category_&_subcategory_standardized2']
|
| 445 |
+
bene = row['Intended Beneficiaries']
|
| 446 |
+
bene_std = row['intended_beneficiaries_standardized2']
|
| 447 |
goal = row['Legislative Goal']
|
| 448 |
impact = row['Policy Impact Areas']
|
| 449 |
provision = row['Key Provisions']
|
|
|
|
| 451 |
stance = row['Stance']
|
| 452 |
description = row['description']
|
| 453 |
summary = row['summary']
|
|
|
|
| 454 |
trend = clean_text(row['llama_trend_summary'])
|
| 455 |
insight = clean_text(row['llama_insight'])
|
| 456 |
|
| 457 |
st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
|
| 458 |
st.markdown(f"**Category:** {cat}")
|
| 459 |
+
st.markdown(f"**Category Std:** {cat_std}")
|
| 460 |
st.markdown(f"**Intended Beneficiaries:** {bene}")
|
| 461 |
+
st.markdown(f"**Intended Beneficiaries STD:** {bene_std}")
|
| 462 |
st.markdown(f"**Goal:** {goal}")
|
| 463 |
st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
|
| 464 |
st.markdown(f"**Policy Impact Area:** {impact}")
|
| 465 |
st.markdown(f"**Key Provision:** {provision}")
|
| 466 |
st.markdown(f"**Description:** {description}")
|
| 467 |
+
st.markdown(f"**Summary:** {summary}")
|
| 468 |
st.markdown(f"**Trend Summary:** {trend}")
|
| 469 |
st.markdown(f"**Actionable Insight:** {insight}")
|
| 470 |
st.markdown(f"[View Full Bill Text]({full_url})\n")
|
|
|
|
| 476 |
summary = rag_summarize(collected, summarizer)
|
| 477 |
st.success(summary)
|
| 478 |
|
|
|