tjl8 committed on
Commit
ebf5928
·
verified ·
1 Parent(s): 6d5ccb1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +193 -17
app.py CHANGED
@@ -148,6 +148,157 @@
148
  # st.subheader("RAG-Generated Overall Summary")
149
  # summary = rag_summarize(collected, summarizer)
150
  # st.success(summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  import streamlit as st
152
  import pandas as pd
153
  import re
@@ -212,6 +363,27 @@ def extract_month_year(q):
212
  yr = int(ym.group()) if ym else None
213
  return mon, yr
214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  def extract_topic_match(query, df):
216
  query_lower = query.lower()
217
  return df[
@@ -228,21 +400,27 @@ st.markdown("Ask about trends in topics like higher education, funding, etc.")
228
  df = load_data()
229
  embed_model, summarizer = load_models()
230
 
231
- query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
232
 
233
  if query:
234
- mon, yr = extract_month_year(query)
235
  df2 = extract_topic_match(query, df)
236
 
237
  if df2.empty:
238
  df2 = df
239
- if yr:
240
- df2 = df2[df2['status_date'].dt.year == yr]
241
- if mon:
242
- df2 = df2[df2['status_date'].dt.month == mon]
243
- st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
244
- else:
245
- st.info(f" Filtering by year: **{yr}**")
 
 
 
 
 
 
246
 
247
  if df2.empty:
248
  st.warning("No matching records found.")
@@ -257,15 +435,15 @@ if query:
257
  st.subheader("Top Matching Insights")
258
  collected = []
259
 
260
- for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: # increased to 10
261
  row = df2.iloc[idx]
262
  date = row['status_date'].date()
263
  bill_number = row['bill_number']
264
  full_url = row['url']
265
  cat = row['Category & Subcategory']
266
  cat_std = row['category_&_subcategory_standardized2']
267
- bene= row['Intended Beneficiaries']
268
- bene_std= row['intended_beneficiaries_standardized2']
269
  goal = row['Legislative Goal']
270
  impact = row['Policy Impact Areas']
271
  provision = row['Key Provisions']
@@ -273,21 +451,20 @@ if query:
273
  stance = row['Stance']
274
  description = row['description']
275
  summary = row['summary']
276
-
277
  trend = clean_text(row['llama_trend_summary'])
278
  insight = clean_text(row['llama_insight'])
279
 
280
  st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
281
  st.markdown(f"**Category:** {cat}")
282
- # st.markdown(f"**Category Std:** {cat_std}")
283
  st.markdown(f"**Intended Beneficiaries:** {bene}")
284
- # st.markdown(f"**Intended Beneficiaries STD:** {bene_std}")
285
  st.markdown(f"**Goal:** {goal}")
286
  st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
287
  st.markdown(f"**Policy Impact Area:** {impact}")
288
  st.markdown(f"**Key Provision:** {provision}")
289
  st.markdown(f"**Description:** {description}")
290
- # st.markdown(f"**Summary:** {summary}")
291
  st.markdown(f"**Trend Summary:** {trend}")
292
  st.markdown(f"**Actionable Insight:** {insight}")
293
  st.markdown(f"[View Full Bill Text]({full_url})\n")
@@ -299,4 +476,3 @@ if query:
299
  summary = rag_summarize(collected, summarizer)
300
  st.success(summary)
301
 
302
-
 
148
  # st.subheader("RAG-Generated Overall Summary")
149
  # summary = rag_summarize(collected, summarizer)
150
  # st.success(summary)
151
+ # import streamlit as st
152
+ # import pandas as pd
153
+ # import re
154
+ # from sentence_transformers import SentenceTransformer
155
+ # from transformers import pipeline
156
+ # from sklearn.metrics.pairwise import cosine_similarity
157
+ # from sklearn.feature_extraction.text import TfidfVectorizer
158
+ # from datetime import datetime
159
+
160
+ # def clean_text(text):
161
+ # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
162
+ # text = re.sub(r"(?i)let me know if you'd like.*", "", text)
163
+ # text = re.sub(r"(?i)trend summary[:]*", "", text)
164
+ # text = re.sub(r"(?i)actionable insight[:]*", "", text)
165
+ # return text.strip()
166
+
167
+ # @st.cache_data
168
+ # def load_data():
169
+ # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
170
+ # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
171
+ # df = df.dropna(subset=['status_date'])
172
+ # df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
173
+ # df["llama_insight"] = df["llama_insight"].fillna("")
174
+ # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
175
+ # return df
176
+
177
+ # @st.cache_resource
178
+ # def load_models():
179
+ # embed_model = SentenceTransformer('all-MiniLM-L6-v2')
180
+ # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
181
+ # return embed_model, summarizer
182
+
183
+ # @st.cache_data
184
+ # def compute_embeddings(texts, _model):
185
+ # return _model.encode(texts, show_progress_bar=True)
186
+
187
+ # def semantic_search(query, embeddings, model, threshold=0.5):
188
+ # query_embedding = model.encode([query])
189
+ # sims = cosine_similarity(query_embedding, embeddings)[0]
190
+ # return [(i, s) for i, s in enumerate(sims) if s > threshold]
191
+
192
+ # def rag_summarize(texts, summarizer, top_k=10): # increased from 5 to 10
193
+ # if not texts:
194
+ # return "No relevant content to summarize."
195
+ # vect = TfidfVectorizer()
196
+ # m = vect.fit_transform(texts)
197
+ # mean_vec = m.mean(axis=0).A
198
+ # scores = cosine_similarity(mean_vec, m).flatten()
199
+ # top_indices = scores.argsort()[::-1][:top_k]
200
+ # ctx = "\n".join(texts[i] for i in top_indices)
201
+ # prompt = "summarize: " + ctx[:1024]
202
+ # out = summarizer(prompt, max_length=150, min_length=80, do_sample=False) # updated length
203
+ # return out[0]['summary_text']
204
+
205
+ # def extract_month_year(q):
206
+ # month_map = {m: i for i, m in enumerate(
207
+ # ["january", "february", "march", "april", "may", "june",
208
+ # "july", "august", "september", "october", "november", "december"], 1)}
209
+ # ql = q.lower()
210
+ # mon = next((v for k, v in month_map.items() if k in ql), None)
211
+ # ym = re.search(r"(19|20)\d{2}", q)
212
+ # yr = int(ym.group()) if ym else None
213
+ # return mon, yr
214
+
215
+ # def extract_topic_match(query, df):
216
+ # query_lower = query.lower()
217
+ # return df[
218
+ # df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
219
+ # df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
220
+ # df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
221
+ # df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
222
+ # ]
223
+
224
+ # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
225
+ # st.title("Illinois Legislative Trends Q&A")
226
+ # st.markdown("Ask about trends in topics like higher education, funding, etc.")
227
+
228
+ # df = load_data()
229
+ # embed_model, summarizer = load_models()
230
+
231
+ # query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
232
+
233
+ # if query:
234
+ # mon, yr = extract_month_year(query)
235
+ # df2 = extract_topic_match(query, df)
236
+
237
+ # if df2.empty:
238
+ # df2 = df
239
+ # if yr:
240
+ # df2 = df2[df2['status_date'].dt.year == yr]
241
+ # if mon:
242
+ # df2 = df2[df2['status_date'].dt.month == mon]
243
+ # st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
244
+ # else:
245
+ # st.info(f" Filtering by year: **{yr}**")
246
+
247
+ # if df2.empty:
248
+ # st.warning("No matching records found.")
249
+ # else:
250
+ # texts = df2['summary_insight'].tolist()
251
+ # embs = compute_embeddings(texts, _model=embed_model)
252
+ # res = semantic_search(query, embs, embed_model, threshold=0.5)
253
+
254
+ # if not res:
255
+ # st.warning("No relevant insights found.")
256
+ # else:
257
+ # st.subheader("Top Matching Insights")
258
+ # collected = []
259
+
260
+ # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: # increased to 10
261
+ # row = df2.iloc[idx]
262
+ # date = row['status_date'].date()
263
+ # bill_number = row['bill_number']
264
+ # full_url = row['url']
265
+ # cat = row['Category & Subcategory']
266
+ # cat_std = row['category_&_subcategory_standardized2']
267
+ # bene= row['Intended Beneficiaries']
268
+ # bene_std= row['intended_beneficiaries_standardized2']
269
+ # goal = row['Legislative Goal']
270
+ # impact = row['Policy Impact Areas']
271
+ # provision = row['Key Provisions']
272
+ # intent = row['Intent']
273
+ # stance = row['Stance']
274
+ # description = row['description']
275
+ # summary = row['summary']
276
+
277
+ # trend = clean_text(row['llama_trend_summary'])
278
+ # insight = clean_text(row['llama_insight'])
279
+
280
+ # st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
281
+ # st.markdown(f"**Category:** {cat}")
282
+ # # st.markdown(f"**Category Std:** {cat_std}")
283
+ # st.markdown(f"**Intended Beneficiaries:** {bene}")
284
+ # # st.markdown(f"**Intended Beneficiaries STD:** {bene_std}")
285
+ # st.markdown(f"**Goal:** {goal}")
286
+ # st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
287
+ # st.markdown(f"**Policy Impact Area:** {impact}")
288
+ # st.markdown(f"**Key Provision:** {provision}")
289
+ # st.markdown(f"**Description:** {description}")
290
+ # # st.markdown(f"**Summary:** {summary}")
291
+ # st.markdown(f"**Trend Summary:** {trend}")
292
+ # st.markdown(f"**Actionable Insight:** {insight}")
293
+ # st.markdown(f"[View Full Bill Text]({full_url})\n")
294
+ # st.divider()
295
+
296
+ # collected.append(row['summary_insight'])
297
+
298
+ # st.subheader("RAG-Generated Overall Summary")
299
+ # summary = rag_summarize(collected, summarizer)
300
+ # st.success(summary)
301
+
302
  import streamlit as st
303
  import pandas as pd
304
  import re
 
363
  yr = int(ym.group()) if ym else None
364
  return mon, yr
365
 
366
def extract_date_range(query):
    """
    Extract an inclusive start/end date range from a free-text question.

    Recognises phrases of the form ``from <month> <year> to|until <month> <year>``
    (case-insensitive), e.g. 'from Jan 2024 to May 2025'. Month names may be
    written in full or abbreviated to any prefix of at least three letters
    ('Jan', 'Sept', 'December'), optionally followed by a period.

    Args:
        query: The user's question.

    Returns:
        A ``(start_date, end_date)`` tuple of ``datetime`` objects, where
        ``start_date`` is midnight on the first day of the start month and
        ``end_date`` is the last microsecond of the last day of the end month.
        Returns ``(None, None)`` when no range phrase is found, a month name
        is not recognised, or a year is outside the range ``datetime`` supports.
    """
    # Local import keeps this block self-contained; the file's import header
    # is not guaranteed to provide `calendar`.
    import calendar

    month_names = (
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
    )

    def resolve_month(token):
        """Return the 1-based month number for a full or abbreviated name."""
        token = token.lower()
        # Require three letters so that prefixes are unambiguous
        # ('ma' could be March or May, 'ju' could be June or July).
        if len(token) < 3:
            return None
        for number, name in enumerate(month_names, 1):
            if name.startswith(token):
                return number
        return None

    pattern = (
        r"(?i)\bfrom\s+([a-z]+)\.?\s+(\d{4})\s+(?:to|until)\s+([a-z]+)\.?\s+(\d{4})"
    )
    match = re.search(pattern, query)
    if not match:
        return None, None

    start_month = resolve_month(match.group(1))
    end_month = resolve_month(match.group(3))
    if not (start_month and end_month):
        return None, None

    start_year = int(match.group(2))
    end_year = int(match.group(4))
    try:
        start_date = datetime(start_year, start_month, 1)
        # Use the real last day of the month (handles 30/31-day months and
        # leap-year February) and the end of that day, so a `<= end_date`
        # filter keeps every record dated within the end month.
        last_day = calendar.monthrange(end_year, end_month)[1]
        end_date = datetime(end_year, end_month, last_day, 23, 59, 59, 999999)
    except ValueError:
        # e.g. year "0000", which datetime cannot represent.
        return None, None
    return start_date, end_date
386
+
387
  def extract_topic_match(query, df):
388
  query_lower = query.lower()
389
  return df[
 
400
  df = load_data()
401
  embed_model, summarizer = load_models()
402
 
403
+ query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
404
 
405
  if query:
406
+ start_date, end_date = extract_date_range(query)
407
  df2 = extract_topic_match(query, df)
408
 
409
  if df2.empty:
410
  df2 = df
411
+
412
+ if start_date and end_date:
413
+ df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
414
+ st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
415
+ else:
416
+ mon, yr = extract_month_year(query)
417
+ if yr:
418
+ df2 = df2[df2['status_date'].dt.year == yr]
419
+ if mon:
420
+ df2 = df2[df2['status_date'].dt.month == mon]
421
+ st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
422
+ else:
423
+ st.info(f"Filtering by year: **{yr}**")
424
 
425
  if df2.empty:
426
  st.warning("No matching records found.")
 
435
  st.subheader("Top Matching Insights")
436
  collected = []
437
 
438
+ for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: # top 10
439
  row = df2.iloc[idx]
440
  date = row['status_date'].date()
441
  bill_number = row['bill_number']
442
  full_url = row['url']
443
  cat = row['Category & Subcategory']
444
  cat_std = row['category_&_subcategory_standardized2']
445
+ bene = row['Intended Beneficiaries']
446
+ bene_std = row['intended_beneficiaries_standardized2']
447
  goal = row['Legislative Goal']
448
  impact = row['Policy Impact Areas']
449
  provision = row['Key Provisions']
 
451
  stance = row['Stance']
452
  description = row['description']
453
  summary = row['summary']
 
454
  trend = clean_text(row['llama_trend_summary'])
455
  insight = clean_text(row['llama_insight'])
456
 
457
  st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
458
  st.markdown(f"**Category:** {cat}")
459
+ st.markdown(f"**Category Std:** {cat_std}")
460
  st.markdown(f"**Intended Beneficiaries:** {bene}")
461
+ st.markdown(f"**Intended Beneficiaries STD:** {bene_std}")
462
  st.markdown(f"**Goal:** {goal}")
463
  st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
464
  st.markdown(f"**Policy Impact Area:** {impact}")
465
  st.markdown(f"**Key Provision:** {provision}")
466
  st.markdown(f"**Description:** {description}")
467
+ st.markdown(f"**Summary:** {summary}")
468
  st.markdown(f"**Trend Summary:** {trend}")
469
  st.markdown(f"**Actionable Insight:** {insight}")
470
  st.markdown(f"[View Full Bill Text]({full_url})\n")
 
476
  summary = rag_summarize(collected, summarizer)
477
  st.success(summary)
478