tjl8 committed on
Commit
60e60fd
·
verified ·
1 Parent(s): 122af9c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +208 -21
app.py CHANGED
@@ -302,6 +302,189 @@
302
  #
303
 
304
  # including description
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  import streamlit as st
306
  import pandas as pd
307
  import re
@@ -323,9 +506,20 @@ def load_data():
323
  df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
324
  df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
325
  df = df.dropna(subset=['status_date'])
326
- df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
327
- df["llama_insight"] = df["llama_insight"].fillna("")
328
- df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
 
 
 
 
 
 
 
 
 
 
 
329
  return df
330
 
331
  @st.cache_resource
@@ -433,8 +627,7 @@ if query:
433
  if df2.empty:
434
  st.warning("No matching records found.")
435
  else:
436
- # Include description in embeddings + RAG
437
- texts = (df2['description'].fillna('') + "\n" + df2['summary_insight'].fillna('')).tolist()
438
  embs = compute_embeddings(texts, _model=embed_model)
439
  res = semantic_search(query, embs, embed_model, threshold=0.5)
440
 
@@ -449,19 +642,14 @@ if query:
449
  date = row['status_date'].date()
450
  bill_number = row['bill_number']
451
  full_url = row['url']
452
- cat = row['Category & Subcategory']
453
- cat_std = row['category_&_subcategory_standardized2']
454
- bene = row['Intended Beneficiaries']
455
- bene_std = row['intended_beneficiaries_standardized2']
456
- goal = row['Legislative Goal']
457
- impact = row['Policy Impact Areas']
458
- provision = row['Key Provisions']
459
- intent = row['Intent']
460
- stance = row['Stance']
461
- description = row['description']
462
- summary = row['summary']
463
- trend = clean_text(row['llama_trend_summary'])
464
- insight = clean_text(row['llama_insight'])
465
 
466
  st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
467
  st.markdown(f"**Category:** {cat}")
@@ -471,15 +659,14 @@ if query:
471
  st.markdown(f"**Policy Impact Area:** {impact}")
472
  st.markdown(f"**Key Provision:** {provision}")
473
  st.markdown(f"**Description:** {description}")
474
- st.markdown(f"**Trend Summary:** {trend}")
475
- st.markdown(f"**Actionable Insight:** {insight}")
476
  st.markdown(f"[View Full Bill Text]({full_url})\n")
477
  st.divider()
478
 
479
- collected.append(description + "\n" + row['summary_insight'])
480
 
481
  st.subheader("RAG-Generated Overall Summary")
482
  summary = rag_summarize(collected, summarizer)
483
  st.success(summary)
484
 
485
 
 
 
302
  #
303
 
304
  # including description
305
+ # import streamlit as st
306
+ # import pandas as pd
307
+ # import re
308
+ # from sentence_transformers import SentenceTransformer
309
+ # from transformers import pipeline
310
+ # from sklearn.metrics.pairwise import cosine_similarity
311
+ # from sklearn.feature_extraction.text import TfidfVectorizer
312
+ # from datetime import datetime
313
+
314
+ # def clean_text(text):
315
+ # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
316
+ # text = re.sub(r"(?i)let me know if you'd like.*", "", text)
317
+ # text = re.sub(r"(?i)trend summary[:]*", "", text)
318
+ # text = re.sub(r"(?i)actionable insight[:]*", "", text)
319
+ # return text.strip()
320
+
321
+ # @st.cache_data
322
+ # def load_data():
323
+ # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
324
+ # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
325
+ # df = df.dropna(subset=['status_date'])
326
+ # df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
327
+ # df["llama_insight"] = df["llama_insight"].fillna("")
328
+ # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
329
+ # return df
330
+
331
+ # @st.cache_resource
332
+ # def load_models():
333
+ # embed_model = SentenceTransformer('all-MiniLM-L6-v2')
334
+ # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
335
+ # return embed_model, summarizer
336
+
337
+ # @st.cache_data
338
+ # def compute_embeddings(texts, _model):
339
+ # return _model.encode(texts, show_progress_bar=True)
340
+
341
+ # def semantic_search(query, embeddings, model, threshold=0.5):
342
+ # query_embedding = model.encode([query])
343
+ # sims = cosine_similarity(query_embedding, embeddings)[0]
344
+ # return [(i, s) for i, s in enumerate(sims) if s > threshold]
345
+
346
+ # def rag_summarize(texts, summarizer, top_k=10):
347
+ # if not texts:
348
+ # return "No relevant content to summarize."
349
+ # vect = TfidfVectorizer()
350
+ # m = vect.fit_transform(texts)
351
+ # mean_vec = m.mean(axis=0).A
352
+ # scores = cosine_similarity(mean_vec, m).flatten()
353
+ # top_indices = scores.argsort()[::-1][:top_k]
354
+ # ctx = "\n".join(texts[i] for i in top_indices)
355
+ # prompt = "summarize: " + ctx[:1024]
356
+ # out = summarizer(prompt, max_length=200, min_length=80, do_sample=False)
357
+ # return out[0]['summary_text']
358
+
359
+ # def extract_month_year(q):
360
+ # month_map = {m: i for i, m in enumerate(
361
+ # ["january", "february", "march", "april", "may", "june",
362
+ # "july", "august", "september", "october", "november", "december"], 1)}
363
+ # ql = q.lower()
364
+ # mon = next((v for k, v in month_map.items() if k in ql), None)
365
+ # ym = re.search(r"(19|20)\d{2}", q)
366
+ # yr = int(ym.group()) if ym else None
367
+ # return mon, yr
368
+
369
+ # def extract_date_range(query):
370
+ # month_map = {
371
+ # "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
372
+ # "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
373
+ # }
374
+
375
+ # patterns = [
376
+ # r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})",
377
+ # ]
378
+
379
+ # for pattern in patterns:
380
+ # match = re.search(pattern, query)
381
+ # if match:
382
+ # start_month_str, start_year = match.group(1).lower(), int(match.group(2))
383
+ # end_month_str, end_year = match.group(3).lower(), int(match.group(4))
384
+
385
+ # start_month = month_map.get(start_month_str)
386
+ # end_month = month_map.get(end_month_str)
387
+
388
+ # if start_month and end_month:
389
+ # start_date = datetime(start_year, start_month, 1)
390
+ # end_date = datetime(end_year, end_month, 28)
391
+ # return start_date, end_date
392
+
393
+ # return None, None
394
+
395
+ # def extract_topic_match(query, df):
396
+ # query_lower = query.lower()
397
+ # return df[
398
+ # df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
399
+ # df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
400
+ # df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
401
+ # df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
402
+ # ]
403
+
404
+ # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
405
+ # st.title("Illinois Legislative Trends Q&A")
406
+ # st.markdown("Ask about trends in topics like higher education, funding, etc.")
407
+
408
+ # df = load_data()
409
+ # embed_model, summarizer = load_models()
410
+
411
+ # query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
412
+
413
+ # if query:
414
+ # start_date, end_date = extract_date_range(query)
415
+ # df2 = extract_topic_match(query, df)
416
+
417
+ # if df2.empty:
418
+ # df2 = df
419
+
420
+ # if start_date and end_date:
421
+ # df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
422
+ # st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
423
+ # else:
424
+ # mon, yr = extract_month_year(query)
425
+ # if yr:
426
+ # df2 = df2[df2['status_date'].dt.year == yr]
427
+ # if mon:
428
+ # df2 = df2[df2['status_date'].dt.month == mon]
429
+ # st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
430
+ # else:
431
+ # st.info(f"Filtering by year: **{yr}**")
432
+
433
+ # if df2.empty:
434
+ # st.warning("No matching records found.")
435
+ # else:
436
+ # # Include description in embeddings + RAG
437
+ # texts = (df2['description'].fillna('') + "\n" + df2['summary_insight'].fillna('')).tolist()
438
+ # embs = compute_embeddings(texts, _model=embed_model)
439
+ # res = semantic_search(query, embs, embed_model, threshold=0.5)
440
+
441
+ # if not res:
442
+ # st.warning("No relevant insights found.")
443
+ # else:
444
+ # st.subheader("Top Matching Insights")
445
+ # collected = []
446
+
447
+ # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
448
+ # row = df2.iloc[idx]
449
+ # date = row['status_date'].date()
450
+ # bill_number = row['bill_number']
451
+ # full_url = row['url']
452
+ # cat = row['Category & Subcategory']
453
+ # cat_std = row['category_&_subcategory_standardized2']
454
+ # bene = row['Intended Beneficiaries']
455
+ # bene_std = row['intended_beneficiaries_standardized2']
456
+ # goal = row['Legislative Goal']
457
+ # impact = row['Policy Impact Areas']
458
+ # provision = row['Key Provisions']
459
+ # intent = row['Intent']
460
+ # stance = row['Stance']
461
+ # description = row['description']
462
+ # summary = row['summary']
463
+ # trend = clean_text(row['llama_trend_summary'])
464
+ # insight = clean_text(row['llama_insight'])
465
+
466
+ # st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
467
+ # st.markdown(f"**Category:** {cat}")
468
+ # st.markdown(f"**Intended Beneficiaries:** {bene}")
469
+ # st.markdown(f"**Goal:** {goal}")
470
+ # st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
471
+ # st.markdown(f"**Policy Impact Area:** {impact}")
472
+ # st.markdown(f"**Key Provision:** {provision}")
473
+ # st.markdown(f"**Description:** {description}")
474
+ # st.markdown(f"**Trend Summary:** {trend}")
475
+ # st.markdown(f"**Actionable Insight:** {insight}")
476
+ # st.markdown(f"[View Full Bill Text]({full_url})\n")
477
+ # st.divider()
478
+
479
+ # collected.append(description + "\n" + row['summary_insight'])
480
+
481
+ # st.subheader("RAG-Generated Overall Summary")
482
+ # summary = rag_summarize(collected, summarizer)
483
+ # st.success(summary)
484
+
485
+
486
+ ## NEW ONE
487
+
488
  import streamlit as st
489
  import pandas as pd
490
  import re
 
506
  df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
507
  df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
508
  df = df.dropna(subset=['status_date'])
509
+
510
+ for col in ["Legislative Goal", "Policy Impact Areas", "Key Provisions",
511
+ "Intended Beneficiaries", "Potential Impact", "description"]:
512
+ df[col] = df[col].fillna("")
513
+
514
+ df["combined_text"] = (
515
+ "Legislative Goal: " + df["Legislative Goal"] + "\n" +
516
+ "Policy Impact Areas: " + df["Policy Impact Areas"] + "\n" +
517
+ "Key Provisions: " + df["Key Provisions"] + "\n" +
518
+ "Intended Beneficiaries: " + df["Intended Beneficiaries"] + "\n" +
519
+ "Potential Impact: " + df["Potential Impact"] + "\n" +
520
+ "Description: " + df["description"]
521
+ )
522
+
523
  return df
524
 
525
  @st.cache_resource
 
627
  if df2.empty:
628
  st.warning("No matching records found.")
629
  else:
630
+ texts = df2['combined_text'].tolist()
 
631
  embs = compute_embeddings(texts, _model=embed_model)
632
  res = semantic_search(query, embs, embed_model, threshold=0.5)
633
 
 
642
  date = row['status_date'].date()
643
  bill_number = row['bill_number']
644
  full_url = row['url']
645
+ cat = row.get('Category & Subcategory', '')
646
+ bene = row.get('Intended Beneficiaries', '')
647
+ goal = row.get('Legislative Goal', '')
648
+ impact = row.get('Policy Impact Areas', '')
649
+ provision = row.get('Key Provisions', '')
650
+ intent = row.get('Intent', '')
651
+ stance = row.get('Stance', '')
652
+ description = row.get('description', '')
 
 
 
 
 
653
 
654
  st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
655
  st.markdown(f"**Category:** {cat}")
 
659
  st.markdown(f"**Policy Impact Area:** {impact}")
660
  st.markdown(f"**Key Provision:** {provision}")
661
  st.markdown(f"**Description:** {description}")
 
 
662
  st.markdown(f"[View Full Bill Text]({full_url})\n")
663
  st.divider()
664
 
665
+ collected.append(row['combined_text'])
666
 
667
  st.subheader("RAG-Generated Overall Summary")
668
  summary = rag_summarize(collected, summarizer)
669
  st.success(summary)
670
 
671
 
672
+