tjl8 committed on
Commit
30b5df8
·
verified ·
1 Parent(s): 2566043

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +193 -1
app.py CHANGED
@@ -485,6 +485,196 @@
485
 
486
  ## NEW ONE
487
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
488
  import streamlit as st
489
  import pandas as pd
490
  import re
@@ -525,7 +715,8 @@ def load_data():
525
  @st.cache_resource
526
  def load_models():
527
  embed_model = SentenceTransformer('all-MiniLM-L6-v2')
528
- summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
 
529
  return embed_model, summarizer
530
 
531
  @st.cache_data
@@ -674,3 +865,4 @@ if query:
674
 
675
 
676
 
 
 
485
 
486
  ## NEW ONE
487
 
488
+ # import streamlit as st
489
+ # import pandas as pd
490
+ # import re
491
+ # from sentence_transformers import SentenceTransformer
492
+ # from transformers import pipeline
493
+ # from sklearn.metrics.pairwise import cosine_similarity
494
+ # from sklearn.feature_extraction.text import TfidfVectorizer
495
+ # from datetime import datetime
496
+
497
+ # def clean_text(text):
498
+ # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
499
+ # text = re.sub(r"(?i)let me know if you'd like.*", "", text)
500
+ # text = re.sub(r"(?i)trend summary[:]*", "", text)
501
+ # text = re.sub(r"(?i)actionable insight[:]*", "", text)
502
+ # return text.strip()
503
+
504
+ # @st.cache_data
505
+ # def load_data():
506
+ # df = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv")
507
+ # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
508
+ # df = df.dropna(subset=['status_date'])
509
+
510
+ # for col in ["Legislative Goal", "Policy Impact Areas", "Key Provisions",
511
+ # "Intended Beneficiaries", "Potential Impact", "description"]:
512
+ # df[col] = df[col].fillna("")
513
+
514
+ # df["combined_text"] = (
515
+ # "Legislative Goal: " + df["Legislative Goal"] + "\n" +
516
+ # "Policy Impact Areas: " + df["Policy Impact Areas"] + "\n" +
517
+ # "Key Provisions: " + df["Key Provisions"] + "\n" +
518
+ # "Intended Beneficiaries: " + df["Intended Beneficiaries"] + "\n" +
519
+ # "Potential Impact: " + df["Potential Impact"] + "\n" +
520
+ # "Description: " + df["description"]
521
+ # )
522
+
523
+ # return df
524
+
525
+ # @st.cache_resource
526
+ # def load_models():
527
+ # embed_model = SentenceTransformer('all-MiniLM-L6-v2')
528
+ # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
529
+ # return embed_model, summarizer
530
+
531
+ # @st.cache_data
532
+ # def compute_embeddings(texts, _model):
533
+ # return _model.encode(texts, show_progress_bar=True)
534
+
535
+ # def semantic_search(query, embeddings, model, threshold=0.5):
536
+ # query_embedding = model.encode([query])
537
+ # sims = cosine_similarity(query_embedding, embeddings)[0]
538
+ # return [(i, s) for i, s in enumerate(sims) if s > threshold]
539
+
540
+ # def rag_summarize(texts, summarizer, top_k=5):
541
+ # if not texts:
542
+ # return "No relevant content to summarize."
543
+ # vect = TfidfVectorizer()
544
+ # m = vect.fit_transform(texts)
545
+ # mean_vec = m.mean(axis=0).A
546
+ # scores = cosine_similarity(mean_vec, m).flatten()
547
+ # top_indices = scores.argsort()[::-1][:top_k]
548
+ # ctx = "\n".join(texts[i] for i in top_indices)
549
+ # prompt = "summarize: " + ctx[:1024]
550
+ # out = summarizer(prompt, max_length=200, min_length=80, do_sample=False)
551
+ # return out[0]['summary_text']
552
+
553
+ # def extract_month_year(q):
554
+ # month_map = {m: i for i, m in enumerate(
555
+ # ["january", "february", "march", "april", "may", "june",
556
+ # "july", "august", "september", "october", "november", "december"], 1)}
557
+ # ql = q.lower()
558
+ # mon = next((v for k, v in month_map.items() if k in ql), None)
559
+ # ym = re.search(r"(19|20)\d{2}", q)
560
+ # yr = int(ym.group()) if ym else None
561
+ # return mon, yr
562
+
563
+ # def extract_date_range(query):
564
+ # month_map = {
565
+ # "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
566
+ # "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
567
+ # }
568
+
569
+ # patterns = [
570
+ # r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})",
571
+ # ]
572
+
573
+ # for pattern in patterns:
574
+ # match = re.search(pattern, query)
575
+ # if match:
576
+ # start_month_str, start_year = match.group(1).lower(), int(match.group(2))
577
+ # end_month_str, end_year = match.group(3).lower(), int(match.group(4))
578
+
579
+ # start_month = month_map.get(start_month_str)
580
+ # end_month = month_map.get(end_month_str)
581
+
582
+ # if start_month and end_month:
583
+ # start_date = datetime(start_year, start_month, 1)
584
+ # end_date = datetime(end_year, end_month, 28)
585
+ # return start_date, end_date
586
+
587
+ # return None, None
588
+
589
+
590
+ # def extract_topic_match(query, df):
591
+ # query_lower = query.lower()
592
+ # return df[
593
+ # df['Category & Subcategory'].fillna('').str.lower().str.contains(query_lower) |
594
+ # df['Intent'].fillna('').str.lower().str.contains(query_lower) |
595
+ # df['Legislative Goal'].fillna('').str.lower().str.contains(query_lower) |
596
+ # df['Policy Impact Areas'].fillna('').str.lower().str.contains(query_lower) |
597
+ # df['Key Provisions'].fillna('').str.lower().str.contains(query_lower) |
598
+ # df['Potential Impact'].fillna('').str.lower().str.contains(query_lower)
599
+ # ]
600
+
601
+
602
+ # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
603
+ # st.title("Illinois Legislative Trends Q&A")
604
+ # st.markdown("Ask about trends in topics like higher education, funding, etc.")
605
+
606
+ # df = load_data()
607
+ # embed_model, summarizer = load_models()
608
+
609
+ # query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
610
+
611
+ # if query:
612
+ # start_date, end_date = extract_date_range(query)
613
+ # df2 = extract_topic_match(query, df)
614
+
615
+ # if df2.empty:
616
+ # df2 = df
617
+
618
+ # if start_date and end_date:
619
+ # df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
620
+ # st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
621
+ # else:
622
+ # mon, yr = extract_month_year(query)
623
+ # if yr:
624
+ # df2 = df2[df2['status_date'].dt.year == yr]
625
+ # if mon:
626
+ # df2 = df2[df2['status_date'].dt.month == mon]
627
+ # st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
628
+ # else:
629
+ # st.info(f"Filtering by year: **{yr}**")
630
+
631
+ # if df2.empty:
632
+ # st.warning("No matching records found.")
633
+ # else:
634
+ # texts = df2['combined_text'].tolist()
635
+ # embs = compute_embeddings(texts, _model=embed_model)
636
+ # res = semantic_search(query, embs, embed_model, threshold=0.5)
637
+
638
+ # if not res:
639
+ # st.warning("No relevant insights found.")
640
+ # else:
641
+ # st.subheader("Top Matching Insights")
642
+ # collected = []
643
+
644
+ # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
645
+ # row = df2.iloc[idx]
646
+ # date = row['status_date'].date()
647
+ # bill_number = row['bill_number']
648
+ # full_url = row['url']
649
+ # cat = row.get('Category & Subcategory', '')
650
+ # bene = row.get('Intended Beneficiaries', '')
651
+ # goal = row.get('Legislative Goal', '')
652
+ # impact = row.get('Policy Impact Areas', '')
653
+ # provision = row.get('Key Provisions', '')
654
+ # intent = row.get('Intent', '')
655
+ # stance = row.get('Stance', '')
656
+ # description = row.get('description', '')
657
+
658
+ # st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
659
+ # st.markdown(f"**Category:** {cat}")
660
+ # st.markdown(f"**Intended Beneficiaries:** {bene}")
661
+ # st.markdown(f"**Goal:** {goal}")
662
+ # st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
663
+ # st.markdown(f"**Policy Impact Area:** {impact}")
664
+ # st.markdown(f"**Key Provision:** {provision}")
665
+ # st.markdown(f"**Description:** {description}")
666
+ # st.markdown(f"[View Full Bill Text]({full_url})\n")
667
+ # st.divider()
668
+
669
+ # collected.append(row['combined_text'])
670
+
671
+ # st.subheader("RAG-Generated Overall Summary")
672
+ # summary = rag_summarize(collected, summarizer)
673
+ # st.success(summary)
674
+
675
+
676
+ #BART
677
+
678
  import streamlit as st
679
  import pandas as pd
680
  import re
 
715
  @st.cache_resource
716
  def load_models():
717
  embed_model = SentenceTransformer('all-MiniLM-L6-v2')
718
+ # Changed summarization model to facebook/bart-large-cnn for better summary quality
719
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
720
  return embed_model, summarizer
721
 
722
  @st.cache_data
 
865
 
866
 
867
 
868
+