tjl8 committed on
Commit
54fe714
·
verified ·
1 Parent(s): 30b5df8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +224 -49
app.py CHANGED
@@ -675,6 +675,198 @@
675
 
676
  #BART
677
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
678
  import streamlit as st
679
  import pandas as pd
680
  import re
@@ -715,7 +907,6 @@ def load_data():
715
  @st.cache_resource
716
  def load_models():
717
  embed_model = SentenceTransformer('all-MiniLM-L6-v2')
718
- # Changed summarization model to facebook/bart-large-cnn for better summary quality
719
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
720
  return embed_model, summarizer
721
 
@@ -723,12 +914,7 @@ def load_models():
723
  def compute_embeddings(texts, _model):
724
  return _model.encode(texts, show_progress_bar=True)
725
 
726
- def semantic_search(query, embeddings, model, threshold=0.5):
727
- query_embedding = model.encode([query])
728
- sims = cosine_similarity(query_embedding, embeddings)[0]
729
- return [(i, s) for i, s in enumerate(sims) if s > threshold]
730
-
731
- def rag_summarize(texts, summarizer, top_k=5):
732
  if not texts:
733
  return "No relevant content to summarize."
734
  vect = TfidfVectorizer()
@@ -738,7 +924,7 @@ def rag_summarize(texts, summarizer, top_k=5):
738
  top_indices = scores.argsort()[::-1][:top_k]
739
  ctx = "\n".join(texts[i] for i in top_indices)
740
  prompt = "summarize: " + ctx[:1024]
741
- out = summarizer(prompt, max_length=200, min_length=80, do_sample=False)
742
  return out[0]['summary_text']
743
 
744
  def extract_month_year(q):
@@ -777,7 +963,6 @@ def extract_date_range(query):
777
 
778
  return None, None
779
 
780
-
781
  def extract_topic_match(query, df):
782
  query_lower = query.lower()
783
  return df[
@@ -822,46 +1007,36 @@ if query:
822
  if df2.empty:
823
  st.warning("No matching records found.")
824
  else:
825
- texts = df2['combined_text'].tolist()
826
- embs = compute_embeddings(texts, _model=embed_model)
827
- res = semantic_search(query, embs, embed_model, threshold=0.5)
828
-
829
- if not res:
830
- st.warning("No relevant insights found.")
831
- else:
832
- st.subheader("Top Matching Insights")
833
- collected = []
834
-
835
- for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
836
- row = df2.iloc[idx]
837
- date = row['status_date'].date()
838
- bill_number = row['bill_number']
839
- full_url = row['url']
840
- cat = row.get('Category & Subcategory', '')
841
- bene = row.get('Intended Beneficiaries', '')
842
- goal = row.get('Legislative Goal', '')
843
- impact = row.get('Policy Impact Areas', '')
844
- provision = row.get('Key Provisions', '')
845
- intent = row.get('Intent', '')
846
- stance = row.get('Stance', '')
847
- description = row.get('description', '')
848
-
849
- st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
850
- st.markdown(f"**Category:** {cat}")
851
- st.markdown(f"**Intended Beneficiaries:** {bene}")
852
- st.markdown(f"**Goal:** {goal}")
853
- st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
854
- st.markdown(f"**Policy Impact Area:** {impact}")
855
- st.markdown(f"**Key Provision:** {provision}")
856
- st.markdown(f"**Description:** {description}")
857
- st.markdown(f"[View Full Bill Text]({full_url})\n")
858
- st.divider()
859
-
860
- collected.append(row['combined_text'])
861
-
862
- st.subheader("RAG-Generated Overall Summary")
863
- summary = rag_summarize(collected, summarizer)
864
- st.success(summary)
865
 
866
 
867
 
 
675
 
676
  #BART
677
 
678
+ # import streamlit as st
679
+ # import pandas as pd
680
+ # import re
681
+ # from sentence_transformers import SentenceTransformer
682
+ # from transformers import pipeline
683
+ # from sklearn.metrics.pairwise import cosine_similarity
684
+ # from sklearn.feature_extraction.text import TfidfVectorizer
685
+ # from datetime import datetime
686
+
687
+ # def clean_text(text):
688
+ # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
689
+ # text = re.sub(r"(?i)let me know if you'd like.*", "", text)
690
+ # text = re.sub(r"(?i)trend summary[:]*", "", text)
691
+ # text = re.sub(r"(?i)actionable insight[:]*", "", text)
692
+ # return text.strip()
693
+
694
+ # @st.cache_data
695
+ # def load_data():
696
+ # df = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv")
697
+ # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
698
+ # df = df.dropna(subset=['status_date'])
699
+
700
+ # for col in ["Legislative Goal", "Policy Impact Areas", "Key Provisions",
701
+ # "Intended Beneficiaries", "Potential Impact", "description"]:
702
+ # df[col] = df[col].fillna("")
703
+
704
+ # df["combined_text"] = (
705
+ # "Legislative Goal: " + df["Legislative Goal"] + "\n" +
706
+ # "Policy Impact Areas: " + df["Policy Impact Areas"] + "\n" +
707
+ # "Key Provisions: " + df["Key Provisions"] + "\n" +
708
+ # "Intended Beneficiaries: " + df["Intended Beneficiaries"] + "\n" +
709
+ # "Potential Impact: " + df["Potential Impact"] + "\n" +
710
+ # "Description: " + df["description"]
711
+ # )
712
+
713
+ # return df
714
+
715
+ # @st.cache_resource
716
+ # def load_models():
717
+ # embed_model = SentenceTransformer('all-MiniLM-L6-v2')
718
+ # # Changed summarization model to facebook/bart-large-cnn for better summary quality
719
+ # summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
720
+ # return embed_model, summarizer
721
+
722
+ # @st.cache_data
723
+ # def compute_embeddings(texts, _model):
724
+ # return _model.encode(texts, show_progress_bar=True)
725
+
726
+ # def semantic_search(query, embeddings, model, threshold=0.5):
727
+ # query_embedding = model.encode([query])
728
+ # sims = cosine_similarity(query_embedding, embeddings)[0]
729
+ # return [(i, s) for i, s in enumerate(sims) if s > threshold]
730
+
731
+ # def rag_summarize(texts, summarizer, top_k=5):
732
+ # if not texts:
733
+ # return "No relevant content to summarize."
734
+ # vect = TfidfVectorizer()
735
+ # m = vect.fit_transform(texts)
736
+ # mean_vec = m.mean(axis=0).A
737
+ # scores = cosine_similarity(mean_vec, m).flatten()
738
+ # top_indices = scores.argsort()[::-1][:top_k]
739
+ # ctx = "\n".join(texts[i] for i in top_indices)
740
+ # prompt = "summarize: " + ctx[:1024]
741
+ # out = summarizer(prompt, max_length=200, min_length=80, do_sample=False)
742
+ # return out[0]['summary_text']
743
+
744
+ # def extract_month_year(q):
745
+ # month_map = {m: i for i, m in enumerate(
746
+ # ["january", "february", "march", "april", "may", "june",
747
+ # "july", "august", "september", "october", "november", "december"], 1)}
748
+ # ql = q.lower()
749
+ # mon = next((v for k, v in month_map.items() if k in ql), None)
750
+ # ym = re.search(r"(19|20)\d{2}", q)
751
+ # yr = int(ym.group()) if ym else None
752
+ # return mon, yr
753
+
754
+ # def extract_date_range(query):
755
+ # month_map = {
756
+ # "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
757
+ # "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
758
+ # }
759
+
760
+ # patterns = [
761
+ # r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})",
762
+ # ]
763
+
764
+ # for pattern in patterns:
765
+ # match = re.search(pattern, query)
766
+ # if match:
767
+ # start_month_str, start_year = match.group(1).lower(), int(match.group(2))
768
+ # end_month_str, end_year = match.group(3).lower(), int(match.group(4))
769
+
770
+ # start_month = month_map.get(start_month_str)
771
+ # end_month = month_map.get(end_month_str)
772
+
773
+ # if start_month and end_month:
774
+ # start_date = datetime(start_year, start_month, 1)
775
+ # end_date = datetime(end_year, end_month, 28)
776
+ # return start_date, end_date
777
+
778
+ # return None, None
779
+
780
+
781
+ # def extract_topic_match(query, df):
782
+ # query_lower = query.lower()
783
+ # return df[
784
+ # df['Category & Subcategory'].fillna('').str.lower().str.contains(query_lower) |
785
+ # df['Intent'].fillna('').str.lower().str.contains(query_lower) |
786
+ # df['Legislative Goal'].fillna('').str.lower().str.contains(query_lower) |
787
+ # df['Policy Impact Areas'].fillna('').str.lower().str.contains(query_lower) |
788
+ # df['Key Provisions'].fillna('').str.lower().str.contains(query_lower) |
789
+ # df['Potential Impact'].fillna('').str.lower().str.contains(query_lower)
790
+ # ]
791
+
792
+
793
+ # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
794
+ # st.title("Illinois Legislative Trends Q&A")
795
+ # st.markdown("Ask about trends in topics like higher education, funding, etc.")
796
+
797
+ # df = load_data()
798
+ # embed_model, summarizer = load_models()
799
+
800
+ # query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
801
+
802
+ # if query:
803
+ # start_date, end_date = extract_date_range(query)
804
+ # df2 = extract_topic_match(query, df)
805
+
806
+ # if df2.empty:
807
+ # df2 = df
808
+
809
+ # if start_date and end_date:
810
+ # df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
811
+ # st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
812
+ # else:
813
+ # mon, yr = extract_month_year(query)
814
+ # if yr:
815
+ # df2 = df2[df2['status_date'].dt.year == yr]
816
+ # if mon:
817
+ # df2 = df2[df2['status_date'].dt.month == mon]
818
+ # st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
819
+ # else:
820
+ # st.info(f"Filtering by year: **{yr}**")
821
+
822
+ # if df2.empty:
823
+ # st.warning("No matching records found.")
824
+ # else:
825
+ # texts = df2['combined_text'].tolist()
826
+ # embs = compute_embeddings(texts, _model=embed_model)
827
+ # res = semantic_search(query, embs, embed_model, threshold=0.5)
828
+
829
+ # if not res:
830
+ # st.warning("No relevant insights found.")
831
+ # else:
832
+ # st.subheader("Top Matching Insights")
833
+ # collected = []
834
+
835
+ # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
836
+ # row = df2.iloc[idx]
837
+ # date = row['status_date'].date()
838
+ # bill_number = row['bill_number']
839
+ # full_url = row['url']
840
+ # cat = row.get('Category & Subcategory', '')
841
+ # bene = row.get('Intended Beneficiaries', '')
842
+ # goal = row.get('Legislative Goal', '')
843
+ # impact = row.get('Policy Impact Areas', '')
844
+ # provision = row.get('Key Provisions', '')
845
+ # intent = row.get('Intent', '')
846
+ # stance = row.get('Stance', '')
847
+ # description = row.get('description', '')
848
+
849
+ # st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
850
+ # st.markdown(f"**Category:** {cat}")
851
+ # st.markdown(f"**Intended Beneficiaries:** {bene}")
852
+ # st.markdown(f"**Goal:** {goal}")
853
+ # st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
854
+ # st.markdown(f"**Policy Impact Area:** {impact}")
855
+ # st.markdown(f"**Key Provision:** {provision}")
856
+ # st.markdown(f"**Description:** {description}")
857
+ # st.markdown(f"[View Full Bill Text]({full_url})\n")
858
+ # st.divider()
859
+
860
+ # collected.append(row['combined_text'])
861
+
862
+ # st.subheader("RAG-Generated Overall Summary")
863
+ # summary = rag_summarize(collected, summarizer)
864
+ # st.success(summary)
865
+
866
+
867
+
868
+ #bartv2 - trying to make summary better
869
+
870
  import streamlit as st
871
  import pandas as pd
872
  import re
 
907
  @st.cache_resource
908
  def load_models():
909
  embed_model = SentenceTransformer('all-MiniLM-L6-v2')
 
910
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
911
  return embed_model, summarizer
912
 
 
914
  def compute_embeddings(texts, _model):
915
  return _model.encode(texts, show_progress_bar=True)
916
 
917
+ def rag_summarize(texts, summarizer, top_k=10):
 
 
 
 
 
918
  if not texts:
919
  return "No relevant content to summarize."
920
  vect = TfidfVectorizer()
 
924
  top_indices = scores.argsort()[::-1][:top_k]
925
  ctx = "\n".join(texts[i] for i in top_indices)
926
  prompt = "summarize: " + ctx[:1024]
927
+ out = summarizer(prompt, max_length=300, min_length=100, do_sample=False)
928
  return out[0]['summary_text']
929
 
930
  def extract_month_year(q):
 
963
 
964
  return None, None
965
 
 
966
  def extract_topic_match(query, df):
967
  query_lower = query.lower()
968
  return df[
 
1007
  if df2.empty:
1008
  st.warning("No matching records found.")
1009
  else:
1010
+ st.subheader("Top Matching Bills")
1011
+ for _, row in df2.iterrows():
1012
+ date = row['status_date'].date()
1013
+ bill_number = row['bill_number']
1014
+ full_url = row['url']
1015
+ cat = row.get('Category & Subcategory', '')
1016
+ bene = row.get('Intended Beneficiaries', '')
1017
+ goal = row.get('Legislative Goal', '')
1018
+ impact = row.get('Policy Impact Areas', '')
1019
+ provision = row.get('Key Provisions', '')
1020
+ intent = row.get('Intent', '')
1021
+ stance = row.get('Stance', '')
1022
+ description = row.get('description', '')
1023
+
1024
+ st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number}")
1025
+ st.markdown(f"**Category:** {cat}")
1026
+ st.markdown(f"**Intended Beneficiaries:** {bene}")
1027
+ st.markdown(f"**Goal:** {goal}")
1028
+ st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
1029
+ st.markdown(f"**Policy Impact Area:** {impact}")
1030
+ st.markdown(f"**Key Provision:** {provision}")
1031
+ st.markdown(f"**Description:** {description}")
1032
+ st.markdown(f"[View Full Bill Text]({full_url})\n")
1033
+ st.divider()
1034
+
1035
+ st.subheader("RAG-Generated Overall Summary of All Matching Bills")
1036
+ all_texts = df2['combined_text'].tolist()
1037
+ summary = rag_summarize(all_texts, summarizer, top_k=15)
1038
+ st.success(summary)
1039
+
 
 
 
 
 
 
 
 
 
 
1040
 
1041
 
1042