tjl8 commited on
Commit
65e5177
·
verified ·
1 Parent(s): a64bf13

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +187 -187
app.py CHANGED
@@ -674,194 +674,194 @@
674
 
675
 
676
  #BART
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
677
 
678
- import streamlit as st
679
- import pandas as pd
680
- import re
681
- from sentence_transformers import SentenceTransformer
682
- from transformers import pipeline
683
- from sklearn.metrics.pairwise import cosine_similarity
684
- from sklearn.feature_extraction.text import TfidfVectorizer
685
- from datetime import datetime
686
-
687
- def clean_text(text):
688
- text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
689
- text = re.sub(r"(?i)let me know if you'd like.*", "", text)
690
- text = re.sub(r"(?i)trend summary[:]*", "", text)
691
- text = re.sub(r"(?i)actionable insight[:]*", "", text)
692
- return text.strip()
693
-
694
- @st.cache_data
695
- def load_data():
696
- df = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv")
697
- df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
698
- df = df.dropna(subset=['status_date'])
699
-
700
- for col in ["Legislative Goal", "Policy Impact Areas", "Key Provisions",
701
- "Intended Beneficiaries", "Potential Impact", "description"]:
702
- df[col] = df[col].fillna("")
703
-
704
- df["combined_text"] = (
705
- "Legislative Goal: " + df["Legislative Goal"] + "\n" +
706
- "Policy Impact Areas: " + df["Policy Impact Areas"] + "\n" +
707
- "Key Provisions: " + df["Key Provisions"] + "\n" +
708
- "Intended Beneficiaries: " + df["Intended Beneficiaries"] + "\n" +
709
- "Potential Impact: " + df["Potential Impact"] + "\n" +
710
- "Description: " + df["description"]
711
- )
712
-
713
- return df
714
-
715
- @st.cache_resource
716
- def load_models():
717
- embed_model = SentenceTransformer('all-MiniLM-L6-v2')
718
- # Changed summarization model to facebook/bart-large-cnn for better summary quality
719
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
720
- return embed_model, summarizer
721
-
722
- @st.cache_data
723
- def compute_embeddings(texts, _model):
724
- return _model.encode(texts, show_progress_bar=True)
725
-
726
- def semantic_search(query, embeddings, model, threshold=0.5):
727
- query_embedding = model.encode([query])
728
- sims = cosine_similarity(query_embedding, embeddings)[0]
729
- return [(i, s) for i, s in enumerate(sims) if s > threshold]
730
-
731
- def rag_summarize(texts, summarizer, top_k=5):
732
- if not texts:
733
- return "No relevant content to summarize."
734
- vect = TfidfVectorizer()
735
- m = vect.fit_transform(texts)
736
- mean_vec = m.mean(axis=0).A
737
- scores = cosine_similarity(mean_vec, m).flatten()
738
- top_indices = scores.argsort()[::-1][:top_k]
739
- ctx = "\n".join(texts[i] for i in top_indices)
740
- prompt = "summarize: " + ctx[:1024]
741
- out = summarizer(prompt, max_length=200, min_length=80, do_sample=False)
742
- return out[0]['summary_text']
743
-
744
- def extract_month_year(q):
745
- month_map = {m: i for i, m in enumerate(
746
- ["january", "february", "march", "april", "may", "june",
747
- "july", "august", "september", "october", "november", "december"], 1)}
748
- ql = q.lower()
749
- mon = next((v for k, v in month_map.items() if k in ql), None)
750
- ym = re.search(r"(19|20)\d{2}", q)
751
- yr = int(ym.group()) if ym else None
752
- return mon, yr
753
-
754
- def extract_date_range(query):
755
- month_map = {
756
- "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
757
- "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
758
- }
759
-
760
- patterns = [
761
- r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})",
762
- ]
763
-
764
- for pattern in patterns:
765
- match = re.search(pattern, query)
766
- if match:
767
- start_month_str, start_year = match.group(1).lower(), int(match.group(2))
768
- end_month_str, end_year = match.group(3).lower(), int(match.group(4))
769
-
770
- start_month = month_map.get(start_month_str)
771
- end_month = month_map.get(end_month_str)
772
-
773
- if start_month and end_month:
774
- start_date = datetime(start_year, start_month, 1)
775
- end_date = datetime(end_year, end_month, 28)
776
- return start_date, end_date
777
-
778
- return None, None
779
-
780
-
781
- def extract_topic_match(query, df):
782
- query_lower = query.lower()
783
- return df[
784
- df['Category & Subcategory'].fillna('').str.lower().str.contains(query_lower) |
785
- df['Intent'].fillna('').str.lower().str.contains(query_lower) |
786
- df['Legislative Goal'].fillna('').str.lower().str.contains(query_lower) |
787
- df['Policy Impact Areas'].fillna('').str.lower().str.contains(query_lower) |
788
- df['Key Provisions'].fillna('').str.lower().str.contains(query_lower) |
789
- df['Potential Impact'].fillna('').str.lower().str.contains(query_lower)
790
- ]
791
-
792
-
793
- st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
794
- st.title("Illinois Legislative Trends Q&A")
795
- st.markdown("Ask about trends in topics like higher education, funding, etc.")
796
-
797
- df = load_data()
798
- embed_model, summarizer = load_models()
799
-
800
- query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
801
-
802
- if query:
803
- start_date, end_date = extract_date_range(query)
804
- df2 = extract_topic_match(query, df)
805
-
806
- if df2.empty:
807
- df2 = df
808
-
809
- if start_date and end_date:
810
- df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
811
- st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
812
- else:
813
- mon, yr = extract_month_year(query)
814
- if yr:
815
- df2 = df2[df2['status_date'].dt.year == yr]
816
- if mon:
817
- df2 = df2[df2['status_date'].dt.month == mon]
818
- st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
819
- else:
820
- st.info(f"Filtering by year: **{yr}**")
821
-
822
- if df2.empty:
823
- st.warning("No matching records found.")
824
- else:
825
- texts = df2['combined_text'].tolist()
826
- embs = compute_embeddings(texts, _model=embed_model)
827
- res = semantic_search(query, embs, embed_model, threshold=0.5)
828
-
829
- if not res:
830
- st.warning("No relevant insights found.")
831
- else:
832
- st.subheader("Top Matching Insights")
833
- collected = []
834
-
835
- for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
836
- row = df2.iloc[idx]
837
- date = row['status_date'].date()
838
- bill_number = row['bill_number']
839
- full_url = row['url']
840
- cat = row.get('Category & Subcategory', '')
841
- bene = row.get('Intended Beneficiaries', '')
842
- goal = row.get('Legislative Goal', '')
843
- impact = row.get('Policy Impact Areas', '')
844
- provision = row.get('Key Provisions', '')
845
- intent = row.get('Intent', '')
846
- stance = row.get('Stance', '')
847
- description = row.get('description', '')
848
-
849
- st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
850
- st.markdown(f"**Category:** {cat}")
851
- st.markdown(f"**Intended Beneficiaries:** {bene}")
852
- st.markdown(f"**Goal:** {goal}")
853
- st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
854
- st.markdown(f"**Policy Impact Area:** {impact}")
855
- st.markdown(f"**Key Provision:** {provision}")
856
- st.markdown(f"**Description:** {description}")
857
- st.markdown(f"[View Full Bill Text]({full_url})\n")
858
- st.divider()
859
-
860
- collected.append(row['combined_text'])
861
-
862
- st.subheader("RAG-Generated Overall Summary")
863
- summary = rag_summarize(collected, summarizer)
864
- st.success(summary)
865
 
866
 
867
 
 
674
 
675
 
676
  #BART
677
+ import streamlit as st
678
+ import pandas as pd
679
+ import re
680
+ from sentence_transformers import SentenceTransformer
681
+ from transformers import pipeline
682
+ from sklearn.metrics.pairwise import cosine_similarity
683
+ from sklearn.feature_extraction.text import TfidfVectorizer
684
+ from datetime import datetime
685
+
686
def clean_text(text):
    """Strip boilerplate phrases that the summarizer tends to prepend/append."""
    boilerplate = [
        r"(?i)(here is|here are) the requested output[s]*[:]*",
        r"(?i)let me know if you'd like.*",
        r"(?i)trend summary[:]*",
        r"(?i)actionable insight[:]*",
    ]
    # Apply the removals in order, then trim the leftover whitespace.
    for pattern in boilerplate:
        text = re.sub(pattern, "", text)
    return text.strip()
692
+
693
@st.cache_data
def load_data():
    """Load the bills CSV, parse status dates, and build a combined text blob per bill."""
    df = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv")

    # Dates arrive as day-month-year; rows whose date fails to parse are discarded.
    df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
    df = df.dropna(subset=['status_date'])

    # (display label, column name) pairs, in the order they appear in combined_text.
    sections = [
        ("Legislative Goal", "Legislative Goal"),
        ("Policy Impact Areas", "Policy Impact Areas"),
        ("Key Provisions", "Key Provisions"),
        ("Intended Beneficiaries", "Intended Beneficiaries"),
        ("Potential Impact", "Potential Impact"),
        ("Description", "description"),
    ]

    # Blank out missing values so the string concatenation below never sees NaN.
    for _, col in sections:
        df[col] = df[col].fillna("")

    # One labelled, newline-separated blob per bill (no trailing newline),
    # used later for both embeddings and TF-IDF ranking.
    combined = None
    for label, col in sections:
        piece = label + ": " + df[col]
        combined = piece if combined is None else combined + "\n" + piece
    df["combined_text"] = combined

    return df
713
+
714
@st.cache_resource
def load_models():
    """Instantiate the sentence embedder and the BART summarization pipeline.

    Cached as a resource so the (large) models are loaded once per process.
    """
    # Summarization model is facebook/bart-large-cnn, chosen for summary quality.
    bart = "facebook/bart-large-cnn"
    summarizer = pipeline("summarization", model=bart, tokenizer=bart)
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    return embed_model, summarizer
720
+
721
@st.cache_data
def compute_embeddings(texts, _model):
    """Return sentence embeddings for *texts*.

    The leading underscore on ``_model`` tells Streamlit's cache to skip
    hashing the (unhashable) model object.
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors
724
+
725
def semantic_search(query, embeddings, model, threshold=0.5):
    """Return (row index, similarity) pairs whose cosine score exceeds *threshold*."""
    scores = cosine_similarity(model.encode([query]), embeddings)[0]
    hits = []
    for idx, score in enumerate(scores):
        if score > threshold:
            hits.append((idx, score))
    return hits
729
+
730
def rag_summarize(texts, summarizer, top_k=5):
    """Summarize the texts most representative of the collection.

    Ranks every text by TF-IDF cosine similarity to the corpus centroid,
    joins the *top_k* most central ones, and feeds a 1024-character prefix
    of that context to the summarization pipeline.
    """
    if not texts:
        return "No relevant content to summarize."
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(texts)
    # Dense corpus centroid; .A converts the numpy matrix to an ndarray.
    centroid = matrix.mean(axis=0).A
    similarity = cosine_similarity(centroid, matrix).flatten()
    ranked = similarity.argsort()[::-1]
    context = "\n".join(texts[i] for i in ranked[:top_k])
    prompt = "summarize: " + context[:1024]
    result = summarizer(prompt, max_length=200, min_length=80, do_sample=False)
    return result[0]['summary_text']
742
+
743
def extract_month_year(q):
    """Pull a (month, year) pair out of free text.

    Month names must appear as whole words, so e.g. "mayor" no longer
    registers as May (the old substring test did). Returns ``(None, None)``
    components when a month and/or a 19xx/20xx year is absent.
    """
    months = ["january", "february", "march", "april", "may", "june",
              "july", "august", "september", "october", "november", "december"]
    ql = q.lower()
    mon = None
    for i, name in enumerate(months, 1):
        # \b guards prevent matching a month name embedded in a longer word.
        if re.search(r"\b" + name + r"\b", ql):
            mon = i
            break
    # Whole-word four-digit year, restricted to 19xx/20xx.
    ym = re.search(r"\b(19|20)\d{2}\b", q)
    yr = int(ym.group()) if ym else None
    return mon, yr
752
+
753
def extract_date_range(query):
    """Parse "from <month> <year> to <month> <year>" into (start, end) datetimes.

    Accepts full month names or prefixes of at least three letters
    ("Jan", "Sept", ...) — the old exact-name lookup failed on the app's own
    example query "Trends from Jan 2024 to May 2025". The end date is the
    true last day of the end month (the old hard-coded 28th silently
    excluded bills dated the 29th–31st). Returns (None, None) when no
    range is found.
    """
    import calendar

    month_names = ["january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november", "december"]

    def month_number(token):
        # Whole-name or >=3-letter-prefix match, e.g. "jan" -> 1; None if no month.
        token = token.lower()
        if len(token) < 3:
            return None
        for idx, name in enumerate(month_names, 1):
            if name.startswith(token):
                return idx
        return None

    pattern = (r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})"
               r"\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})")
    match = re.search(pattern, query)
    if match:
        start_month = month_number(match.group(1))
        start_year = int(match.group(2))
        end_month = month_number(match.group(3))
        end_year = int(match.group(4))

        if start_month and end_month:
            start_date = datetime(start_year, start_month, 1)
            # Last calendar day of the end month (handles 28/29/30/31 correctly).
            last_day = calendar.monthrange(end_year, end_month)[1]
            end_date = datetime(end_year, end_month, last_day)
            return start_date, end_date

    return None, None
778
+
779
+
780
def extract_topic_match(query, df):
    """Return rows whose descriptive columns contain the query text.

    The query is matched as a literal string (``regex=False``) so user
    input containing regex metacharacters such as "(" can no longer
    crash ``str.contains`` with ``re.error``.
    """
    needle = query.lower()
    searchable = ['Category & Subcategory', 'Intent', 'Legislative Goal',
                  'Policy Impact Areas', 'Key Provisions', 'Potential Impact']
    mask = None
    for col in searchable:
        col_mask = df[col].fillna('').str.lower().str.contains(needle, regex=False)
        mask = col_mask if mask is None else (mask | col_mask)
    return df[mask]
790
+
791
+
792
# ---------------------------------------------------------------------------
# Streamlit UI: query box -> optional date/topic filtering -> semantic search
# over the filtered bills -> per-bill cards -> one RAG summary of the hits.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
st.title("Illinois Legislative Trends Q&A")
st.markdown("Ask about trends in topics like higher education, funding, etc.")

df = load_data()
embed_model, summarizer = load_models()

query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")

if query:
    # Prefer an explicit "from X to Y" range; otherwise fall back to a
    # single month/year mentioned in the query.
    start_date, end_date = extract_date_range(query)
    df2 = extract_topic_match(query, df)

    if df2.empty:
        # No topic keywords matched — search the whole corpus instead.
        df2 = df

    if start_date and end_date:
        df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
        st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
    else:
        mon, yr = extract_month_year(query)
        if yr:
            df2 = df2[df2['status_date'].dt.year == yr]
            if mon:
                df2 = df2[df2['status_date'].dt.month == mon]
                st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
            else:
                st.info(f"Filtering by year: **{yr}**")

    if df2.empty:
        st.warning("No matching records found.")
    else:
        # Embed only the filtered rows, then rank them against the query.
        texts = df2['combined_text'].tolist()
        embs = compute_embeddings(texts, _model=embed_model)
        res = semantic_search(query, embs, embed_model, threshold=0.5)

        if not res:
            st.warning("No relevant insights found.")
        else:
            st.subheader("Top Matching Insights")
            collected = []

            # Render the ten highest-scoring bills, best first.
            for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
                row = df2.iloc[idx]
                date = row['status_date'].date()
                bill_number = row['bill_number']
                full_url = row['url']
                cat = row.get('Category & Subcategory', '')
                bene = row.get('Intended Beneficiaries', '')
                goal = row.get('Legislative Goal', '')
                impact = row.get('Policy Impact Areas', '')
                provision = row.get('Key Provisions', '')
                intent = row.get('Intent', '')
                stance = row.get('Stance', '')
                description = row.get('description', '')

                st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
                st.markdown(f"**Category:** {cat}")
                st.markdown(f"**Intended Beneficiaries:** {bene}")
                st.markdown(f"**Goal:** {goal}")
                st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
                st.markdown(f"**Policy Impact Area:** {impact}")
                st.markdown(f"**Key Provision:** {provision}")
                st.markdown(f"**Description:** {description}")
                st.markdown(f"[View Full Bill Text]({full_url})\n")
                st.divider()

                collected.append(row['combined_text'])

            st.subheader("RAG-Generated Overall Summary")
            summary = rag_summarize(collected, summarizer)
            st.success(summary)
864
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
865
 
866
 
867