# import streamlit as st # import pandas as pd # import re # from sentence_transformers import SentenceTransformer # from transformers import pipeline # from sklearn.metrics.pairwise import cosine_similarity # from sklearn.feature_extraction.text import TfidfVectorizer # from datetime import datetime # def clean_text(text): # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text) # text = re.sub(r"(?i)let me know if you'd like.*", "", text) # text = re.sub(r"(?i)trend summary[:]*", "", text) # text = re.sub(r"(?i)actionable insight[:]*", "", text) # return text.strip() # @st.cache_data # def load_data(): # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv") # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce') # df = df.dropna(subset=['status_date']) # df["llama_trend_summary"] = df["llama_trend_summary"].fillna("") # df["llama_insight"] = df["llama_insight"].fillna("") # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"] # return df # @st.cache_resource # def load_models(): # embed_model = SentenceTransformer('all-MiniLM-L6-v2') # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small") # return embed_model, summarizer # @st.cache_data # def compute_embeddings(texts, _model): # return _model.encode(texts, show_progress_bar=True) # def semantic_search(query, embeddings, model, threshold=0.5): # query_embedding = model.encode([query]) # sims = cosine_similarity(query_embedding, embeddings)[0] # return [(i, s) for i, s in enumerate(sims) if s > threshold] # def rag_summarize(texts, summarizer, top_k=5): # if not texts: # return "No relevant content to summarize." 
# vect = TfidfVectorizer() # m = vect.fit_transform(texts) # mean_vec = m.mean(axis=0).A # scores = cosine_similarity(mean_vec, m).flatten() # top_indices = scores.argsort()[::-1][:top_k] # ctx = "\n".join(texts[i] for i in top_indices) # prompt = "summarize: " + ctx[:1024] # out = summarizer(prompt, max_length=60, min_length=30, do_sample=False) # return out[0]['summary_text'] # def extract_month_year(q): # month_map = {m: i for i, m in enumerate( # ["january", "february", "march", "april", "may", "june", # "july", "august", "september", "october", "november", "december"], 1)} # ql = q.lower() # mon = next((v for k, v in month_map.items() if k in ql), None) # ym = re.search(r"(19|20)\d{2}", q) # yr = int(ym.group()) if ym else None # return mon, yr # def extract_topic_match(query, df): # query_lower = query.lower() # return df[ # df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) | # df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) | # df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) | # df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower) # ] # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide") # st.title("Illinois Legislative Trends Q&A") # st.markdown("Ask about trends in topics like higher education, funding, etc.") # df = load_data() # embed_model, summarizer = load_models() # query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):") # if query: # mon, yr = extract_month_year(query) # df2 = extract_topic_match(query, df) # if df2.empty: # df2 = df # if yr: # df2 = df2[df2['status_date'].dt.year == yr] # if mon: # df2 = df2[df2['status_date'].dt.month == mon] # st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**") # else: # st.info(f" Filtering by year: **{yr}**") # if df2.empty: # st.warning("No matching records found.") # else: # texts = 
df2['summary_insight'].tolist() # embs = compute_embeddings(texts, _model=embed_model) # res = semantic_search(query, embs, embed_model, threshold=0.5) # if not res: # st.warning("No relevant insights found.") # else: # st.subheader(" Top Matching Insights") # collected = [] # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]: # row = df2.iloc[idx] # date = row['status_date'].date() # bill_number = row['bill_number'] # full_url = row['url'] # cat = row['Category & Subcategory'] # cat_std = row['category_&_subcategory_standardized2'] # bene= row['Intended Beneficiaries'] # bene_std= row['intended_beneficiaries_standardized2'] # goal = row['Legislative Goal'] # impact = row['Policy Impact Areas'] # provision = row['Key Provisions'] # intent = row['Intent'] # stance = row['Stance'] # description = row['description'] # summary = row['summary'] # trend = clean_text(row['llama_trend_summary']) # insight = clean_text(row['llama_insight']) # st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}") # st.markdown(f"**Category:** {cat}") # st.markdown(f"**Category Std:** {cat_std}") # st.markdown(f"** Intended Beneficiaries:** {bene}") # st.markdown(f"** Intended Beneficiaries STD:** {bene_std}") # st.markdown(f"**Goal:** {goal}") # st.markdown(f"**Intent:** {intent} | **Stance:** {stance}") # st.markdown(f"**Policy Impacy Area:** {impact}") # st.markdown(f"**Key Provision:** {provision}") # st.markdown(f"**Description:** {description}") # st.markdown(f"**Summary:** {summary}") # st.markdown(f"Trend Summary:{trend}") # st.markdown(f"Actionable Insight:{insight}") # st.markdown(f"[View Full Bill Text]({full_url})\n") # st.divider() # collected.append(row['summary_insight']) # st.subheader("RAG-Generated Overall Summary") # summary = rag_summarize(collected, summarizer) # st.success(summary) # import streamlit as st # import pandas as pd # import re # from sentence_transformers import SentenceTransformer # from transformers 
import pipeline # from sklearn.metrics.pairwise import cosine_similarity # from sklearn.feature_extraction.text import TfidfVectorizer # from datetime import datetime # def clean_text(text): # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text) # text = re.sub(r"(?i)let me know if you'd like.*", "", text) # text = re.sub(r"(?i)trend summary[:]*", "", text) # text = re.sub(r"(?i)actionable insight[:]*", "", text) # return text.strip() # @st.cache_data # def load_data(): # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv") # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce') # df = df.dropna(subset=['status_date']) # df["llama_trend_summary"] = df["llama_trend_summary"].fillna("") # df["llama_insight"] = df["llama_insight"].fillna("") # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"] # return df # @st.cache_resource # def load_models(): # embed_model = SentenceTransformer('all-MiniLM-L6-v2') # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small") # return embed_model, summarizer # @st.cache_data # def compute_embeddings(texts, _model): # return _model.encode(texts, show_progress_bar=True) # def semantic_search(query, embeddings, model, threshold=0.5): # query_embedding = model.encode([query]) # sims = cosine_similarity(query_embedding, embeddings)[0] # return [(i, s) for i, s in enumerate(sims) if s > threshold] # def rag_summarize(texts, summarizer, top_k=10): # increased from 5 to 10 # if not texts: # return "No relevant content to summarize." 
# vect = TfidfVectorizer() # m = vect.fit_transform(texts) # mean_vec = m.mean(axis=0).A # scores = cosine_similarity(mean_vec, m).flatten() # top_indices = scores.argsort()[::-1][:top_k] # ctx = "\n".join(texts[i] for i in top_indices) # prompt = "summarize: " + ctx[:1024] # out = summarizer(prompt, max_length=150, min_length=80, do_sample=False) # updated length # return out[0]['summary_text'] # def extract_month_year(q): # month_map = {m: i for i, m in enumerate( # ["january", "february", "march", "april", "may", "june", # "july", "august", "september", "october", "november", "december"], 1)} # ql = q.lower() # mon = next((v for k, v in month_map.items() if k in ql), None) # ym = re.search(r"(19|20)\d{2}", q) # yr = int(ym.group()) if ym else None # return mon, yr # def extract_topic_match(query, df): # query_lower = query.lower() # return df[ # df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) | # df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) | # df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) | # df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower) # ] # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide") # st.title("Illinois Legislative Trends Q&A") # st.markdown("Ask about trends in topics like higher education, funding, etc.") # df = load_data() # embed_model, summarizer = load_models() # query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):") # if query: # mon, yr = extract_month_year(query) # df2 = extract_topic_match(query, df) # if df2.empty: # df2 = df # if yr: # df2 = df2[df2['status_date'].dt.year == yr] # if mon: # df2 = df2[df2['status_date'].dt.month == mon] # st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**") # else: # st.info(f" Filtering by year: **{yr}**") # if df2.empty: # st.warning("No matching records found.") # else: # texts = 
df2['summary_insight'].tolist() # embs = compute_embeddings(texts, _model=embed_model) # res = semantic_search(query, embs, embed_model, threshold=0.5) # if not res: # st.warning("No relevant insights found.") # else: # st.subheader("Top Matching Insights") # collected = [] # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: # increased to 10 # row = df2.iloc[idx] # date = row['status_date'].date() # bill_number = row['bill_number'] # full_url = row['url'] # cat = row['Category & Subcategory'] # cat_std = row['category_&_subcategory_standardized2'] # bene= row['Intended Beneficiaries'] # bene_std= row['intended_beneficiaries_standardized2'] # goal = row['Legislative Goal'] # impact = row['Policy Impact Areas'] # provision = row['Key Provisions'] # intent = row['Intent'] # stance = row['Stance'] # description = row['description'] # summary = row['summary'] # trend = clean_text(row['llama_trend_summary']) # insight = clean_text(row['llama_insight']) # st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}") # st.markdown(f"**Category:** {cat}") # # st.markdown(f"**Category Std:** {cat_std}") # st.markdown(f"**Intended Beneficiaries:** {bene}") # # st.markdown(f"**Intended Beneficiaries STD:** {bene_std}") # st.markdown(f"**Goal:** {goal}") # st.markdown(f"**Intent:** {intent} | **Stance:** {stance}") # st.markdown(f"**Policy Impact Area:** {impact}") # st.markdown(f"**Key Provision:** {provision}") # st.markdown(f"**Description:** {description}") # # st.markdown(f"**Summary:** {summary}") # st.markdown(f"**Trend Summary:** {trend}") # st.markdown(f"**Actionable Insight:** {insight}") # st.markdown(f"[View Full Bill Text]({full_url})\n") # st.divider() # collected.append(row['summary_insight']) # st.subheader("RAG-Generated Overall Summary") # summary = rag_summarize(collected, summarizer) # st.success(summary) # # including description # import streamlit as st # import pandas as pd # import re # from 
sentence_transformers import SentenceTransformer # from transformers import pipeline # from sklearn.metrics.pairwise import cosine_similarity # from sklearn.feature_extraction.text import TfidfVectorizer # from datetime import datetime # def clean_text(text): # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text) # text = re.sub(r"(?i)let me know if you'd like.*", "", text) # text = re.sub(r"(?i)trend summary[:]*", "", text) # text = re.sub(r"(?i)actionable insight[:]*", "", text) # return text.strip() # @st.cache_data # def load_data(): # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv") # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce') # df = df.dropna(subset=['status_date']) # df["llama_trend_summary"] = df["llama_trend_summary"].fillna("") # df["llama_insight"] = df["llama_insight"].fillna("") # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"] # return df # @st.cache_resource # def load_models(): # embed_model = SentenceTransformer('all-MiniLM-L6-v2') # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small") # return embed_model, summarizer # @st.cache_data # def compute_embeddings(texts, _model): # return _model.encode(texts, show_progress_bar=True) # def semantic_search(query, embeddings, model, threshold=0.5): # query_embedding = model.encode([query]) # sims = cosine_similarity(query_embedding, embeddings)[0] # return [(i, s) for i, s in enumerate(sims) if s > threshold] # def rag_summarize(texts, summarizer, top_k=10): # if not texts: # return "No relevant content to summarize." 
# vect = TfidfVectorizer() # m = vect.fit_transform(texts) # mean_vec = m.mean(axis=0).A # scores = cosine_similarity(mean_vec, m).flatten() # top_indices = scores.argsort()[::-1][:top_k] # ctx = "\n".join(texts[i] for i in top_indices) # prompt = "summarize: " + ctx[:1024] # out = summarizer(prompt, max_length=200, min_length=80, do_sample=False) # return out[0]['summary_text'] # def extract_month_year(q): # month_map = {m: i for i, m in enumerate( # ["january", "february", "march", "april", "may", "june", # "july", "august", "september", "october", "november", "december"], 1)} # ql = q.lower() # mon = next((v for k, v in month_map.items() if k in ql), None) # ym = re.search(r"(19|20)\d{2}", q) # yr = int(ym.group()) if ym else None # return mon, yr # def extract_date_range(query): # month_map = { # "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6, # "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12 # } # patterns = [ # r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})", # ] # for pattern in patterns: # match = re.search(pattern, query) # if match: # start_month_str, start_year = match.group(1).lower(), int(match.group(2)) # end_month_str, end_year = match.group(3).lower(), int(match.group(4)) # start_month = month_map.get(start_month_str) # end_month = month_map.get(end_month_str) # if start_month and end_month: # start_date = datetime(start_year, start_month, 1) # end_date = datetime(end_year, end_month, 28) # return start_date, end_date # return None, None # def extract_topic_match(query, df): # query_lower = query.lower() # return df[ # df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) | # df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) | # df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) | # 
df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower) # ] # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide") # st.title("Illinois Legislative Trends Q&A") # st.markdown("Ask about trends in topics like higher education, funding, etc.") # df = load_data() # embed_model, summarizer = load_models() # query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):") # if query: # start_date, end_date = extract_date_range(query) # df2 = extract_topic_match(query, df) # if df2.empty: # df2 = df # if start_date and end_date: # df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)] # st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**") # else: # mon, yr = extract_month_year(query) # if yr: # df2 = df2[df2['status_date'].dt.year == yr] # if mon: # df2 = df2[df2['status_date'].dt.month == mon] # st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**") # else: # st.info(f"Filtering by year: **{yr}**") # if df2.empty: # st.warning("No matching records found.") # else: # # Include description in embeddings + RAG # texts = (df2['description'].fillna('') + "\n" + df2['summary_insight'].fillna('')).tolist() # embs = compute_embeddings(texts, _model=embed_model) # res = semantic_search(query, embs, embed_model, threshold=0.5) # if not res: # st.warning("No relevant insights found.") # else: # st.subheader("Top Matching Insights") # collected = [] # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: # row = df2.iloc[idx] # date = row['status_date'].date() # bill_number = row['bill_number'] # full_url = row['url'] # cat = row['Category & Subcategory'] # cat_std = row['category_&_subcategory_standardized2'] # bene = row['Intended Beneficiaries'] # bene_std = row['intended_beneficiaries_standardized2'] # goal = row['Legislative Goal'] # impact = row['Policy Impact Areas'] # provision = row['Key Provisions'] # intent = row['Intent'] # 
stance = row['Stance'] # description = row['description'] # summary = row['summary'] # trend = clean_text(row['llama_trend_summary']) # insight = clean_text(row['llama_insight']) # st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}") # st.markdown(f"**Category:** {cat}") # st.markdown(f"**Intended Beneficiaries:** {bene}") # st.markdown(f"**Goal:** {goal}") # st.markdown(f"**Intent:** {intent} | **Stance:** {stance}") # st.markdown(f"**Policy Impact Area:** {impact}") # st.markdown(f"**Key Provision:** {provision}") # st.markdown(f"**Description:** {description}") # st.markdown(f"**Trend Summary:** {trend}") # st.markdown(f"**Actionable Insight:** {insight}") # st.markdown(f"[View Full Bill Text]({full_url})\n") # st.divider() # collected.append(description + "\n" + row['summary_insight']) # st.subheader("RAG-Generated Overall Summary") # summary = rag_summarize(collected, summarizer) # st.success(summary) ## NEW ONE # import streamlit as st # import pandas as pd # import re # from sentence_transformers import SentenceTransformer # from transformers import pipeline # from sklearn.metrics.pairwise import cosine_similarity # from sklearn.feature_extraction.text import TfidfVectorizer # from datetime import datetime # def clean_text(text): # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text) # text = re.sub(r"(?i)let me know if you'd like.*", "", text) # text = re.sub(r"(?i)trend summary[:]*", "", text) # text = re.sub(r"(?i)actionable insight[:]*", "", text) # return text.strip() # @st.cache_data # def load_data(): # df = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv") # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce') # df = df.dropna(subset=['status_date']) # for col in ["Legislative Goal", "Policy Impact Areas", "Key Provisions", # "Intended Beneficiaries", "Potential Impact", "description"]: # df[col] = 
df[col].fillna("") # df["combined_text"] = ( # "Legislative Goal: " + df["Legislative Goal"] + "\n" + # "Policy Impact Areas: " + df["Policy Impact Areas"] + "\n" + # "Key Provisions: " + df["Key Provisions"] + "\n" + # "Intended Beneficiaries: " + df["Intended Beneficiaries"] + "\n" + # "Potential Impact: " + df["Potential Impact"] + "\n" + # "Description: " + df["description"] # ) # return df # @st.cache_resource # def load_models(): # embed_model = SentenceTransformer('all-MiniLM-L6-v2') # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small") # return embed_model, summarizer # @st.cache_data # def compute_embeddings(texts, _model): # return _model.encode(texts, show_progress_bar=True) # def semantic_search(query, embeddings, model, threshold=0.5): # query_embedding = model.encode([query]) # sims = cosine_similarity(query_embedding, embeddings)[0] # return [(i, s) for i, s in enumerate(sims) if s > threshold] # def rag_summarize(texts, summarizer, top_k=5): # if not texts: # return "No relevant content to summarize." 
# vect = TfidfVectorizer() # m = vect.fit_transform(texts) # mean_vec = m.mean(axis=0).A # scores = cosine_similarity(mean_vec, m).flatten() # top_indices = scores.argsort()[::-1][:top_k] # ctx = "\n".join(texts[i] for i in top_indices) # prompt = "summarize: " + ctx[:1024] # out = summarizer(prompt, max_length=200, min_length=80, do_sample=False) # return out[0]['summary_text'] # def extract_month_year(q): # month_map = {m: i for i, m in enumerate( # ["january", "february", "march", "april", "may", "june", # "july", "august", "september", "october", "november", "december"], 1)} # ql = q.lower() # mon = next((v for k, v in month_map.items() if k in ql), None) # ym = re.search(r"(19|20)\d{2}", q) # yr = int(ym.group()) if ym else None # return mon, yr # def extract_date_range(query): # month_map = { # "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6, # "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12 # } # patterns = [ # r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})", # ] # for pattern in patterns: # match = re.search(pattern, query) # if match: # start_month_str, start_year = match.group(1).lower(), int(match.group(2)) # end_month_str, end_year = match.group(3).lower(), int(match.group(4)) # start_month = month_map.get(start_month_str) # end_month = month_map.get(end_month_str) # if start_month and end_month: # start_date = datetime(start_year, start_month, 1) # end_date = datetime(end_year, end_month, 28) # return start_date, end_date # return None, None # def extract_topic_match(query, df): # query_lower = query.lower() # return df[ # df['Category & Subcategory'].fillna('').str.lower().str.contains(query_lower) | # df['Intent'].fillna('').str.lower().str.contains(query_lower) | # df['Legislative Goal'].fillna('').str.lower().str.contains(query_lower) | # df['Policy Impact Areas'].fillna('').str.lower().str.contains(query_lower) | # df['Key 
Provisions'].fillna('').str.lower().str.contains(query_lower) | # df['Potential Impact'].fillna('').str.lower().str.contains(query_lower) # ] # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide") # st.title("Illinois Legislative Trends Q&A") # st.markdown("Ask about trends in topics like higher education, funding, etc.") # df = load_data() # embed_model, summarizer = load_models() # query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):") # if query: # start_date, end_date = extract_date_range(query) # df2 = extract_topic_match(query, df) # if df2.empty: # df2 = df # if start_date and end_date: # df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)] # st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**") # else: # mon, yr = extract_month_year(query) # if yr: # df2 = df2[df2['status_date'].dt.year == yr] # if mon: # df2 = df2[df2['status_date'].dt.month == mon] # st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**") # else: # st.info(f"Filtering by year: **{yr}**") # if df2.empty: # st.warning("No matching records found.") # else: # texts = df2['combined_text'].tolist() # embs = compute_embeddings(texts, _model=embed_model) # res = semantic_search(query, embs, embed_model, threshold=0.5) # if not res: # st.warning("No relevant insights found.") # else: # st.subheader("Top Matching Insights") # collected = [] # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: # row = df2.iloc[idx] # date = row['status_date'].date() # bill_number = row['bill_number'] # full_url = row['url'] # cat = row.get('Category & Subcategory', '') # bene = row.get('Intended Beneficiaries', '') # goal = row.get('Legislative Goal', '') # impact = row.get('Policy Impact Areas', '') # provision = row.get('Key Provisions', '') # intent = row.get('Intent', '') # stance = row.get('Stance', '') # description = row.get('description', '') # st.markdown(f"**Date:** {date} | 
# **Bill Number:** {bill_number} | **Score:** {score:.2f}")
# st.markdown(f"**Category:** {cat}")
# st.markdown(f"**Intended Beneficiaries:** {bene}")
# st.markdown(f"**Goal:** {goal}")
# st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
# st.markdown(f"**Policy Impact Area:** {impact}")
# st.markdown(f"**Key Provision:** {provision}")
# st.markdown(f"**Description:** {description}")
# st.markdown(f"[View Full Bill Text]({full_url})\n")
# st.divider()
# collected.append(row['combined_text'])
# st.subheader("RAG-Generated Overall Summary")
# summary = rag_summarize(collected, summarizer)
# st.success(summary)

# BART
import re
from datetime import datetime

import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

# CSV file read by load_data().
DATA_PATH = "Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv"

# (label, column) pairs joined, in this order, into the "combined_text" column.
_TEXT_FIELDS = (
    ("Legislative Goal", "Legislative Goal"),
    ("Policy Impact Areas", "Policy Impact Areas"),
    ("Key Provisions", "Key Provisions"),
    ("Intended Beneficiaries", "Intended Beneficiaries"),
    ("Potential Impact", "Potential Impact"),
    ("Description", "description"),
)

# Phrases removed by clean_text(); all case-insensitive, applied in this order.
_BOILERPLATE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"(?i)(here is|here are) the requested output[s]*[:]*",
        r"(?i)let me know if you'd like.*",
        r"(?i)trend summary[:]*",
        r"(?i)actionable insight[:]*",
    )
)


def clean_text(text):
    """Remove known boilerplate phrases from *text* and strip whitespace.

    Args:
        text: The string to clean. Non-string values (for example the float
            NaN pandas produces for an empty CSV cell) are treated as empty.

    Returns:
        The cleaned string, or "" when *text* is not a string.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; an empty cell has no text.
        return ""
    for pattern in _BOILERPLATE_PATTERNS:
        # "." does not match newlines here, so the "let me know..." pattern
        # only removes the rest of that line.
        text = pattern.sub("", text)
    return text.strip()


@st.cache_data
def load_data():
    """Load the bills CSV and prepare it for search.

    Rows whose ``status_date`` cannot be parsed as ``%d-%m-%Y`` are dropped.
    Missing values in the text columns listed in ``_TEXT_FIELDS`` become "",
    and a ``combined_text`` column is added that holds one "Label: value"
    line per field.

    Returns:
        The prepared ``pandas.DataFrame`` (cached by Streamlit).
    """
    df = pd.read_csv(DATA_PATH)
    df["status_date"] = pd.to_datetime(
        df["status_date"], format="%d-%m-%Y", errors="coerce"
    )
    # .copy() so the column assignments below write to an independent frame.
    df = df.dropna(subset=["status_date"]).copy()

    labelled = []
    for label, col in _TEXT_FIELDS:
        # astype(str) keeps the concatenation valid even if pandas inferred a
        # non-string dtype for the column.
        df[col] = df[col].fillna("").astype(str)
        labelled.append(label + ": " + df[col])

    df["combined_text"] = labelled[0].str.cat(labelled[1:], sep="\n")
    return df
@st.cache_resource
def load_models():
    """Load the embedding model and the summarization pipeline.

    Returns:
        A ``(embed_model, summarizer)`` tuple: the ``all-MiniLM-L6-v2``
        SentenceTransformer and a ``facebook/bart-large-cnn`` summarization
        pipeline. Cached by Streamlit as a shared resource.
    """
    embed_model = SentenceTransformer("all-MiniLM-L6-v2")
    # Changed summarization model to facebook/bart-large-cnn for better
    # summary quality.
    summarizer = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        tokenizer="facebook/bart-large-cnn",
    )
    return embed_model, summarizer


@st.cache_data
def compute_embeddings(texts, _model):
    """Encode *texts* with *_model* and return the embeddings.

    The leading underscore on ``_model`` tells Streamlit not to hash that
    argument, so the cache key depends on *texts* only.
    """
    return _model.encode(texts, show_progress_bar=True)


def semantic_search(query, embeddings, model, threshold=0.5):
    """Return ``(row_index, similarity)`` pairs that beat *threshold*.

    Args:
        query: Free-text query; encoded with *model*.
        embeddings: Embeddings to compare against, one row per document.
        model: Object with an ``encode`` method (the embedding model).
        threshold: Only cosine similarities strictly greater than this are
            kept.

    Returns:
        A list of ``(index, similarity)`` tuples in document order.
    """
    query_embedding = model.encode([query])
    sims = cosine_similarity(query_embedding, embeddings)[0]
    return [(i, s) for i, s in enumerate(sims) if s > threshold]


def rag_summarize(texts, summarizer, top_k=5, max_chars=1024):
    """Summarize the most central of *texts* with *summarizer*.

    The *top_k* texts closest (TF-IDF cosine similarity) to the mean document
    vector are joined with newlines, cut to *max_chars* characters and passed
    to the summarization pipeline.

    Args:
        texts: List of strings to draw context from.
        summarizer: Summarization pipeline; called with the context string.
        top_k: How many texts to include in the context.
        max_chars: Maximum number of context characters sent to the model.

    Returns:
        The generated summary, or a fixed message when there is nothing to
        summarize.
    """
    if not texts:
        return "No relevant content to summarize."

    try:
        m = TfidfVectorizer().fit_transform(texts)
    except ValueError:
        # Raised for an empty vocabulary (e.g. all texts blank or stop words
        # only): no ranking is possible, so keep the caller's order.
        top_indices = range(min(top_k, len(texts)))
    else:
        mean_vec = m.mean(axis=0).A
        scores = cosine_similarity(mean_vec, m).flatten()
        top_indices = scores.argsort()[::-1][:top_k]

    ctx = "\n".join(texts[i] for i in top_indices)[:max_chars]
    if not ctx.strip():
        return "No relevant content to summarize."

    # No "summarize: " task prefix: that is a T5 convention, and with the BART
    # model loaded by load_models() it would just be summarized as input text.
    # truncation=True guards against inputs longer than the model accepts.
    out = summarizer(
        ctx, max_length=250, min_length=80, do_sample=False, truncation=True
    )
    return out[0]["summary_text"]


_MONTH_NAMES = (
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
)

# Month number by full name and by three-letter abbreviation (plus "sept").
_MONTH_LOOKUP = {
    key: num
    for num, name in enumerate(_MONTH_NAMES, 1)
    for key in (name, name[:3])
}
_MONTH_LOOKUP["sept"] = 9

# Longest alternatives first so "september" wins over "sept"/"sep".
_MONTH_ALT = "|".join(sorted(_MONTH_LOOKUP, key=len, reverse=True))
_MONTH_RE = re.compile(rf"\b({_MONTH_ALT})\b", re.IGNORECASE)
_MONTH_YEAR_RE = re.compile(
    rf"\b({_MONTH_ALT})\b\.?,?\s+((?:19|20)\d{{2}})\b", re.IGNORECASE
)
_YEAR_RE = re.compile(r"\b(?:19|20)\d{2}\b")
_DATE_RANGE_RE = re.compile(
    r"(?:from|between)?\s*([a-zA-Z]+)\.?\s+(\d{4})"
    r"\s*(?:to|through|and|-|–)\s*"
    r"([a-zA-Z]+)\.?\s+(\d{4})",
    re.IGNORECASE,
)


def extract_month_year(q):
    """Extract a month number and a year from the query *q*.

    A month name (full or abbreviated) directly followed by a 19xx/20xx year
    is preferred, e.g. "March 2024". Otherwise the first whole-word month and
    the first standalone 19xx/20xx year are used independently.

    Returns:
        ``(month, year)`` where either element is ``None`` if not found.
    """
    paired = _MONTH_YEAR_RE.search(q)
    if paired:
        return _MONTH_LOOKUP[paired.group(1).lower()], int(paired.group(2))

    # Whole-word matching avoids hits such as "march" inside "marching".
    month_match = _MONTH_RE.search(q)
    mon = _MONTH_LOOKUP[month_match.group(1).lower()] if month_match else None
    year_match = _YEAR_RE.search(q)
    yr = int(year_match.group()) if year_match else None
    return mon, yr


def extract_date_range(query):
    """Extract a "<month> <year> to <month> <year>" range from *query*.

    Month names may be full or abbreviated ("Jan 2024 to May 2025"). The
    range covers whole months: it starts on the first day of the earlier
    month and ends on the last day of the later month. A reversed range is
    put in chronological order.

    Returns:
        ``(start_date, end_date)`` as ``datetime`` objects, or
        ``(None, None)`` when no valid range is present.
    """
    import calendar

    for match in _DATE_RANGE_RE.finditer(query):
        start_month = _MONTH_LOOKUP.get(match.group(1).lower())
        end_month = _MONTH_LOOKUP.get(match.group(3).lower())
        if not (start_month and end_month):
            continue

        # Sorting the (year, month) pairs puts the earlier month first.
        first, last = sorted(
            [
                (int(match.group(2)), start_month),
                (int(match.group(4)), end_month),
            ]
        )
        try:
            start_date = datetime(first[0], first[1], 1)
            # Last calendar day of the end month (handles 30/31 and leap Feb).
            last_day = calendar.monthrange(last[0], last[1])[1]
            end_date = datetime(last[0], last[1], last_day)
        except ValueError:
            # e.g. year "0000" is outside datetime's supported range.
            continue
        return start_date, end_date
    return None, None


# Columns searched by extract_topic_match().
_TOPIC_COLUMNS = (
    "Category & Subcategory",
    "Intent",
    "Legislative Goal",
    "Policy Impact Areas",
    "Key Provisions",
    "Potential Impact",
)


def extract_topic_match(query, df):
    """Return the rows of *df* whose topic columns contain *query*.

    The match is a case-insensitive literal substring test over the columns
    in ``_TOPIC_COLUMNS``; a row is kept when any column matches.
    """
    needle = query.strip()
    mask = pd.Series(False, index=df.index)
    for col in _TOPIC_COLUMNS:
        # regex=False: the query is user text, not a pattern. Characters such
        # as "?" or "(" must match literally and must not raise re.error.
        mask |= (
            df[col]
            .fillna("")
            .astype(str)
            .str.contains(needle, case=False, regex=False)
        )
    return df[mask]


def _render_match(row, score):
    """Render one matching bill (*row*) and its similarity *score*."""
    date = row['status_date'].date()
    bill_number = row['bill_number']
    full_url = row['url']
    cat = row.get('Category & Subcategory', '')
    bene = row.get('Intended Beneficiaries', '')
    goal = row.get('Legislative Goal', '')
    impact = row.get('Policy Impact Areas', '')
    provision = row.get('Key Provisions', '')
    intent = row.get('Intent', '')
    stance = row.get('Stance', '')
    description = row.get('description', '')

    st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
    st.markdown(f"**Category:** {cat}")
    st.markdown(f"**Intended Beneficiaries:** {bene}")
    st.markdown(f"**Goal:** {goal}")
    st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
    st.markdown(f"**Policy Impact Area:** {impact}")
    st.markdown(f"**Key Provision:** {provision}")
    st.markdown(f"**Description:** {description}")
    # A missing URL is NaN (a float); skip the link rather than point at "nan".
    if isinstance(full_url, str) and full_url:
        st.markdown(f"[View Full Bill Text]({full_url})\n")
    st.divider()


st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
st.title("Illinois Legislative Trends Q&A")
st.markdown("Ask about trends in topics like higher education, funding, etc.")

df = load_data()
embed_model, summarizer = load_models()

query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")

if query:
    start_date, end_date = extract_date_range(query)

    # Narrow by literal topic match first; fall back to every bill when the
    # query text does not appear verbatim in any topic column.
    df2 = extract_topic_match(query, df)
    if df2.empty:
        df2 = df

    if start_date and end_date:
        df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
        st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
    else:
        mon, yr = extract_month_year(query)
        if yr:
            df2 = df2[df2['status_date'].dt.year == yr]
            if mon:
                df2 = df2[df2['status_date'].dt.month == mon]
                st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
            else:
                st.info(f"Filtering by year: **{yr}**")

    if df2.empty:
        st.warning("No matching records found.")
    else:
        texts = df2['combined_text'].tolist()
        embs = compute_embeddings(texts, _model=embed_model)
        res = semantic_search(query, embs, embed_model, threshold=0.5)

        if not res:
            st.warning("No relevant insights found.")
        else:
            st.subheader("Top Matching Insights")
            collected = []
            # Show the ten highest-scoring matches, best first.
            for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
                row = df2.iloc[idx]
                _render_match(row, score)
                collected.append(row['combined_text'])

            st.subheader("RAG-Generated Overall Summary")
            summary = rag_summarize(collected, summarizer)
            st.success(summary)