# RAG / app.py
# tjl8's picture
# Update app.py
# 9e33377 verified
# import streamlit as st
# import pandas as pd
# import re
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# from datetime import datetime
# def clean_text(text):
# text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
# text = re.sub(r"(?i)let me know if you'd like.*", "", text)
# text = re.sub(r"(?i)trend summary[:]*", "", text)
# text = re.sub(r"(?i)actionable insight[:]*", "", text)
# return text.strip()
# @st.cache_data
# def load_data():
# df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
# df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
# df = df.dropna(subset=['status_date'])
# df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
# df["llama_insight"] = df["llama_insight"].fillna("")
# df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
# return df
# @st.cache_resource
# def load_models():
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
# return embed_model, summarizer
# @st.cache_data
# def compute_embeddings(texts, _model):
# return _model.encode(texts, show_progress_bar=True)
# def semantic_search(query, embeddings, model, threshold=0.5):
# query_embedding = model.encode([query])
# sims = cosine_similarity(query_embedding, embeddings)[0]
# return [(i, s) for i, s in enumerate(sims) if s > threshold]
# def rag_summarize(texts, summarizer, top_k=5):
# if not texts:
# return "No relevant content to summarize."
# vect = TfidfVectorizer()
# m = vect.fit_transform(texts)
# mean_vec = m.mean(axis=0).A
# scores = cosine_similarity(mean_vec, m).flatten()
# top_indices = scores.argsort()[::-1][:top_k]
# ctx = "\n".join(texts[i] for i in top_indices)
# prompt = "summarize: " + ctx[:1024]
# out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
# return out[0]['summary_text']
# def extract_month_year(q):
# month_map = {m: i for i, m in enumerate(
# ["january", "february", "march", "april", "may", "june",
# "july", "august", "september", "october", "november", "december"], 1)}
# ql = q.lower()
# mon = next((v for k, v in month_map.items() if k in ql), None)
# ym = re.search(r"(19|20)\d{2}", q)
# yr = int(ym.group()) if ym else None
# return mon, yr
# def extract_topic_match(query, df):
# query_lower = query.lower()
# return df[
# df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
# ]
# st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
# st.title("Illinois Legislative Trends Q&A")
# st.markdown("Ask about trends in topics like higher education, funding, etc.")
# df = load_data()
# embed_model, summarizer = load_models()
# query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
# if query:
# mon, yr = extract_month_year(query)
# df2 = extract_topic_match(query, df)
# if df2.empty:
# df2 = df
# if yr:
# df2 = df2[df2['status_date'].dt.year == yr]
# if mon:
# df2 = df2[df2['status_date'].dt.month == mon]
# st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
# else:
# st.info(f" Filtering by year: **{yr}**")
# if df2.empty:
# st.warning("No matching records found.")
# else:
# texts = df2['summary_insight'].tolist()
# embs = compute_embeddings(texts, _model=embed_model)
# res = semantic_search(query, embs, embed_model, threshold=0.5)
# if not res:
# st.warning("No relevant insights found.")
# else:
# st.subheader(" Top Matching Insights")
# collected = []
# for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
# row = df2.iloc[idx]
# date = row['status_date'].date()
# bill_number = row['bill_number']
# full_url = row['url']
# cat = row['Category & Subcategory']
# cat_std = row['category_&_subcategory_standardized2']
# bene= row['Intended Beneficiaries']
# bene_std= row['intended_beneficiaries_standardized2']
# goal = row['Legislative Goal']
# impact = row['Policy Impact Areas']
# provision = row['Key Provisions']
# intent = row['Intent']
# stance = row['Stance']
# description = row['description']
# summary = row['summary']
# trend = clean_text(row['llama_trend_summary'])
# insight = clean_text(row['llama_insight'])
# st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
# st.markdown(f"**Category:** {cat}")
# st.markdown(f"**Category Std:** {cat_std}")
# st.markdown(f"** Intended Beneficiaries:** {bene}")
# st.markdown(f"** Intended Beneficiaries STD:** {bene_std}")
# st.markdown(f"**Goal:** {goal}")
# st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
# st.markdown(f"**Policy Impacy Area:** {impact}")
# st.markdown(f"**Key Provision:** {provision}")
# st.markdown(f"**Description:** {description}")
# st.markdown(f"**Summary:** {summary}")
# st.markdown(f"Trend Summary:{trend}")
# st.markdown(f"Actionable Insight:{insight}")
# st.markdown(f"[View Full Bill Text]({full_url})\n")
# st.divider()
# collected.append(row['summary_insight'])
# st.subheader("RAG-Generated Overall Summary")
# summary = rag_summarize(collected, summarizer)
# st.success(summary)
# import streamlit as st
# import pandas as pd
# import re
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# from datetime import datetime
# def clean_text(text):
# text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
# text = re.sub(r"(?i)let me know if you'd like.*", "", text)
# text = re.sub(r"(?i)trend summary[:]*", "", text)
# text = re.sub(r"(?i)actionable insight[:]*", "", text)
# return text.strip()
# @st.cache_data
# def load_data():
# df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
# df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
# df = df.dropna(subset=['status_date'])
# df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
# df["llama_insight"] = df["llama_insight"].fillna("")
# df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
# return df
# @st.cache_resource
# def load_models():
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
# return embed_model, summarizer
# @st.cache_data
# def compute_embeddings(texts, _model):
# return _model.encode(texts, show_progress_bar=True)
# def semantic_search(query, embeddings, model, threshold=0.5):
# query_embedding = model.encode([query])
# sims = cosine_similarity(query_embedding, embeddings)[0]
# return [(i, s) for i, s in enumerate(sims) if s > threshold]
# def rag_summarize(texts, summarizer, top_k=10): # increased from 5 to 10
# if not texts:
# return "No relevant content to summarize."
# vect = TfidfVectorizer()
# m = vect.fit_transform(texts)
# mean_vec = m.mean(axis=0).A
# scores = cosine_similarity(mean_vec, m).flatten()
# top_indices = scores.argsort()[::-1][:top_k]
# ctx = "\n".join(texts[i] for i in top_indices)
# prompt = "summarize: " + ctx[:1024]
# out = summarizer(prompt, max_length=150, min_length=80, do_sample=False) # updated length
# return out[0]['summary_text']
# def extract_month_year(q):
# month_map = {m: i for i, m in enumerate(
# ["january", "february", "march", "april", "may", "june",
# "july", "august", "september", "october", "november", "december"], 1)}
# ql = q.lower()
# mon = next((v for k, v in month_map.items() if k in ql), None)
# ym = re.search(r"(19|20)\d{2}", q)
# yr = int(ym.group()) if ym else None
# return mon, yr
# def extract_topic_match(query, df):
# query_lower = query.lower()
# return df[
# df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
# ]
# st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
# st.title("Illinois Legislative Trends Q&A")
# st.markdown("Ask about trends in topics like higher education, funding, etc.")
# df = load_data()
# embed_model, summarizer = load_models()
# query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
# if query:
# mon, yr = extract_month_year(query)
# df2 = extract_topic_match(query, df)
# if df2.empty:
# df2 = df
# if yr:
# df2 = df2[df2['status_date'].dt.year == yr]
# if mon:
# df2 = df2[df2['status_date'].dt.month == mon]
# st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
# else:
# st.info(f" Filtering by year: **{yr}**")
# if df2.empty:
# st.warning("No matching records found.")
# else:
# texts = df2['summary_insight'].tolist()
# embs = compute_embeddings(texts, _model=embed_model)
# res = semantic_search(query, embs, embed_model, threshold=0.5)
# if not res:
# st.warning("No relevant insights found.")
# else:
# st.subheader("Top Matching Insights")
# collected = []
# for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: # increased to 10
# row = df2.iloc[idx]
# date = row['status_date'].date()
# bill_number = row['bill_number']
# full_url = row['url']
# cat = row['Category & Subcategory']
# cat_std = row['category_&_subcategory_standardized2']
# bene= row['Intended Beneficiaries']
# bene_std= row['intended_beneficiaries_standardized2']
# goal = row['Legislative Goal']
# impact = row['Policy Impact Areas']
# provision = row['Key Provisions']
# intent = row['Intent']
# stance = row['Stance']
# description = row['description']
# summary = row['summary']
# trend = clean_text(row['llama_trend_summary'])
# insight = clean_text(row['llama_insight'])
# st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
# st.markdown(f"**Category:** {cat}")
# # st.markdown(f"**Category Std:** {cat_std}")
# st.markdown(f"**Intended Beneficiaries:** {bene}")
# # st.markdown(f"**Intended Beneficiaries STD:** {bene_std}")
# st.markdown(f"**Goal:** {goal}")
# st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
# st.markdown(f"**Policy Impact Area:** {impact}")
# st.markdown(f"**Key Provision:** {provision}")
# st.markdown(f"**Description:** {description}")
# # st.markdown(f"**Summary:** {summary}")
# st.markdown(f"**Trend Summary:** {trend}")
# st.markdown(f"**Actionable Insight:** {insight}")
# st.markdown(f"[View Full Bill Text]({full_url})\n")
# st.divider()
# collected.append(row['summary_insight'])
# st.subheader("RAG-Generated Overall Summary")
# summary = rag_summarize(collected, summarizer)
# st.success(summary)
#
# including description
# import streamlit as st
# import pandas as pd
# import re
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# from datetime import datetime
# def clean_text(text):
# text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
# text = re.sub(r"(?i)let me know if you'd like.*", "", text)
# text = re.sub(r"(?i)trend summary[:]*", "", text)
# text = re.sub(r"(?i)actionable insight[:]*", "", text)
# return text.strip()
# @st.cache_data
# def load_data():
# df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
# df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
# df = df.dropna(subset=['status_date'])
# df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
# df["llama_insight"] = df["llama_insight"].fillna("")
# df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
# return df
# @st.cache_resource
# def load_models():
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
# return embed_model, summarizer
# @st.cache_data
# def compute_embeddings(texts, _model):
# return _model.encode(texts, show_progress_bar=True)
# def semantic_search(query, embeddings, model, threshold=0.5):
# query_embedding = model.encode([query])
# sims = cosine_similarity(query_embedding, embeddings)[0]
# return [(i, s) for i, s in enumerate(sims) if s > threshold]
# def rag_summarize(texts, summarizer, top_k=10):
# if not texts:
# return "No relevant content to summarize."
# vect = TfidfVectorizer()
# m = vect.fit_transform(texts)
# mean_vec = m.mean(axis=0).A
# scores = cosine_similarity(mean_vec, m).flatten()
# top_indices = scores.argsort()[::-1][:top_k]
# ctx = "\n".join(texts[i] for i in top_indices)
# prompt = "summarize: " + ctx[:1024]
# out = summarizer(prompt, max_length=200, min_length=80, do_sample=False)
# return out[0]['summary_text']
# def extract_month_year(q):
# month_map = {m: i for i, m in enumerate(
# ["january", "february", "march", "april", "may", "june",
# "july", "august", "september", "october", "november", "december"], 1)}
# ql = q.lower()
# mon = next((v for k, v in month_map.items() if k in ql), None)
# ym = re.search(r"(19|20)\d{2}", q)
# yr = int(ym.group()) if ym else None
# return mon, yr
# def extract_date_range(query):
# month_map = {
# "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
# "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
# }
# patterns = [
# r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})",
# ]
# for pattern in patterns:
# match = re.search(pattern, query)
# if match:
# start_month_str, start_year = match.group(1).lower(), int(match.group(2))
# end_month_str, end_year = match.group(3).lower(), int(match.group(4))
# start_month = month_map.get(start_month_str)
# end_month = month_map.get(end_month_str)
# if start_month and end_month:
# start_date = datetime(start_year, start_month, 1)
# end_date = datetime(end_year, end_month, 28)
# return start_date, end_date
# return None, None
# def extract_topic_match(query, df):
# query_lower = query.lower()
# return df[
# df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
# ]
# st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
# st.title("Illinois Legislative Trends Q&A")
# st.markdown("Ask about trends in topics like higher education, funding, etc.")
# df = load_data()
# embed_model, summarizer = load_models()
# query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
# if query:
# start_date, end_date = extract_date_range(query)
# df2 = extract_topic_match(query, df)
# if df2.empty:
# df2 = df
# if start_date and end_date:
# df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
# st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
# else:
# mon, yr = extract_month_year(query)
# if yr:
# df2 = df2[df2['status_date'].dt.year == yr]
# if mon:
# df2 = df2[df2['status_date'].dt.month == mon]
# st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
# else:
# st.info(f"Filtering by year: **{yr}**")
# if df2.empty:
# st.warning("No matching records found.")
# else:
# # Include description in embeddings + RAG
# texts = (df2['description'].fillna('') + "\n" + df2['summary_insight'].fillna('')).tolist()
# embs = compute_embeddings(texts, _model=embed_model)
# res = semantic_search(query, embs, embed_model, threshold=0.5)
# if not res:
# st.warning("No relevant insights found.")
# else:
# st.subheader("Top Matching Insights")
# collected = []
# for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
# row = df2.iloc[idx]
# date = row['status_date'].date()
# bill_number = row['bill_number']
# full_url = row['url']
# cat = row['Category & Subcategory']
# cat_std = row['category_&_subcategory_standardized2']
# bene = row['Intended Beneficiaries']
# bene_std = row['intended_beneficiaries_standardized2']
# goal = row['Legislative Goal']
# impact = row['Policy Impact Areas']
# provision = row['Key Provisions']
# intent = row['Intent']
# stance = row['Stance']
# description = row['description']
# summary = row['summary']
# trend = clean_text(row['llama_trend_summary'])
# insight = clean_text(row['llama_insight'])
# st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
# st.markdown(f"**Category:** {cat}")
# st.markdown(f"**Intended Beneficiaries:** {bene}")
# st.markdown(f"**Goal:** {goal}")
# st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
# st.markdown(f"**Policy Impact Area:** {impact}")
# st.markdown(f"**Key Provision:** {provision}")
# st.markdown(f"**Description:** {description}")
# st.markdown(f"**Trend Summary:** {trend}")
# st.markdown(f"**Actionable Insight:** {insight}")
# st.markdown(f"[View Full Bill Text]({full_url})\n")
# st.divider()
# collected.append(description + "\n" + row['summary_insight'])
# st.subheader("RAG-Generated Overall Summary")
# summary = rag_summarize(collected, summarizer)
# st.success(summary)
## NEW ONE
# import streamlit as st
# import pandas as pd
# import re
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# from datetime import datetime
# def clean_text(text):
# text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
# text = re.sub(r"(?i)let me know if you'd like.*", "", text)
# text = re.sub(r"(?i)trend summary[:]*", "", text)
# text = re.sub(r"(?i)actionable insight[:]*", "", text)
# return text.strip()
# @st.cache_data
# def load_data():
# df = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv")
# df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
# df = df.dropna(subset=['status_date'])
# for col in ["Legislative Goal", "Policy Impact Areas", "Key Provisions",
# "Intended Beneficiaries", "Potential Impact", "description"]:
# df[col] = df[col].fillna("")
# df["combined_text"] = (
# "Legislative Goal: " + df["Legislative Goal"] + "\n" +
# "Policy Impact Areas: " + df["Policy Impact Areas"] + "\n" +
# "Key Provisions: " + df["Key Provisions"] + "\n" +
# "Intended Beneficiaries: " + df["Intended Beneficiaries"] + "\n" +
# "Potential Impact: " + df["Potential Impact"] + "\n" +
# "Description: " + df["description"]
# )
# return df
# @st.cache_resource
# def load_models():
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
# return embed_model, summarizer
# @st.cache_data
# def compute_embeddings(texts, _model):
# return _model.encode(texts, show_progress_bar=True)
# def semantic_search(query, embeddings, model, threshold=0.5):
# query_embedding = model.encode([query])
# sims = cosine_similarity(query_embedding, embeddings)[0]
# return [(i, s) for i, s in enumerate(sims) if s > threshold]
# def rag_summarize(texts, summarizer, top_k=5):
# if not texts:
# return "No relevant content to summarize."
# vect = TfidfVectorizer()
# m = vect.fit_transform(texts)
# mean_vec = m.mean(axis=0).A
# scores = cosine_similarity(mean_vec, m).flatten()
# top_indices = scores.argsort()[::-1][:top_k]
# ctx = "\n".join(texts[i] for i in top_indices)
# prompt = "summarize: " + ctx[:1024]
# out = summarizer(prompt, max_length=200, min_length=80, do_sample=False)
# return out[0]['summary_text']
# def extract_month_year(q):
# month_map = {m: i for i, m in enumerate(
# ["january", "february", "march", "april", "may", "june",
# "july", "august", "september", "october", "november", "december"], 1)}
# ql = q.lower()
# mon = next((v for k, v in month_map.items() if k in ql), None)
# ym = re.search(r"(19|20)\d{2}", q)
# yr = int(ym.group()) if ym else None
# return mon, yr
# def extract_date_range(query):
# month_map = {
# "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
# "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
# }
# patterns = [
# r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})",
# ]
# for pattern in patterns:
# match = re.search(pattern, query)
# if match:
# start_month_str, start_year = match.group(1).lower(), int(match.group(2))
# end_month_str, end_year = match.group(3).lower(), int(match.group(4))
# start_month = month_map.get(start_month_str)
# end_month = month_map.get(end_month_str)
# if start_month and end_month:
# start_date = datetime(start_year, start_month, 1)
# end_date = datetime(end_year, end_month, 28)
# return start_date, end_date
# return None, None
# def extract_topic_match(query, df):
# query_lower = query.lower()
# return df[
# df['Category & Subcategory'].fillna('').str.lower().str.contains(query_lower) |
# df['Intent'].fillna('').str.lower().str.contains(query_lower) |
# df['Legislative Goal'].fillna('').str.lower().str.contains(query_lower) |
# df['Policy Impact Areas'].fillna('').str.lower().str.contains(query_lower) |
# df['Key Provisions'].fillna('').str.lower().str.contains(query_lower) |
# df['Potential Impact'].fillna('').str.lower().str.contains(query_lower)
# ]
# st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
# st.title("Illinois Legislative Trends Q&A")
# st.markdown("Ask about trends in topics like higher education, funding, etc.")
# df = load_data()
# embed_model, summarizer = load_models()
# query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
# if query:
# start_date, end_date = extract_date_range(query)
# df2 = extract_topic_match(query, df)
# if df2.empty:
# df2 = df
# if start_date and end_date:
# df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
# st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
# else:
# mon, yr = extract_month_year(query)
# if yr:
# df2 = df2[df2['status_date'].dt.year == yr]
# if mon:
# df2 = df2[df2['status_date'].dt.month == mon]
# st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
# else:
# st.info(f"Filtering by year: **{yr}**")
# if df2.empty:
# st.warning("No matching records found.")
# else:
# texts = df2['combined_text'].tolist()
# embs = compute_embeddings(texts, _model=embed_model)
# res = semantic_search(query, embs, embed_model, threshold=0.5)
# if not res:
# st.warning("No relevant insights found.")
# else:
# st.subheader("Top Matching Insights")
# collected = []
# for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
# row = df2.iloc[idx]
# date = row['status_date'].date()
# bill_number = row['bill_number']
# full_url = row['url']
# cat = row.get('Category & Subcategory', '')
# bene = row.get('Intended Beneficiaries', '')
# goal = row.get('Legislative Goal', '')
# impact = row.get('Policy Impact Areas', '')
# provision = row.get('Key Provisions', '')
# intent = row.get('Intent', '')
# stance = row.get('Stance', '')
# description = row.get('description', '')
# st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
# st.markdown(f"**Category:** {cat}")
# st.markdown(f"**Intended Beneficiaries:** {bene}")
# st.markdown(f"**Goal:** {goal}")
# st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
# st.markdown(f"**Policy Impact Area:** {impact}")
# st.markdown(f"**Key Provision:** {provision}")
# st.markdown(f"**Description:** {description}")
# st.markdown(f"[View Full Bill Text]({full_url})\n")
# st.divider()
# collected.append(row['combined_text'])
# st.subheader("RAG-Generated Overall Summary")
# summary = rag_summarize(collected, summarizer)
# st.success(summary)
#BART
import streamlit as st
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime
def clean_text(text):
    """Strip common LLM boilerplate phrases and labels from generated text.

    Removes "here is/are the requested output", trailing "let me know..."
    sign-offs, and the "Trend Summary:" / "Actionable Insight:" labels
    (all case-insensitive), then trims surrounding whitespace.
    """
    boilerplate_patterns = (
        r"(?i)(here is|here are) the requested output[s]*[:]*",
        r"(?i)let me know if you'd like.*",
        r"(?i)trend summary[:]*",
        r"(?i)actionable insight[:]*",
    )
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, "", text)
    return text.strip()
@st.cache_data
def load_data():
    """Load the Illinois education-bills CSV and prepare its text fields.

    - Parses 'status_date' (DD-MM-YYYY); unparseable dates become NaT
      via errors='coerce' and those rows are dropped.
    - Fills NaNs in the key feature columns with "" so string
      concatenation below is safe.
    - Builds 'combined_text', a labeled concatenation of the feature
      columns, later used for embeddings and as RAG context.

    Returns the prepared DataFrame. Cached by Streamlit across reruns.
    """
    df = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv")
    df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
    df = df.dropna(subset=['status_date'])
    for col in ["Legislative Goal", "Policy Impact Areas", "Key Provisions",
                "Intended Beneficiaries", "Potential Impact", "description"]:
        df[col] = df[col].fillna("")
    # One labeled blob per bill; the labels give the summarizer/embedder context.
    df["combined_text"] = (
        "Legislative Goal: " + df["Legislative Goal"] + "\n" +
        "Policy Impact Areas: " + df["Policy Impact Areas"] + "\n" +
        "Key Provisions: " + df["Key Provisions"] + "\n" +
        "Intended Beneficiaries: " + df["Intended Beneficiaries"] + "\n" +
        "Potential Impact: " + df["Potential Impact"] + "\n" +
        "Description: " + df["description"]
    )
    return df
@st.cache_resource
def load_models():
    """Load and cache the sentence-embedding model and the summarization pipeline.

    Returns (embed_model, summarizer). Cached as a resource so the models
    are loaded once per Streamlit session, not re-hashed per rerun.
    """
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    # Summarization model was switched from t5-small to facebook/bart-large-cnn
    # for better summary quality (see earlier versions above).
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
    return embed_model, summarizer
@st.cache_data
def compute_embeddings(texts, _model):
    """Encode *texts* into sentence-embedding vectors with *_model*.

    The model parameter is underscore-prefixed so Streamlit's cache hashes
    only the text list, not the (unhashable) model object.
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors
def semantic_search(query, embeddings, model, threshold=0.5):
    """Return (index, similarity) pairs for rows whose cosine similarity
    to the encoded *query* exceeds *threshold*.

    Indices are positions into *embeddings* (and the text list it was
    built from); pairs are returned in index order, unsorted by score.
    """
    query_vector = model.encode([query])
    similarities = cosine_similarity(query_vector, embeddings)[0]
    hits = []
    for index, similarity in enumerate(similarities):
        if similarity > threshold:
            hits.append((index, similarity))
    return hits
def rag_summarize(texts, summarizer, top_k=5):
    """Summarize the most representative of *texts* with a HF summarization pipeline.

    Scores each text by TF-IDF cosine similarity to the corpus centroid,
    keeps the *top_k* most central ones, and feeds their concatenation
    (truncated to 1024 characters — a rough guard for the model's input
    limit) to *summarizer*.

    Returns the summary string, or a fallback message for empty input.
    """
    if not texts:
        return "No relevant content to summarize."
    vect = TfidfVectorizer()
    m = vect.fit_transform(texts)
    # Centroid of the TF-IDF matrix; .A converts the np.matrix result to ndarray.
    mean_vec = m.mean(axis=0).A
    scores = cosine_similarity(mean_vec, m).flatten()
    top_indices = scores.argsort()[::-1][:top_k]
    ctx = "\n".join(texts[i] for i in top_indices)
    # BART takes no task prefix; the previous "summarize: " prefix was a
    # t5-small artifact and leaked into the model input after the model swap.
    out = summarizer(ctx[:1024], max_length=250, min_length=80, do_sample=False)
    return out[0]['summary_text']
def extract_month_year(q):
    """Pull an optional month name and 4-digit year (1900-2099) from *q*.

    Returns (month_number_or_None, year_or_None). If several month names
    appear, the earliest month in calendar order wins (January first),
    matching the original dict-iteration behavior.
    """
    month_names = ("january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november", "december")
    lowered = q.lower()
    mon = None
    for number, name in enumerate(month_names, start=1):
        if name in lowered:
            mon = number
            break
    year_match = re.search(r"(19|20)\d{2}", q)
    yr = int(year_match.group()) if year_match else None
    return mon, yr
def extract_date_range(query):
    """Parse a "Month YYYY to Month YYYY" span from *query*.

    Accepts full month names and unambiguous abbreviations of at least
    three letters ("jan", "sept", ...) — the app's own example query uses
    "Jan 2024 to May 2025", which the previous full-name-only lookup
    failed to parse. Returns (start_date, end_date) datetimes covering
    the whole months, or (None, None) when no range is found.
    """
    import calendar  # stdlib; local to keep the block self-contained

    month_names = ["january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november", "december"]

    def _month_number(token):
        # Full name or a unique prefix of >= 3 letters; None otherwise.
        token = token.lower()
        candidates = [i for i, name in enumerate(month_names, 1)
                      if name.startswith(token)]
        if len(token) >= 3 and len(candidates) == 1:
            return candidates[0]
        return None

    pattern = (r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})"
               r"\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})")
    match = re.search(pattern, query)
    if not match:
        return None, None
    start_month = _month_number(match.group(1))
    end_month = _month_number(match.group(3))
    if not (start_month and end_month):
        return None, None
    start_year, end_year = int(match.group(2)), int(match.group(4))
    start_date = datetime(start_year, start_month, 1)
    # Use the real last day of the end month; the old hard-coded day 28
    # silently excluded records dated on the 29th-31st.
    last_day = calendar.monthrange(end_year, end_month)[1]
    end_date = datetime(end_year, end_month, last_day)
    return start_date, end_date
def extract_topic_match(query, df):
    """Return rows of *df* whose key text columns contain *query* as a substring.

    Matching is case-insensitive and literal (regex=False): previously the
    raw query was interpreted as a regular expression, so user input with
    characters like '(' or '?' raised re.error. NaN cells count as no match.
    """
    needle = query.lower()
    searchable_columns = ['Category & Subcategory', 'Intent', 'Legislative Goal',
                          'Policy Impact Areas', 'Key Provisions', 'Potential Impact']
    mask = pd.Series(False, index=df.index)
    for col in searchable_columns:
        mask |= df[col].fillna('').str.lower().str.contains(needle, regex=False)
    return df[mask]
# ---- App setup: page chrome, cached data/models, and the query box --------
st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
st.title("Illinois Legislative Trends Q&A")
st.markdown("Ask about trends in topics like higher education, funding, etc.")
df = load_data()  # cached CSV load + text-field preparation
embed_model, summarizer = load_models()  # cached embedding model + BART summarizer
query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
# ---- Main query flow: filter -> retrieve -> display -> summarize ----------
if query:
    # 1) Date filtering: prefer an explicit "Month YYYY to Month YYYY" range;
    #    otherwise fall back to a single month/year mentioned in the query.
    start_date, end_date = extract_date_range(query)
    df2 = extract_topic_match(query, df)
    if df2.empty:
        # No literal keyword hit in the feature columns -- search all bills.
        df2 = df
    if start_date and end_date:
        df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
        st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
    else:
        mon, yr = extract_month_year(query)
        if yr:
            df2 = df2[df2['status_date'].dt.year == yr]
            if mon:
                df2 = df2[df2['status_date'].dt.month == mon]
                st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
            else:
                st.info(f"Filtering by year: **{yr}**")
    if df2.empty:
        st.warning("No matching records found.")
    else:
        # 2) Semantic retrieval over the labeled combined feature text.
        texts = df2['combined_text'].tolist()
        embs = compute_embeddings(texts, _model=embed_model)
        res = semantic_search(query, embs, embed_model, threshold=0.5)
        if not res:
            st.warning("No relevant insights found.")
        else:
            st.subheader("Top Matching Insights")
            collected = []
            # 3) Render the 10 best hits. idx is POSITIONAL into texts/df2
            #    (semantic_search enumerates texts in df2 order), so .iloc is correct.
            for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
                row = df2.iloc[idx]
                date = row['status_date'].date()
                bill_number = row['bill_number']
                full_url = row['url']
                # .get with '' default tolerates columns missing from the CSV.
                cat = row.get('Category & Subcategory', '')
                bene = row.get('Intended Beneficiaries', '')
                goal = row.get('Legislative Goal', '')
                impact = row.get('Policy Impact Areas', '')
                provision = row.get('Key Provisions', '')
                intent = row.get('Intent', '')
                stance = row.get('Stance', '')
                description = row.get('description', '')
                st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
                st.markdown(f"**Category:** {cat}")
                st.markdown(f"**Intended Beneficiaries:** {bene}")
                st.markdown(f"**Goal:** {goal}")
                st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
                st.markdown(f"**Policy Impact Area:** {impact}")
                st.markdown(f"**Key Provision:** {provision}")
                st.markdown(f"**Description:** {description}")
                st.markdown(f"[View Full Bill Text]({full_url})\n")
                st.divider()
                collected.append(row['combined_text'])
            # 4) One overall RAG summary over the displayed bills' texts.
            st.subheader("RAG-Generated Overall Summary")
            summary = rag_summarize(collected, summarizer)
            st.success(summary)