# RAG / app.py
# tjl8's picture
# Update app.py
# 9e33377 verified
# import streamlit as st
# import pandas as pd
# import re
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# from datetime import datetime
# def clean_text(text):
# text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
# text = re.sub(r"(?i)let me know if you'd like.*", "", text)
# text = re.sub(r"(?i)trend summary[:]*", "", text)
# text = re.sub(r"(?i)actionable insight[:]*", "", text)
# return text.strip()
# @st.cache_data
# def load_data():
# df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
# df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
# df = df.dropna(subset=['status_date'])
# df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
# df["llama_insight"] = df["llama_insight"].fillna("")
# df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
# return df
# @st.cache_resource
# def load_models():
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
# return embed_model, summarizer
# @st.cache_data
# def compute_embeddings(texts, _model):
# return _model.encode(texts, show_progress_bar=True)
# def semantic_search(query, embeddings, model, threshold=0.5):
# query_embedding = model.encode([query])
# sims = cosine_similarity(query_embedding, embeddings)[0]
# return [(i, s) for i, s in enumerate(sims) if s > threshold]
# def rag_summarize(texts, summarizer, top_k=5):
# if not texts:
# return "No relevant content to summarize."
# vect = TfidfVectorizer()
# m = vect.fit_transform(texts)
# mean_vec = m.mean(axis=0).A
# scores = cosine_similarity(mean_vec, m).flatten()
# top_indices = scores.argsort()[::-1][:top_k]
# ctx = "\n".join(texts[i] for i in top_indices)
# prompt = "summarize: " + ctx[:1024]
# out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
# return out[0]['summary_text']
# def extract_month_year(q):
# month_map = {m: i for i, m in enumerate(
# ["january", "february", "march", "april", "may", "june",
# "july", "august", "september", "october", "november", "december"], 1)}
# ql = q.lower()
# mon = next((v for k, v in month_map.items() if k in ql), None)
# ym = re.search(r"(19|20)\d{2}", q)
# yr = int(ym.group()) if ym else None
# return mon, yr
# def extract_topic_match(query, df):
# query_lower = query.lower()
# return df[
# df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
# ]
# st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
# st.title("Illinois Legislative Trends Q&A")
# st.markdown("Ask about trends in topics like higher education, funding, etc.")
# df = load_data()
# embed_model, summarizer = load_models()
# query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
# if query:
# mon, yr = extract_month_year(query)
# df2 = extract_topic_match(query, df)
# if df2.empty:
# df2 = df
# if yr:
# df2 = df2[df2['status_date'].dt.year == yr]
# if mon:
# df2 = df2[df2['status_date'].dt.month == mon]
# st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
# else:
# st.info(f" Filtering by year: **{yr}**")
# if df2.empty:
# st.warning("No matching records found.")
# else:
# texts = df2['summary_insight'].tolist()
# embs = compute_embeddings(texts, _model=embed_model)
# res = semantic_search(query, embs, embed_model, threshold=0.5)
# if not res:
# st.warning("No relevant insights found.")
# else:
# st.subheader(" Top Matching Insights")
# collected = []
# for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
# row = df2.iloc[idx]
# date = row['status_date'].date()
# bill_number = row['bill_number']
# full_url = row['url']
# cat = row['Category & Subcategory']
# cat_std = row['category_&_subcategory_standardized2']
# bene= row['Intended Beneficiaries']
# bene_std= row['intended_beneficiaries_standardized2']
# goal = row['Legislative Goal']
# impact = row['Policy Impact Areas']
# provision = row['Key Provisions']
# intent = row['Intent']
# stance = row['Stance']
# description = row['description']
# summary = row['summary']
# trend = clean_text(row['llama_trend_summary'])
# insight = clean_text(row['llama_insight'])
# st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
# st.markdown(f"**Category:** {cat}")
# st.markdown(f"**Category Std:** {cat_std}")
# st.markdown(f"** Intended Beneficiaries:** {bene}")
# st.markdown(f"** Intended Beneficiaries STD:** {bene_std}")
# st.markdown(f"**Goal:** {goal}")
# st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
# st.markdown(f"**Policy Impacy Area:** {impact}")
# st.markdown(f"**Key Provision:** {provision}")
# st.markdown(f"**Description:** {description}")
# st.markdown(f"**Summary:** {summary}")
# st.markdown(f"Trend Summary:{trend}")
# st.markdown(f"Actionable Insight:{insight}")
# st.markdown(f"[View Full Bill Text]({full_url})\n")
# st.divider()
# collected.append(row['summary_insight'])
# st.subheader("RAG-Generated Overall Summary")
# summary = rag_summarize(collected, summarizer)
# st.success(summary)
# import streamlit as st
# import pandas as pd
# import re
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# from datetime import datetime
# def clean_text(text):
# text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
# text = re.sub(r"(?i)let me know if you'd like.*", "", text)
# text = re.sub(r"(?i)trend summary[:]*", "", text)
# text = re.sub(r"(?i)actionable insight[:]*", "", text)
# return text.strip()
# @st.cache_data
# def load_data():
# df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
# df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
# df = df.dropna(subset=['status_date'])
# df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
# df["llama_insight"] = df["llama_insight"].fillna("")
# df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
# return df
# @st.cache_resource
# def load_models():
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
# return embed_model, summarizer
# @st.cache_data
# def compute_embeddings(texts, _model):
# return _model.encode(texts, show_progress_bar=True)
# def semantic_search(query, embeddings, model, threshold=0.5):
# query_embedding = model.encode([query])
# sims = cosine_similarity(query_embedding, embeddings)[0]
# return [(i, s) for i, s in enumerate(sims) if s > threshold]
# def rag_summarize(texts, summarizer, top_k=10): # increased from 5 to 10
# if not texts:
# return "No relevant content to summarize."
# vect = TfidfVectorizer()
# m = vect.fit_transform(texts)
# mean_vec = m.mean(axis=0).A
# scores = cosine_similarity(mean_vec, m).flatten()
# top_indices = scores.argsort()[::-1][:top_k]
# ctx = "\n".join(texts[i] for i in top_indices)
# prompt = "summarize: " + ctx[:1024]
# out = summarizer(prompt, max_length=150, min_length=80, do_sample=False) # updated length
# return out[0]['summary_text']
# def extract_month_year(q):
# month_map = {m: i for i, m in enumerate(
# ["january", "february", "march", "april", "may", "june",
# "july", "august", "september", "october", "november", "december"], 1)}
# ql = q.lower()
# mon = next((v for k, v in month_map.items() if k in ql), None)
# ym = re.search(r"(19|20)\d{2}", q)
# yr = int(ym.group()) if ym else None
# return mon, yr
# def extract_topic_match(query, df):
# query_lower = query.lower()
# return df[
# df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
# ]
# st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
# st.title("Illinois Legislative Trends Q&A")
# st.markdown("Ask about trends in topics like higher education, funding, etc.")
# df = load_data()
# embed_model, summarizer = load_models()
# query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
# if query:
# mon, yr = extract_month_year(query)
# df2 = extract_topic_match(query, df)
# if df2.empty:
# df2 = df
# if yr:
# df2 = df2[df2['status_date'].dt.year == yr]
# if mon:
# df2 = df2[df2['status_date'].dt.month == mon]
# st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
# else:
# st.info(f" Filtering by year: **{yr}**")
# if df2.empty:
# st.warning("No matching records found.")
# else:
# texts = df2['summary_insight'].tolist()
# embs = compute_embeddings(texts, _model=embed_model)
# res = semantic_search(query, embs, embed_model, threshold=0.5)
# if not res:
# st.warning("No relevant insights found.")
# else:
# st.subheader("Top Matching Insights")
# collected = []
# for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: # increased to 10
# row = df2.iloc[idx]
# date = row['status_date'].date()
# bill_number = row['bill_number']
# full_url = row['url']
# cat = row['Category & Subcategory']
# cat_std = row['category_&_subcategory_standardized2']
# bene= row['Intended Beneficiaries']
# bene_std= row['intended_beneficiaries_standardized2']
# goal = row['Legislative Goal']
# impact = row['Policy Impact Areas']
# provision = row['Key Provisions']
# intent = row['Intent']
# stance = row['Stance']
# description = row['description']
# summary = row['summary']
# trend = clean_text(row['llama_trend_summary'])
# insight = clean_text(row['llama_insight'])
# st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
# st.markdown(f"**Category:** {cat}")
# # st.markdown(f"**Category Std:** {cat_std}")
# st.markdown(f"**Intended Beneficiaries:** {bene}")
# # st.markdown(f"**Intended Beneficiaries STD:** {bene_std}")
# st.markdown(f"**Goal:** {goal}")
# st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
# st.markdown(f"**Policy Impact Area:** {impact}")
# st.markdown(f"**Key Provision:** {provision}")
# st.markdown(f"**Description:** {description}")
# # st.markdown(f"**Summary:** {summary}")
# st.markdown(f"**Trend Summary:** {trend}")
# st.markdown(f"**Actionable Insight:** {insight}")
# st.markdown(f"[View Full Bill Text]({full_url})\n")
# st.divider()
# collected.append(row['summary_insight'])
# st.subheader("RAG-Generated Overall Summary")
# summary = rag_summarize(collected, summarizer)
# st.success(summary)
#
# including description
# import streamlit as st
# import pandas as pd
# import re
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# from datetime import datetime
# def clean_text(text):
# text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
# text = re.sub(r"(?i)let me know if you'd like.*", "", text)
# text = re.sub(r"(?i)trend summary[:]*", "", text)
# text = re.sub(r"(?i)actionable insight[:]*", "", text)
# return text.strip()
# @st.cache_data
# def load_data():
# df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
# df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
# df = df.dropna(subset=['status_date'])
# df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
# df["llama_insight"] = df["llama_insight"].fillna("")
# df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
# return df
# @st.cache_resource
# def load_models():
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
# return embed_model, summarizer
# @st.cache_data
# def compute_embeddings(texts, _model):
# return _model.encode(texts, show_progress_bar=True)
# def semantic_search(query, embeddings, model, threshold=0.5):
# query_embedding = model.encode([query])
# sims = cosine_similarity(query_embedding, embeddings)[0]
# return [(i, s) for i, s in enumerate(sims) if s > threshold]
# def rag_summarize(texts, summarizer, top_k=10):
# if not texts:
# return "No relevant content to summarize."
# vect = TfidfVectorizer()
# m = vect.fit_transform(texts)
# mean_vec = m.mean(axis=0).A
# scores = cosine_similarity(mean_vec, m).flatten()
# top_indices = scores.argsort()[::-1][:top_k]
# ctx = "\n".join(texts[i] for i in top_indices)
# prompt = "summarize: " + ctx[:1024]
# out = summarizer(prompt, max_length=200, min_length=80, do_sample=False)
# return out[0]['summary_text']
# def extract_month_year(q):
# month_map = {m: i for i, m in enumerate(
# ["january", "february", "march", "april", "may", "june",
# "july", "august", "september", "october", "november", "december"], 1)}
# ql = q.lower()
# mon = next((v for k, v in month_map.items() if k in ql), None)
# ym = re.search(r"(19|20)\d{2}", q)
# yr = int(ym.group()) if ym else None
# return mon, yr
# def extract_date_range(query):
# month_map = {
# "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
# "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
# }
# patterns = [
# r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})",
# ]
# for pattern in patterns:
# match = re.search(pattern, query)
# if match:
# start_month_str, start_year = match.group(1).lower(), int(match.group(2))
# end_month_str, end_year = match.group(3).lower(), int(match.group(4))
# start_month = month_map.get(start_month_str)
# end_month = month_map.get(end_month_str)
# if start_month and end_month:
# start_date = datetime(start_year, start_month, 1)
# end_date = datetime(end_year, end_month, 28)
# return start_date, end_date
# return None, None
# def extract_topic_match(query, df):
# query_lower = query.lower()
# return df[
# df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
# df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
# ]
# st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
# st.title("Illinois Legislative Trends Q&A")
# st.markdown("Ask about trends in topics like higher education, funding, etc.")
# df = load_data()
# embed_model, summarizer = load_models()
# query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
# if query:
# start_date, end_date = extract_date_range(query)
# df2 = extract_topic_match(query, df)
# if df2.empty:
# df2 = df
# if start_date and end_date:
# df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
# st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
# else:
# mon, yr = extract_month_year(query)
# if yr:
# df2 = df2[df2['status_date'].dt.year == yr]
# if mon:
# df2 = df2[df2['status_date'].dt.month == mon]
# st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
# else:
# st.info(f"Filtering by year: **{yr}**")
# if df2.empty:
# st.warning("No matching records found.")
# else:
# # Include description in embeddings + RAG
# texts = (df2['description'].fillna('') + "\n" + df2['summary_insight'].fillna('')).tolist()
# embs = compute_embeddings(texts, _model=embed_model)
# res = semantic_search(query, embs, embed_model, threshold=0.5)
# if not res:
# st.warning("No relevant insights found.")
# else:
# st.subheader("Top Matching Insights")
# collected = []
# for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
# row = df2.iloc[idx]
# date = row['status_date'].date()
# bill_number = row['bill_number']
# full_url = row['url']
# cat = row['Category & Subcategory']
# cat_std = row['category_&_subcategory_standardized2']
# bene = row['Intended Beneficiaries']
# bene_std = row['intended_beneficiaries_standardized2']
# goal = row['Legislative Goal']
# impact = row['Policy Impact Areas']
# provision = row['Key Provisions']
# intent = row['Intent']
# stance = row['Stance']
# description = row['description']
# summary = row['summary']
# trend = clean_text(row['llama_trend_summary'])
# insight = clean_text(row['llama_insight'])
# st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
# st.markdown(f"**Category:** {cat}")
# st.markdown(f"**Intended Beneficiaries:** {bene}")
# st.markdown(f"**Goal:** {goal}")
# st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
# st.markdown(f"**Policy Impact Area:** {impact}")
# st.markdown(f"**Key Provision:** {provision}")
# st.markdown(f"**Description:** {description}")
# st.markdown(f"**Trend Summary:** {trend}")
# st.markdown(f"**Actionable Insight:** {insight}")
# st.markdown(f"[View Full Bill Text]({full_url})\n")
# st.divider()
# collected.append(description + "\n" + row['summary_insight'])
# st.subheader("RAG-Generated Overall Summary")
# summary = rag_summarize(collected, summarizer)
# st.success(summary)
## NEW ONE
# import streamlit as st
# import pandas as pd
# import re
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# from datetime import datetime
# def clean_text(text):
# text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
# text = re.sub(r"(?i)let me know if you'd like.*", "", text)
# text = re.sub(r"(?i)trend summary[:]*", "", text)
# text = re.sub(r"(?i)actionable insight[:]*", "", text)
# return text.strip()
# @st.cache_data
# def load_data():
# df = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv")
# df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
# df = df.dropna(subset=['status_date'])
# for col in ["Legislative Goal", "Policy Impact Areas", "Key Provisions",
# "Intended Beneficiaries", "Potential Impact", "description"]:
# df[col] = df[col].fillna("")
# df["combined_text"] = (
# "Legislative Goal: " + df["Legislative Goal"] + "\n" +
# "Policy Impact Areas: " + df["Policy Impact Areas"] + "\n" +
# "Key Provisions: " + df["Key Provisions"] + "\n" +
# "Intended Beneficiaries: " + df["Intended Beneficiaries"] + "\n" +
# "Potential Impact: " + df["Potential Impact"] + "\n" +
# "Description: " + df["description"]
# )
# return df
# @st.cache_resource
# def load_models():
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
# return embed_model, summarizer
# @st.cache_data
# def compute_embeddings(texts, _model):
# return _model.encode(texts, show_progress_bar=True)
# def semantic_search(query, embeddings, model, threshold=0.5):
# query_embedding = model.encode([query])
# sims = cosine_similarity(query_embedding, embeddings)[0]
# return [(i, s) for i, s in enumerate(sims) if s > threshold]
# def rag_summarize(texts, summarizer, top_k=5):
# if not texts:
# return "No relevant content to summarize."
# vect = TfidfVectorizer()
# m = vect.fit_transform(texts)
# mean_vec = m.mean(axis=0).A
# scores = cosine_similarity(mean_vec, m).flatten()
# top_indices = scores.argsort()[::-1][:top_k]
# ctx = "\n".join(texts[i] for i in top_indices)
# prompt = "summarize: " + ctx[:1024]
# out = summarizer(prompt, max_length=200, min_length=80, do_sample=False)
# return out[0]['summary_text']
# def extract_month_year(q):
# month_map = {m: i for i, m in enumerate(
# ["january", "february", "march", "april", "may", "june",
# "july", "august", "september", "october", "november", "december"], 1)}
# ql = q.lower()
# mon = next((v for k, v in month_map.items() if k in ql), None)
# ym = re.search(r"(19|20)\d{2}", q)
# yr = int(ym.group()) if ym else None
# return mon, yr
# def extract_date_range(query):
# month_map = {
# "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
# "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
# }
# patterns = [
# r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})",
# ]
# for pattern in patterns:
# match = re.search(pattern, query)
# if match:
# start_month_str, start_year = match.group(1).lower(), int(match.group(2))
# end_month_str, end_year = match.group(3).lower(), int(match.group(4))
# start_month = month_map.get(start_month_str)
# end_month = month_map.get(end_month_str)
# if start_month and end_month:
# start_date = datetime(start_year, start_month, 1)
# end_date = datetime(end_year, end_month, 28)
# return start_date, end_date
# return None, None
# def extract_topic_match(query, df):
# query_lower = query.lower()
# return df[
# df['Category & Subcategory'].fillna('').str.lower().str.contains(query_lower) |
# df['Intent'].fillna('').str.lower().str.contains(query_lower) |
# df['Legislative Goal'].fillna('').str.lower().str.contains(query_lower) |
# df['Policy Impact Areas'].fillna('').str.lower().str.contains(query_lower) |
# df['Key Provisions'].fillna('').str.lower().str.contains(query_lower) |
# df['Potential Impact'].fillna('').str.lower().str.contains(query_lower)
# ]
# st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
# st.title("Illinois Legislative Trends Q&A")
# st.markdown("Ask about trends in topics like higher education, funding, etc.")
# df = load_data()
# embed_model, summarizer = load_models()
# query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
# if query:
# start_date, end_date = extract_date_range(query)
# df2 = extract_topic_match(query, df)
# if df2.empty:
# df2 = df
# if start_date and end_date:
# df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
# st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
# else:
# mon, yr = extract_month_year(query)
# if yr:
# df2 = df2[df2['status_date'].dt.year == yr]
# if mon:
# df2 = df2[df2['status_date'].dt.month == mon]
# st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
# else:
# st.info(f"Filtering by year: **{yr}**")
# if df2.empty:
# st.warning("No matching records found.")
# else:
# texts = df2['combined_text'].tolist()
# embs = compute_embeddings(texts, _model=embed_model)
# res = semantic_search(query, embs, embed_model, threshold=0.5)
# if not res:
# st.warning("No relevant insights found.")
# else:
# st.subheader("Top Matching Insights")
# collected = []
# for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
# row = df2.iloc[idx]
# date = row['status_date'].date()
# bill_number = row['bill_number']
# full_url = row['url']
# cat = row.get('Category & Subcategory', '')
# bene = row.get('Intended Beneficiaries', '')
# goal = row.get('Legislative Goal', '')
# impact = row.get('Policy Impact Areas', '')
# provision = row.get('Key Provisions', '')
# intent = row.get('Intent', '')
# stance = row.get('Stance', '')
# description = row.get('description', '')
# st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
# st.markdown(f"**Category:** {cat}")
# st.markdown(f"**Intended Beneficiaries:** {bene}")
# st.markdown(f"**Goal:** {goal}")
# st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
# st.markdown(f"**Policy Impact Area:** {impact}")
# st.markdown(f"**Key Provision:** {provision}")
# st.markdown(f"**Description:** {description}")
# st.markdown(f"[View Full Bill Text]({full_url})\n")
# st.divider()
# collected.append(row['combined_text'])
# st.subheader("RAG-Generated Overall Summary")
# summary = rag_summarize(collected, summarizer)
# st.success(summary)
#BART
import streamlit as st
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime
def clean_text(text):
    """Strip common LLM boilerplate phrases and labels from generated text.

    Removes "here is/are the requested output", trailing "let me know..."
    sign-offs, and the "Trend Summary:" / "Actionable Insight:" labels
    (all case-insensitive), then trims surrounding whitespace.
    """
    boilerplate_patterns = (
        r"(?i)(here is|here are) the requested output[s]*[:]*",
        r"(?i)let me know if you'd like.*",
        r"(?i)trend summary[:]*",
        r"(?i)actionable insight[:]*",
    )
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, "", text)
    return text.strip()
@st.cache_data
def load_data():
    """Load the Illinois education-bills CSV and prepare its text fields.

    - Parses 'status_date' (DD-MM-YYYY); unparseable dates become NaT
      via errors='coerce' and those rows are dropped.
    - Fills NaNs in the key feature columns with "" so string
      concatenation below is safe.
    - Builds 'combined_text', a labeled concatenation of the feature
      columns, later used for embeddings and as RAG context.

    Returns the prepared DataFrame. Cached by Streamlit across reruns.
    """
    df = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv")
    df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
    df = df.dropna(subset=['status_date'])
    for col in ["Legislative Goal", "Policy Impact Areas", "Key Provisions",
                "Intended Beneficiaries", "Potential Impact", "description"]:
        df[col] = df[col].fillna("")
    # One labeled blob per bill; the labels give the summarizer/embedder context.
    df["combined_text"] = (
        "Legislative Goal: " + df["Legislative Goal"] + "\n" +
        "Policy Impact Areas: " + df["Policy Impact Areas"] + "\n" +
        "Key Provisions: " + df["Key Provisions"] + "\n" +
        "Intended Beneficiaries: " + df["Intended Beneficiaries"] + "\n" +
        "Potential Impact: " + df["Potential Impact"] + "\n" +
        "Description: " + df["description"]
    )
    return df
@st.cache_resource
def load_models():
    """Load and cache the sentence-embedding model and the summarization pipeline.

    Returns (embed_model, summarizer). Cached as a resource so the models
    are loaded once per Streamlit session, not re-hashed per rerun.
    """
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    # Summarization model was switched from t5-small to facebook/bart-large-cnn
    # for better summary quality (see earlier versions above).
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
    return embed_model, summarizer
@st.cache_data
def compute_embeddings(texts, _model):
    """Encode *texts* into sentence-embedding vectors with *_model*.

    The model parameter is underscore-prefixed so Streamlit's cache hashes
    only the text list, not the (unhashable) model object.
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors
def semantic_search(query, embeddings, model, threshold=0.5):
    """Return (index, similarity) pairs for rows whose cosine similarity
    to the encoded *query* exceeds *threshold*.

    Indices are positions into *embeddings* (and the text list it was
    built from); pairs are returned in index order, unsorted by score.
    """
    query_vector = model.encode([query])
    similarities = cosine_similarity(query_vector, embeddings)[0]
    hits = []
    for index, similarity in enumerate(similarities):
        if similarity > threshold:
            hits.append((index, similarity))
    return hits
def rag_summarize(texts, summarizer, top_k=5):
    """Summarize the most representative of *texts* with a HF summarization pipeline.

    Scores each text by TF-IDF cosine similarity to the corpus centroid,
    keeps the *top_k* most central ones, and feeds their concatenation
    (truncated to 1024 characters — a rough guard for the model's input
    limit) to *summarizer*.

    Returns the summary string, or a fallback message for empty input.
    """
    if not texts:
        return "No relevant content to summarize."
    vect = TfidfVectorizer()
    m = vect.fit_transform(texts)
    # Centroid of the TF-IDF matrix; .A converts the np.matrix result to ndarray.
    mean_vec = m.mean(axis=0).A
    scores = cosine_similarity(mean_vec, m).flatten()
    top_indices = scores.argsort()[::-1][:top_k]
    ctx = "\n".join(texts[i] for i in top_indices)
    # BART takes no task prefix; the previous "summarize: " prefix was a
    # t5-small artifact and leaked into the model input after the model swap.
    out = summarizer(ctx[:1024], max_length=250, min_length=80, do_sample=False)
    return out[0]['summary_text']
def extract_month_year(q):
    """Pull an optional month name and 4-digit year (1900-2099) from *q*.

    Returns (month_number_or_None, year_or_None). If several month names
    appear, the earliest month in calendar order wins (January first),
    matching the original dict-iteration behavior.
    """
    month_names = ("january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november", "december")
    lowered = q.lower()
    mon = None
    for number, name in enumerate(month_names, start=1):
        if name in lowered:
            mon = number
            break
    year_match = re.search(r"(19|20)\d{2}", q)
    yr = int(year_match.group()) if year_match else None
    return mon, yr
def extract_date_range(query):
    """Parse a "Month YYYY to Month YYYY" span from *query*.

    Accepts full month names and unambiguous abbreviations of at least
    three letters ("jan", "sept", ...) — the app's own example query uses
    "Jan 2024 to May 2025", which the previous full-name-only lookup
    failed to parse. Returns (start_date, end_date) datetimes covering
    the whole months, or (None, None) when no range is found.
    """
    import calendar  # stdlib; local to keep the block self-contained

    month_names = ["january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november", "december"]

    def _month_number(token):
        # Full name or a unique prefix of >= 3 letters; None otherwise.
        token = token.lower()
        candidates = [i for i, name in enumerate(month_names, 1)
                      if name.startswith(token)]
        if len(token) >= 3 and len(candidates) == 1:
            return candidates[0]
        return None

    pattern = (r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})"
               r"\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})")
    match = re.search(pattern, query)
    if not match:
        return None, None
    start_month = _month_number(match.group(1))
    end_month = _month_number(match.group(3))
    if not (start_month and end_month):
        return None, None
    start_year, end_year = int(match.group(2)), int(match.group(4))
    start_date = datetime(start_year, start_month, 1)
    # Use the real last day of the end month; the old hard-coded day 28
    # silently excluded records dated on the 29th-31st.
    last_day = calendar.monthrange(end_year, end_month)[1]
    end_date = datetime(end_year, end_month, last_day)
    return start_date, end_date
def extract_topic_match(query, df):
    """Return rows of *df* whose key text columns contain *query* as a substring.

    Matching is case-insensitive and literal (regex=False): previously the
    raw query was interpreted as a regular expression, so user input with
    characters like '(' or '?' raised re.error. NaN cells count as no match.
    """
    needle = query.lower()
    searchable_columns = ['Category & Subcategory', 'Intent', 'Legislative Goal',
                          'Policy Impact Areas', 'Key Provisions', 'Potential Impact']
    mask = pd.Series(False, index=df.index)
    for col in searchable_columns:
        mask |= df[col].fillna('').str.lower().str.contains(needle, regex=False)
    return df[mask]
# ---- App setup: page chrome, cached data/models, and the query box --------
st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
st.title("Illinois Legislative Trends Q&A")
st.markdown("Ask about trends in topics like higher education, funding, etc.")
df = load_data()  # cached CSV load + text-field preparation
embed_model, summarizer = load_models()  # cached embedding model + BART summarizer
query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
# ---- Main query flow: filter -> retrieve -> display -> summarize ----------
if query:
    # 1) Date filtering: prefer an explicit "Month YYYY to Month YYYY" range;
    #    otherwise fall back to a single month/year mentioned in the query.
    start_date, end_date = extract_date_range(query)
    df2 = extract_topic_match(query, df)
    if df2.empty:
        # No literal keyword hit in the feature columns -- search all bills.
        df2 = df
    if start_date and end_date:
        df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
        st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
    else:
        mon, yr = extract_month_year(query)
        if yr:
            df2 = df2[df2['status_date'].dt.year == yr]
            if mon:
                df2 = df2[df2['status_date'].dt.month == mon]
                st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
            else:
                st.info(f"Filtering by year: **{yr}**")
    if df2.empty:
        st.warning("No matching records found.")
    else:
        # 2) Semantic retrieval over the labeled combined feature text.
        texts = df2['combined_text'].tolist()
        embs = compute_embeddings(texts, _model=embed_model)
        res = semantic_search(query, embs, embed_model, threshold=0.5)
        if not res:
            st.warning("No relevant insights found.")
        else:
            st.subheader("Top Matching Insights")
            collected = []
            # 3) Render the 10 best hits. idx is POSITIONAL into texts/df2
            #    (semantic_search enumerates texts in df2 order), so .iloc is correct.
            for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
                row = df2.iloc[idx]
                date = row['status_date'].date()
                bill_number = row['bill_number']
                full_url = row['url']
                # .get with '' default tolerates columns missing from the CSV.
                cat = row.get('Category & Subcategory', '')
                bene = row.get('Intended Beneficiaries', '')
                goal = row.get('Legislative Goal', '')
                impact = row.get('Policy Impact Areas', '')
                provision = row.get('Key Provisions', '')
                intent = row.get('Intent', '')
                stance = row.get('Stance', '')
                description = row.get('description', '')
                st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
                st.markdown(f"**Category:** {cat}")
                st.markdown(f"**Intended Beneficiaries:** {bene}")
                st.markdown(f"**Goal:** {goal}")
                st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
                st.markdown(f"**Policy Impact Area:** {impact}")
                st.markdown(f"**Key Provision:** {provision}")
                st.markdown(f"**Description:** {description}")
                st.markdown(f"[View Full Bill Text]({full_url})\n")
                st.divider()
                collected.append(row['combined_text'])
            # 4) One overall RAG summary over the displayed bills' texts.
            st.subheader("RAG-Generated Overall Summary")
            summary = rag_summarize(collected, summarizer)
            st.success(summary)