Spaces:
Runtime error
Runtime error
| # import streamlit as st | |
| # import pandas as pd | |
| # import re | |
| # from sentence_transformers import SentenceTransformer | |
| # from transformers import pipeline | |
| # from sklearn.metrics.pairwise import cosine_similarity | |
| # from sklearn.feature_extraction.text import TfidfVectorizer | |
| # from datetime import datetime | |
| # def clean_text(text): | |
| # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text) | |
| # text = re.sub(r"(?i)let me know if you'd like.*", "", text) | |
| # text = re.sub(r"(?i)trend summary[:]*", "", text) | |
| # text = re.sub(r"(?i)actionable insight[:]*", "", text) | |
| # return text.strip() | |
| # @st.cache_data | |
| # def load_data(): | |
| # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv") | |
| # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce') | |
| # df = df.dropna(subset=['status_date']) | |
| # df["llama_trend_summary"] = df["llama_trend_summary"].fillna("") | |
| # df["llama_insight"] = df["llama_insight"].fillna("") | |
| # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"] | |
| # return df | |
| # @st.cache_resource | |
| # def load_models(): | |
| # embed_model = SentenceTransformer('all-MiniLM-L6-v2') | |
| # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small") | |
| # return embed_model, summarizer | |
| # @st.cache_data | |
| # def compute_embeddings(texts, _model): | |
| # return _model.encode(texts, show_progress_bar=True) | |
| # def semantic_search(query, embeddings, model, threshold=0.5): | |
| # query_embedding = model.encode([query]) | |
| # sims = cosine_similarity(query_embedding, embeddings)[0] | |
| # return [(i, s) for i, s in enumerate(sims) if s > threshold] | |
| # def rag_summarize(texts, summarizer, top_k=5): | |
| # if not texts: | |
| # return "No relevant content to summarize." | |
| # vect = TfidfVectorizer() | |
| # m = vect.fit_transform(texts) | |
| # mean_vec = m.mean(axis=0).A | |
| # scores = cosine_similarity(mean_vec, m).flatten() | |
| # top_indices = scores.argsort()[::-1][:top_k] | |
| # ctx = "\n".join(texts[i] for i in top_indices) | |
| # prompt = "summarize: " + ctx[:1024] | |
| # out = summarizer(prompt, max_length=60, min_length=30, do_sample=False) | |
| # return out[0]['summary_text'] | |
| # def extract_month_year(q): | |
| # month_map = {m: i for i, m in enumerate( | |
| # ["january", "february", "march", "april", "may", "june", | |
| # "july", "august", "september", "october", "november", "december"], 1)} | |
| # ql = q.lower() | |
| # mon = next((v for k, v in month_map.items() if k in ql), None) | |
| # ym = re.search(r"(19|20)\d{2}", q) | |
| # yr = int(ym.group()) if ym else None | |
| # return mon, yr | |
| # def extract_topic_match(query, df): | |
| # query_lower = query.lower() | |
| # return df[ | |
| # df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower) | |
| # ] | |
| # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide") | |
| # st.title("Illinois Legislative Trends Q&A") | |
| # st.markdown("Ask about trends in topics like higher education, funding, etc.") | |
| # df = load_data() | |
| # embed_model, summarizer = load_models() | |
| # query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):") | |
| # if query: | |
| # mon, yr = extract_month_year(query) | |
| # df2 = extract_topic_match(query, df) | |
| # if df2.empty: | |
| # df2 = df | |
| # if yr: | |
| # df2 = df2[df2['status_date'].dt.year == yr] | |
| # if mon: | |
| # df2 = df2[df2['status_date'].dt.month == mon] | |
| # st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**") | |
| # else: | |
| # st.info(f" Filtering by year: **{yr}**") | |
| # if df2.empty: | |
| # st.warning("No matching records found.") | |
| # else: | |
| # texts = df2['summary_insight'].tolist() | |
| # embs = compute_embeddings(texts, _model=embed_model) | |
| # res = semantic_search(query, embs, embed_model, threshold=0.5) | |
| # if not res: | |
| # st.warning("No relevant insights found.") | |
| # else: | |
| # st.subheader(" Top Matching Insights") | |
| # collected = [] | |
| # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]: | |
| # row = df2.iloc[idx] | |
| # date = row['status_date'].date() | |
| # bill_number = row['bill_number'] | |
| # full_url = row['url'] | |
| # cat = row['Category & Subcategory'] | |
| # cat_std = row['category_&_subcategory_standardized2'] | |
| # bene= row['Intended Beneficiaries'] | |
| # bene_std= row['intended_beneficiaries_standardized2'] | |
| # goal = row['Legislative Goal'] | |
| # impact = row['Policy Impact Areas'] | |
| # provision = row['Key Provisions'] | |
| # intent = row['Intent'] | |
| # stance = row['Stance'] | |
| # description = row['description'] | |
| # summary = row['summary'] | |
| # trend = clean_text(row['llama_trend_summary']) | |
| # insight = clean_text(row['llama_insight']) | |
| # st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}") | |
| # st.markdown(f"**Category:** {cat}") | |
| # st.markdown(f"**Category Std:** {cat_std}") | |
| # st.markdown(f"** Intended Beneficiaries:** {bene}") | |
| # st.markdown(f"** Intended Beneficiaries STD:** {bene_std}") | |
| # st.markdown(f"**Goal:** {goal}") | |
| # st.markdown(f"**Intent:** {intent} | **Stance:** {stance}") | |
| # st.markdown(f"**Policy Impacy Area:** {impact}") | |
| # st.markdown(f"**Key Provision:** {provision}") | |
| # st.markdown(f"**Description:** {description}") | |
| # st.markdown(f"**Summary:** {summary}") | |
| # st.markdown(f"Trend Summary:{trend}") | |
| # st.markdown(f"Actionable Insight:{insight}") | |
| # st.markdown(f"[View Full Bill Text]({full_url})\n") | |
| # st.divider() | |
| # collected.append(row['summary_insight']) | |
| # st.subheader("RAG-Generated Overall Summary") | |
| # summary = rag_summarize(collected, summarizer) | |
| # st.success(summary) | |
| # import streamlit as st | |
| # import pandas as pd | |
| # import re | |
| # from sentence_transformers import SentenceTransformer | |
| # from transformers import pipeline | |
| # from sklearn.metrics.pairwise import cosine_similarity | |
| # from sklearn.feature_extraction.text import TfidfVectorizer | |
| # from datetime import datetime | |
| # def clean_text(text): | |
| # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text) | |
| # text = re.sub(r"(?i)let me know if you'd like.*", "", text) | |
| # text = re.sub(r"(?i)trend summary[:]*", "", text) | |
| # text = re.sub(r"(?i)actionable insight[:]*", "", text) | |
| # return text.strip() | |
| # @st.cache_data | |
| # def load_data(): | |
| # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv") | |
| # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce') | |
| # df = df.dropna(subset=['status_date']) | |
| # df["llama_trend_summary"] = df["llama_trend_summary"].fillna("") | |
| # df["llama_insight"] = df["llama_insight"].fillna("") | |
| # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"] | |
| # return df | |
| # @st.cache_resource | |
| # def load_models(): | |
| # embed_model = SentenceTransformer('all-MiniLM-L6-v2') | |
| # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small") | |
| # return embed_model, summarizer | |
| # @st.cache_data | |
| # def compute_embeddings(texts, _model): | |
| # return _model.encode(texts, show_progress_bar=True) | |
| # def semantic_search(query, embeddings, model, threshold=0.5): | |
| # query_embedding = model.encode([query]) | |
| # sims = cosine_similarity(query_embedding, embeddings)[0] | |
| # return [(i, s) for i, s in enumerate(sims) if s > threshold] | |
| # def rag_summarize(texts, summarizer, top_k=10): # increased from 5 to 10 | |
| # if not texts: | |
| # return "No relevant content to summarize." | |
| # vect = TfidfVectorizer() | |
| # m = vect.fit_transform(texts) | |
| # mean_vec = m.mean(axis=0).A | |
| # scores = cosine_similarity(mean_vec, m).flatten() | |
| # top_indices = scores.argsort()[::-1][:top_k] | |
| # ctx = "\n".join(texts[i] for i in top_indices) | |
| # prompt = "summarize: " + ctx[:1024] | |
| # out = summarizer(prompt, max_length=150, min_length=80, do_sample=False) # updated length | |
| # return out[0]['summary_text'] | |
| # def extract_month_year(q): | |
| # month_map = {m: i for i, m in enumerate( | |
| # ["january", "february", "march", "april", "may", "june", | |
| # "july", "august", "september", "october", "november", "december"], 1)} | |
| # ql = q.lower() | |
| # mon = next((v for k, v in month_map.items() if k in ql), None) | |
| # ym = re.search(r"(19|20)\d{2}", q) | |
| # yr = int(ym.group()) if ym else None | |
| # return mon, yr | |
| # def extract_topic_match(query, df): | |
| # query_lower = query.lower() | |
| # return df[ | |
| # df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower) | |
| # ] | |
| # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide") | |
| # st.title("Illinois Legislative Trends Q&A") | |
| # st.markdown("Ask about trends in topics like higher education, funding, etc.") | |
| # df = load_data() | |
| # embed_model, summarizer = load_models() | |
| # query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):") | |
| # if query: | |
| # mon, yr = extract_month_year(query) | |
| # df2 = extract_topic_match(query, df) | |
| # if df2.empty: | |
| # df2 = df | |
| # if yr: | |
| # df2 = df2[df2['status_date'].dt.year == yr] | |
| # if mon: | |
| # df2 = df2[df2['status_date'].dt.month == mon] | |
| # st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**") | |
| # else: | |
| # st.info(f" Filtering by year: **{yr}**") | |
| # if df2.empty: | |
| # st.warning("No matching records found.") | |
| # else: | |
| # texts = df2['summary_insight'].tolist() | |
| # embs = compute_embeddings(texts, _model=embed_model) | |
| # res = semantic_search(query, embs, embed_model, threshold=0.5) | |
| # if not res: | |
| # st.warning("No relevant insights found.") | |
| # else: | |
| # st.subheader("Top Matching Insights") | |
| # collected = [] | |
| # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: # increased to 10 | |
| # row = df2.iloc[idx] | |
| # date = row['status_date'].date() | |
| # bill_number = row['bill_number'] | |
| # full_url = row['url'] | |
| # cat = row['Category & Subcategory'] | |
| # cat_std = row['category_&_subcategory_standardized2'] | |
| # bene= row['Intended Beneficiaries'] | |
| # bene_std= row['intended_beneficiaries_standardized2'] | |
| # goal = row['Legislative Goal'] | |
| # impact = row['Policy Impact Areas'] | |
| # provision = row['Key Provisions'] | |
| # intent = row['Intent'] | |
| # stance = row['Stance'] | |
| # description = row['description'] | |
| # summary = row['summary'] | |
| # trend = clean_text(row['llama_trend_summary']) | |
| # insight = clean_text(row['llama_insight']) | |
| # st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}") | |
| # st.markdown(f"**Category:** {cat}") | |
| # # st.markdown(f"**Category Std:** {cat_std}") | |
| # st.markdown(f"**Intended Beneficiaries:** {bene}") | |
| # # st.markdown(f"**Intended Beneficiaries STD:** {bene_std}") | |
| # st.markdown(f"**Goal:** {goal}") | |
| # st.markdown(f"**Intent:** {intent} | **Stance:** {stance}") | |
| # st.markdown(f"**Policy Impact Area:** {impact}") | |
| # st.markdown(f"**Key Provision:** {provision}") | |
| # st.markdown(f"**Description:** {description}") | |
| # # st.markdown(f"**Summary:** {summary}") | |
| # st.markdown(f"**Trend Summary:** {trend}") | |
| # st.markdown(f"**Actionable Insight:** {insight}") | |
| # st.markdown(f"[View Full Bill Text]({full_url})\n") | |
| # st.divider() | |
| # collected.append(row['summary_insight']) | |
| # st.subheader("RAG-Generated Overall Summary") | |
| # summary = rag_summarize(collected, summarizer) | |
| # st.success(summary) | |
| # | |
| # including description | |
| # import streamlit as st | |
| # import pandas as pd | |
| # import re | |
| # from sentence_transformers import SentenceTransformer | |
| # from transformers import pipeline | |
| # from sklearn.metrics.pairwise import cosine_similarity | |
| # from sklearn.feature_extraction.text import TfidfVectorizer | |
| # from datetime import datetime | |
| # def clean_text(text): | |
| # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text) | |
| # text = re.sub(r"(?i)let me know if you'd like.*", "", text) | |
| # text = re.sub(r"(?i)trend summary[:]*", "", text) | |
| # text = re.sub(r"(?i)actionable insight[:]*", "", text) | |
| # return text.strip() | |
| # @st.cache_data | |
| # def load_data(): | |
| # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv") | |
| # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce') | |
| # df = df.dropna(subset=['status_date']) | |
| # df["llama_trend_summary"] = df["llama_trend_summary"].fillna("") | |
| # df["llama_insight"] = df["llama_insight"].fillna("") | |
| # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"] | |
| # return df | |
| # @st.cache_resource | |
| # def load_models(): | |
| # embed_model = SentenceTransformer('all-MiniLM-L6-v2') | |
| # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small") | |
| # return embed_model, summarizer | |
| # @st.cache_data | |
| # def compute_embeddings(texts, _model): | |
| # return _model.encode(texts, show_progress_bar=True) | |
| # def semantic_search(query, embeddings, model, threshold=0.5): | |
| # query_embedding = model.encode([query]) | |
| # sims = cosine_similarity(query_embedding, embeddings)[0] | |
| # return [(i, s) for i, s in enumerate(sims) if s > threshold] | |
| # def rag_summarize(texts, summarizer, top_k=10): | |
| # if not texts: | |
| # return "No relevant content to summarize." | |
| # vect = TfidfVectorizer() | |
| # m = vect.fit_transform(texts) | |
| # mean_vec = m.mean(axis=0).A | |
| # scores = cosine_similarity(mean_vec, m).flatten() | |
| # top_indices = scores.argsort()[::-1][:top_k] | |
| # ctx = "\n".join(texts[i] for i in top_indices) | |
| # prompt = "summarize: " + ctx[:1024] | |
| # out = summarizer(prompt, max_length=200, min_length=80, do_sample=False) | |
| # return out[0]['summary_text'] | |
| # def extract_month_year(q): | |
| # month_map = {m: i for i, m in enumerate( | |
| # ["january", "february", "march", "april", "may", "june", | |
| # "july", "august", "september", "october", "november", "december"], 1)} | |
| # ql = q.lower() | |
| # mon = next((v for k, v in month_map.items() if k in ql), None) | |
| # ym = re.search(r"(19|20)\d{2}", q) | |
| # yr = int(ym.group()) if ym else None | |
| # return mon, yr | |
| # def extract_date_range(query): | |
| # month_map = { | |
| # "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6, | |
| # "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12 | |
| # } | |
| # patterns = [ | |
| # r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})", | |
| # ] | |
| # for pattern in patterns: | |
| # match = re.search(pattern, query) | |
| # if match: | |
| # start_month_str, start_year = match.group(1).lower(), int(match.group(2)) | |
| # end_month_str, end_year = match.group(3).lower(), int(match.group(4)) | |
| # start_month = month_map.get(start_month_str) | |
| # end_month = month_map.get(end_month_str) | |
| # if start_month and end_month: | |
| # start_date = datetime(start_year, start_month, 1) | |
| # end_date = datetime(end_year, end_month, 28) | |
| # return start_date, end_date | |
| # return None, None | |
| # def extract_topic_match(query, df): | |
| # query_lower = query.lower() | |
| # return df[ | |
| # df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower) | |
| # ] | |
| # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide") | |
| # st.title("Illinois Legislative Trends Q&A") | |
| # st.markdown("Ask about trends in topics like higher education, funding, etc.") | |
| # df = load_data() | |
| # embed_model, summarizer = load_models() | |
| # query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):") | |
| # if query: | |
| # start_date, end_date = extract_date_range(query) | |
| # df2 = extract_topic_match(query, df) | |
| # if df2.empty: | |
| # df2 = df | |
| # if start_date and end_date: | |
| # df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)] | |
| # st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**") | |
| # else: | |
| # mon, yr = extract_month_year(query) | |
| # if yr: | |
| # df2 = df2[df2['status_date'].dt.year == yr] | |
| # if mon: | |
| # df2 = df2[df2['status_date'].dt.month == mon] | |
| # st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**") | |
| # else: | |
| # st.info(f"Filtering by year: **{yr}**") | |
| # if df2.empty: | |
| # st.warning("No matching records found.") | |
| # else: | |
| # # Include description in embeddings + RAG | |
| # texts = (df2['description'].fillna('') + "\n" + df2['summary_insight'].fillna('')).tolist() | |
| # embs = compute_embeddings(texts, _model=embed_model) | |
| # res = semantic_search(query, embs, embed_model, threshold=0.5) | |
| # if not res: | |
| # st.warning("No relevant insights found.") | |
| # else: | |
| # st.subheader("Top Matching Insights") | |
| # collected = [] | |
| # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: | |
| # row = df2.iloc[idx] | |
| # date = row['status_date'].date() | |
| # bill_number = row['bill_number'] | |
| # full_url = row['url'] | |
| # cat = row['Category & Subcategory'] | |
| # cat_std = row['category_&_subcategory_standardized2'] | |
| # bene = row['Intended Beneficiaries'] | |
| # bene_std = row['intended_beneficiaries_standardized2'] | |
| # goal = row['Legislative Goal'] | |
| # impact = row['Policy Impact Areas'] | |
| # provision = row['Key Provisions'] | |
| # intent = row['Intent'] | |
| # stance = row['Stance'] | |
| # description = row['description'] | |
| # summary = row['summary'] | |
| # trend = clean_text(row['llama_trend_summary']) | |
| # insight = clean_text(row['llama_insight']) | |
| # st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}") | |
| # st.markdown(f"**Category:** {cat}") | |
| # st.markdown(f"**Intended Beneficiaries:** {bene}") | |
| # st.markdown(f"**Goal:** {goal}") | |
| # st.markdown(f"**Intent:** {intent} | **Stance:** {stance}") | |
| # st.markdown(f"**Policy Impact Area:** {impact}") | |
| # st.markdown(f"**Key Provision:** {provision}") | |
| # st.markdown(f"**Description:** {description}") | |
| # st.markdown(f"**Trend Summary:** {trend}") | |
| # st.markdown(f"**Actionable Insight:** {insight}") | |
| # st.markdown(f"[View Full Bill Text]({full_url})\n") | |
| # st.divider() | |
| # collected.append(description + "\n" + row['summary_insight']) | |
| # st.subheader("RAG-Generated Overall Summary") | |
| # summary = rag_summarize(collected, summarizer) | |
| # st.success(summary) | |
| ## NEW ONE | |
| # import streamlit as st | |
| # import pandas as pd | |
| # import re | |
| # from sentence_transformers import SentenceTransformer | |
| # from transformers import pipeline | |
| # from sklearn.metrics.pairwise import cosine_similarity | |
| # from sklearn.feature_extraction.text import TfidfVectorizer | |
| # from datetime import datetime | |
| # def clean_text(text): | |
| # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text) | |
| # text = re.sub(r"(?i)let me know if you'd like.*", "", text) | |
| # text = re.sub(r"(?i)trend summary[:]*", "", text) | |
| # text = re.sub(r"(?i)actionable insight[:]*", "", text) | |
| # return text.strip() | |
| # @st.cache_data | |
| # def load_data(): | |
| # df = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv") | |
| # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce') | |
| # df = df.dropna(subset=['status_date']) | |
| # for col in ["Legislative Goal", "Policy Impact Areas", "Key Provisions", | |
| # "Intended Beneficiaries", "Potential Impact", "description"]: | |
| # df[col] = df[col].fillna("") | |
| # df["combined_text"] = ( | |
| # "Legislative Goal: " + df["Legislative Goal"] + "\n" + | |
| # "Policy Impact Areas: " + df["Policy Impact Areas"] + "\n" + | |
| # "Key Provisions: " + df["Key Provisions"] + "\n" + | |
| # "Intended Beneficiaries: " + df["Intended Beneficiaries"] + "\n" + | |
| # "Potential Impact: " + df["Potential Impact"] + "\n" + | |
| # "Description: " + df["description"] | |
| # ) | |
| # return df | |
| # @st.cache_resource | |
| # def load_models(): | |
| # embed_model = SentenceTransformer('all-MiniLM-L6-v2') | |
| # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small") | |
| # return embed_model, summarizer | |
| # @st.cache_data | |
| # def compute_embeddings(texts, _model): | |
| # return _model.encode(texts, show_progress_bar=True) | |
| # def semantic_search(query, embeddings, model, threshold=0.5): | |
| # query_embedding = model.encode([query]) | |
| # sims = cosine_similarity(query_embedding, embeddings)[0] | |
| # return [(i, s) for i, s in enumerate(sims) if s > threshold] | |
| # def rag_summarize(texts, summarizer, top_k=5): | |
| # if not texts: | |
| # return "No relevant content to summarize." | |
| # vect = TfidfVectorizer() | |
| # m = vect.fit_transform(texts) | |
| # mean_vec = m.mean(axis=0).A | |
| # scores = cosine_similarity(mean_vec, m).flatten() | |
| # top_indices = scores.argsort()[::-1][:top_k] | |
| # ctx = "\n".join(texts[i] for i in top_indices) | |
| # prompt = "summarize: " + ctx[:1024] | |
| # out = summarizer(prompt, max_length=200, min_length=80, do_sample=False) | |
| # return out[0]['summary_text'] | |
| # def extract_month_year(q): | |
| # month_map = {m: i for i, m in enumerate( | |
| # ["january", "february", "march", "april", "may", "june", | |
| # "july", "august", "september", "october", "november", "december"], 1)} | |
| # ql = q.lower() | |
| # mon = next((v for k, v in month_map.items() if k in ql), None) | |
| # ym = re.search(r"(19|20)\d{2}", q) | |
| # yr = int(ym.group()) if ym else None | |
| # return mon, yr | |
| # def extract_date_range(query): | |
| # month_map = { | |
| # "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6, | |
| # "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12 | |
| # } | |
| # patterns = [ | |
| # r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})", | |
| # ] | |
| # for pattern in patterns: | |
| # match = re.search(pattern, query) | |
| # if match: | |
| # start_month_str, start_year = match.group(1).lower(), int(match.group(2)) | |
| # end_month_str, end_year = match.group(3).lower(), int(match.group(4)) | |
| # start_month = month_map.get(start_month_str) | |
| # end_month = month_map.get(end_month_str) | |
| # if start_month and end_month: | |
| # start_date = datetime(start_year, start_month, 1) | |
| # end_date = datetime(end_year, end_month, 28) | |
| # return start_date, end_date | |
| # return None, None | |
| # def extract_topic_match(query, df): | |
| # query_lower = query.lower() | |
| # return df[ | |
| # df['Category & Subcategory'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['Intent'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['Legislative Goal'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['Policy Impact Areas'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['Key Provisions'].fillna('').str.lower().str.contains(query_lower) | | |
| # df['Potential Impact'].fillna('').str.lower().str.contains(query_lower) | |
| # ] | |
| # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide") | |
| # st.title("Illinois Legislative Trends Q&A") | |
| # st.markdown("Ask about trends in topics like higher education, funding, etc.") | |
| # df = load_data() | |
| # embed_model, summarizer = load_models() | |
| # query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):") | |
| # if query: | |
| # start_date, end_date = extract_date_range(query) | |
| # df2 = extract_topic_match(query, df) | |
| # if df2.empty: | |
| # df2 = df | |
| # if start_date and end_date: | |
| # df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)] | |
| # st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**") | |
| # else: | |
| # mon, yr = extract_month_year(query) | |
| # if yr: | |
| # df2 = df2[df2['status_date'].dt.year == yr] | |
| # if mon: | |
| # df2 = df2[df2['status_date'].dt.month == mon] | |
| # st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**") | |
| # else: | |
| # st.info(f"Filtering by year: **{yr}**") | |
| # if df2.empty: | |
| # st.warning("No matching records found.") | |
| # else: | |
| # texts = df2['combined_text'].tolist() | |
| # embs = compute_embeddings(texts, _model=embed_model) | |
| # res = semantic_search(query, embs, embed_model, threshold=0.5) | |
| # if not res: | |
| # st.warning("No relevant insights found.") | |
| # else: | |
| # st.subheader("Top Matching Insights") | |
| # collected = [] | |
| # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: | |
| # row = df2.iloc[idx] | |
| # date = row['status_date'].date() | |
| # bill_number = row['bill_number'] | |
| # full_url = row['url'] | |
| # cat = row.get('Category & Subcategory', '') | |
| # bene = row.get('Intended Beneficiaries', '') | |
| # goal = row.get('Legislative Goal', '') | |
| # impact = row.get('Policy Impact Areas', '') | |
| # provision = row.get('Key Provisions', '') | |
| # intent = row.get('Intent', '') | |
| # stance = row.get('Stance', '') | |
| # description = row.get('description', '') | |
| # st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}") | |
| # st.markdown(f"**Category:** {cat}") | |
| # st.markdown(f"**Intended Beneficiaries:** {bene}") | |
| # st.markdown(f"**Goal:** {goal}") | |
| # st.markdown(f"**Intent:** {intent} | **Stance:** {stance}") | |
| # st.markdown(f"**Policy Impact Area:** {impact}") | |
| # st.markdown(f"**Key Provision:** {provision}") | |
| # st.markdown(f"**Description:** {description}") | |
| # st.markdown(f"[View Full Bill Text]({full_url})\n") | |
| # st.divider() | |
| # collected.append(row['combined_text']) | |
| # st.subheader("RAG-Generated Overall Summary") | |
| # summary = rag_summarize(collected, summarizer) | |
| # st.success(summary) | |
| #BART | |
import calendar
import re
from datetime import datetime

import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
def clean_text(text):
    """Strip LLM boilerplate from generated text.

    Removes "Here is/are the requested output(s):" lead-ins, trailing
    "Let me know if you'd like..." sign-offs, and the "Trend Summary:" /
    "Actionable Insight:" section labels, then trims whitespace.

    Returns "" for non-string input (e.g. NaN from an unfilled pandas
    column) instead of raising TypeError inside re.sub.
    """
    if not isinstance(text, str):
        # NaN/None can reach here when a source column was not fillna()'d.
        return ""
    text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
    text = re.sub(r"(?i)let me know if you'd like.*", "", text)
    text = re.sub(r"(?i)trend summary[:]*", "", text)
    text = re.sub(r"(?i)actionable insight[:]*", "", text)
    return text.strip()
def load_data(csv_path="Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv"):
    """Load the bills CSV and build the combined_text retrieval field.

    Args:
        csv_path: path to the bills CSV. Defaults to the production file so
            existing callers (``load_data()``) are unaffected.

    Returns:
        DataFrame with parsed ``status_date`` (rows that fail DD-MM-YYYY
        parsing are dropped), NaN-free feature columns, and a
        ``combined_text`` column concatenating the labeled features for
        embedding / TF-IDF.

    NOTE(review): earlier revisions wrapped this in @st.cache_data; without
    it the CSV is re-read on every Streamlit rerun — confirm whether the
    caching was dropped intentionally.
    """
    df = pd.read_csv(csv_path)
    # errors='coerce' turns unparseable dates into NaT so they can be dropped.
    df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
    df = df.dropna(subset=['status_date'])
    feature_cols = ["Legislative Goal", "Policy Impact Areas", "Key Provisions",
                    "Intended Beneficiaries", "Potential Impact", "description"]
    # Fill NaN with "" so the string concatenation below never produces NaN.
    for col in feature_cols:
        df[col] = df[col].fillna("")
    df["combined_text"] = (
        "Legislative Goal: " + df["Legislative Goal"] + "\n" +
        "Policy Impact Areas: " + df["Policy Impact Areas"] + "\n" +
        "Key Provisions: " + df["Key Provisions"] + "\n" +
        "Intended Beneficiaries: " + df["Intended Beneficiaries"] + "\n" +
        "Potential Impact: " + df["Potential Impact"] + "\n" +
        "Description: " + df["description"]
    )
    return df
def load_models():
    """Instantiate the retrieval and summarization models.

    Returns:
        (embedding_model, summarizer): a MiniLM sentence encoder for
        semantic search and a BART-large-CNN summarization pipeline.

    Model weights are downloaded on first call; no caching is applied here.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    # BART-large-CNN chosen over t5-small for better summary quality.
    bart = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        tokenizer="facebook/bart-large-cnn",
    )
    return encoder, bart
def compute_embeddings(texts, _model):
    """Encode *texts* with *_model*, showing an encoding progress bar.

    The leading underscore on _model follows the st.cache_data convention
    of excluding an argument from hashing (caching itself is not applied
    in this revision).
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors
def semantic_search(query, embeddings, model, threshold=0.5):
    """Return (index, similarity) pairs for corpus items similar to *query*.

    The query is embedded with *model* and compared to the precomputed
    *embeddings* by cosine similarity; only pairs strictly above
    *threshold* are returned, in corpus order.
    """
    q_vec = model.encode([query])
    scores = cosine_similarity(q_vec, embeddings)[0]
    hits = []
    for pos, score in enumerate(scores):
        if score > threshold:
            hits.append((pos, score))
    return hits
def rag_summarize(texts, summarizer, top_k=5):
    """Summarize the *top_k* most central texts.

    Centrality is TF-IDF cosine similarity to the corpus mean vector; the
    selected texts are concatenated (truncated to 1024 characters) and fed
    to *summarizer*.

    Returns the summary string, or a fallback message when there is
    nothing usable to summarize.
    """
    if not texts:
        return "No relevant content to summarize."
    vect = TfidfVectorizer()
    try:
        m = vect.fit_transform(texts)
    except ValueError:
        # All texts empty or stop-words only -> empty vocabulary; nothing to rank.
        return "No relevant content to summarize."
    mean_vec = m.mean(axis=0).A
    scores = cosine_similarity(mean_vec, m).flatten()
    top_indices = scores.argsort()[::-1][:top_k]
    ctx = "\n".join(texts[i] for i in top_indices)
    # No "summarize: " task prefix: that is a T5 convention and was a leftover
    # after the pipeline switched to facebook/bart-large-cnn.
    # NOTE(review): the 1024 cut is characters, not tokens — BART's tokenizer
    # truncates further if needed; confirm this is acceptable.
    out = summarizer(ctx[:1024], max_length=250, min_length=80, do_sample=False)
    return out[0]['summary_text']
def extract_month_year(q):
    """Pull an explicit month name and/or 4-digit year from a free-text query.

    Returns:
        (month_number_or_None, year_or_None).

    Month names must match as whole words, so "mayor" no longer registers
    as May and "marched" no longer registers as March (the previous
    substring test did both). The year must also stand alone as a 4-digit
    word starting 19xx/20xx.
    """
    months = ["january", "february", "march", "april", "may", "june",
              "july", "august", "september", "october", "november", "december"]
    ql = q.lower()
    mon = None
    for num, name in enumerate(months, 1):
        # \b...\b: whole-word match only.
        if re.search(r"\b" + name + r"\b", ql):
            mon = num
            break
    ym = re.search(r"\b(19|20)\d{2}\b", q)
    yr = int(ym.group()) if ym else None
    return mon, yr
def extract_date_range(query):
    """Parse a "Month YYYY to Month YYYY" span from free text.

    Accepts full month names or prefixes of at least 3 letters ("Jan",
    "Sept"), joined by to/through/and/-, optionally preceded by
    from/between — matching the UI example "Trends from Jan 2024 to May
    2025", which the previous exact-name lookup could not parse.

    Returns:
        (start_datetime, end_datetime) where the end is the true last day
        of its month (previously hard-coded to 28, which silently dropped
        bills dated the 29th-31st), or (None, None) if no range is found.
    """
    months = ["january", "february", "march", "april", "may", "june",
              "july", "august", "september", "october", "november", "december"]

    def month_number(token):
        # Prefix match, minimum 3 letters to stay unambiguous ("mar" vs "may").
        token = token.lower()
        if len(token) < 3:
            return None
        for num, name in enumerate(months, 1):
            if name.startswith(token):
                return num
        return None

    pattern = r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})"
    match = re.search(pattern, query)
    if match:
        start_month = month_number(match.group(1))
        end_month = month_number(match.group(3))
        if start_month and end_month:
            start_year, end_year = int(match.group(2)), int(match.group(4))
            last_day = calendar.monthrange(end_year, end_month)[1]
            return (datetime(start_year, start_month, 1),
                    datetime(end_year, end_month, last_day))
    return None, None
def extract_topic_match(query, df):
    """Return rows whose feature columns contain *query* (case-insensitive).

    The query is matched as a literal substring (regex=False), so user
    input containing regex metacharacters like "(", "?" or "*" no longer
    raises re.error or silently mis-matches, as the previous
    regex-interpreted str.contains did.
    """
    query_lower = query.lower()
    feature_cols = ['Category & Subcategory', 'Intent', 'Legislative Goal',
                    'Policy Impact Areas', 'Key Provisions', 'Potential Impact']
    mask = pd.Series(False, index=df.index)
    for col in feature_cols:
        mask |= df[col].fillna('').str.lower().str.contains(query_lower, regex=False)
    return df[mask]
# ---------------------------------------------------------------------------
# Streamlit app entry: Illinois Legislative Trends Q&A.
# Flow: load data + models -> take a free-text question -> filter rows by
# topic keywords and any date constraint found in the question -> rank the
# survivors by embedding similarity -> render the top matches and a
# RAG-style overall summary. Statement order matters: every st.* call
# renders in sequence on each Streamlit rerun.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
st.title("Illinois Legislative Trends Q&A")
st.markdown("Ask about trends in topics like higher education, funding, etc.")
# NOTE(review): load_data/load_models are uncached in this revision, so the
# CSV and both models are reloaded on every rerun — confirm this is intended.
df = load_data()
embed_model, summarizer = load_models()
query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
if query:
    # Prefer an explicit "Month YYYY to Month YYYY" range; fall back to a
    # single month/year below if none is present.
    start_date, end_date = extract_date_range(query)
    df2 = extract_topic_match(query, df)
    if df2.empty:
        # No keyword hit in the feature columns: search the whole dataset.
        df2 = df
    if start_date and end_date:
        df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
        st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
    else:
        mon, yr = extract_month_year(query)
        if yr:
            df2 = df2[df2['status_date'].dt.year == yr]
            if mon:
                df2 = df2[df2['status_date'].dt.month == mon]
                st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
            else:
                st.info(f"Filtering by year: **{yr}**")
    if df2.empty:
        st.warning("No matching records found.")
    else:
        texts = df2['combined_text'].tolist()
        # _model underscore is an st.cache_data hashing convention kept from
        # an earlier cached revision.
        embs = compute_embeddings(texts, _model=embed_model)
        res = semantic_search(query, embs, embed_model, threshold=0.5)
        if not res:
            st.warning("No relevant insights found.")
        else:
            st.subheader("Top Matching Insights")
            collected = []
            # Render at most the 10 highest-scoring matches. idx is
            # positional within df2 — the same order texts/embs were built
            # in — hence df2.iloc[idx], not .loc.
            for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
                row = df2.iloc[idx]
                date = row['status_date'].date()
                bill_number = row['bill_number']
                full_url = row['url']
                # .get with '' default: these feature columns may be absent
                # in older CSV exports.
                cat = row.get('Category & Subcategory', '')
                bene = row.get('Intended Beneficiaries', '')
                goal = row.get('Legislative Goal', '')
                impact = row.get('Policy Impact Areas', '')
                provision = row.get('Key Provisions', '')
                intent = row.get('Intent', '')
                stance = row.get('Stance', '')
                description = row.get('description', '')
                st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
                st.markdown(f"**Category:** {cat}")
                st.markdown(f"**Intended Beneficiaries:** {bene}")
                st.markdown(f"**Goal:** {goal}")
                st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
                st.markdown(f"**Policy Impact Area:** {impact}")
                st.markdown(f"**Key Provision:** {provision}")
                st.markdown(f"**Description:** {description}")
                st.markdown(f"[View Full Bill Text]({full_url})\n")
                st.divider()
                # Collect the full combined_text of each shown match for the
                # overall summary below.
                collected.append(row['combined_text'])
            st.subheader("RAG-Generated Overall Summary")
            summary = rag_summarize(collected, summarizer)
            st.success(summary)