# import streamlit as st
# import pandas as pd
# import re
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# from datetime import datetime

# def clean_text(text):
#     text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
#     text = re.sub(r"(?i)let me know if you'd like.*", "", text)
#     text = re.sub(r"(?i)trend summary[:]*", "", text)
#     text = re.sub(r"(?i)actionable insight[:]*", "", text)
#     return text.strip()

# @st.cache_data
# def load_data():
#     df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
#     df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
#     df = df.dropna(subset=['status_date'])
#     df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
#     df["llama_insight"] = df["llama_insight"].fillna("")
#     df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
#     return df

# @st.cache_resource
# def load_models():
#     embed_model = SentenceTransformer('all-MiniLM-L6-v2')
#     summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
#     return embed_model, summarizer

# @st.cache_data
# def compute_embeddings(texts, _model):
#     return _model.encode(texts, show_progress_bar=True)

# def semantic_search(query, embeddings, model, threshold=0.5):  
#     query_embedding = model.encode([query])
#     sims = cosine_similarity(query_embedding, embeddings)[0]
#     return [(i, s) for i, s in enumerate(sims) if s > threshold]

# def rag_summarize(texts, summarizer, top_k=5):
#     if not texts:
#         return "No relevant content to summarize."
#     vect = TfidfVectorizer()
#     m = vect.fit_transform(texts)
#     mean_vec = m.mean(axis=0).A
#     scores = cosine_similarity(mean_vec, m).flatten()
#     top_indices = scores.argsort()[::-1][:top_k]
#     ctx = "\n".join(texts[i] for i in top_indices)
#     prompt = "summarize: " + ctx[:1024]
#     out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
#     return out[0]['summary_text']

# def extract_month_year(q):
#     month_map = {m: i for i, m in enumerate(
#         ["january", "february", "march", "april", "may", "june",
#          "july", "august", "september", "october", "november", "december"], 1)}
#     ql = q.lower()
#     mon = next((v for k, v in month_map.items() if k in ql), None)
#     ym = re.search(r"(19|20)\d{2}", q)
#     yr = int(ym.group()) if ym else None
#     return mon, yr

# def extract_topic_match(query, df):
#     query_lower = query.lower()
#     return df[
#         df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
#         df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
#         df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
#         df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
#     ]

# st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
# st.title("Illinois Legislative Trends Q&A")
# st.markdown("Ask about trends in topics like higher education, funding, etc.")

# df = load_data()
# embed_model, summarizer = load_models()

# query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")

# if query:
#     mon, yr = extract_month_year(query)
#     df2 = extract_topic_match(query, df)

#     if df2.empty:
#         df2 = df
#     if yr:
#         df2 = df2[df2['status_date'].dt.year == yr]
#         if mon:
#             df2 = df2[df2['status_date'].dt.month == mon]
#             st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
#         else:
#             st.info(f" Filtering by year: **{yr}**")

#     if df2.empty:
#         st.warning("No matching records found.")
#     else:
#         texts = df2['summary_insight'].tolist()
#         embs = compute_embeddings(texts, _model=embed_model)
#         res = semantic_search(query, embs, embed_model, threshold=0.5)

#         if not res:
#             st.warning("No relevant insights found.")
#         else:
#             st.subheader(" Top Matching Insights")
#             collected = []

#             for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
#                 row = df2.iloc[idx]
#                 date = row['status_date'].date()
#                 bill_number = row['bill_number']
#                 full_url = row['url']
#                 cat = row['Category & Subcategory']
#                 cat_std = row['category_&_subcategory_standardized2']
#                 bene= row['Intended Beneficiaries']
#                 bene_std= row['intended_beneficiaries_standardized2']
#                 goal = row['Legislative Goal']
#                 impact = row['Policy Impact Areas']
#                 provision = row['Key Provisions']
#                 intent = row['Intent']
#                 stance = row['Stance']
#                 description = row['description']
#                 summary = row['summary']

#                 trend = clean_text(row['llama_trend_summary'])
#                 insight = clean_text(row['llama_insight'])

#                 st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
#                 st.markdown(f"**Category:** {cat}")
#                 st.markdown(f"**Category Std:** {cat_std}")
#                 st.markdown(f"** Intended Beneficiaries:** {bene}")
#                 st.markdown(f"** Intended Beneficiaries STD:** {bene_std}")
#                 st.markdown(f"**Goal:** {goal}")
#                 st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
#                 st.markdown(f"**Policy Impacy Area:** {impact}")
#                 st.markdown(f"**Key Provision:** {provision}")
#                 st.markdown(f"**Description:** {description}")
#                 st.markdown(f"**Summary:** {summary}")
#                 st.markdown(f"Trend Summary:{trend}")
#                 st.markdown(f"Actionable Insight:{insight}")
#                 st.markdown(f"[View Full Bill Text]({full_url})\n")
#                 st.divider()

#                 collected.append(row['summary_insight'])

#             st.subheader("RAG-Generated Overall Summary")
#             summary = rag_summarize(collected, summarizer)
#             st.success(summary)
# import streamlit as st
# import pandas as pd
# import re
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# from datetime import datetime

# def clean_text(text):
#     text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
#     text = re.sub(r"(?i)let me know if you'd like.*", "", text)
#     text = re.sub(r"(?i)trend summary[:]*", "", text)
#     text = re.sub(r"(?i)actionable insight[:]*", "", text)
#     return text.strip()

# @st.cache_data
# def load_data():
#     df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
#     df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
#     df = df.dropna(subset=['status_date'])
#     df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
#     df["llama_insight"] = df["llama_insight"].fillna("")
#     df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
#     return df

# @st.cache_resource
# def load_models():
#     embed_model = SentenceTransformer('all-MiniLM-L6-v2')
#     summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
#     return embed_model, summarizer

# @st.cache_data
# def compute_embeddings(texts, _model):
#     return _model.encode(texts, show_progress_bar=True)

# def semantic_search(query, embeddings, model, threshold=0.5):  
#     query_embedding = model.encode([query])
#     sims = cosine_similarity(query_embedding, embeddings)[0]
#     return [(i, s) for i, s in enumerate(sims) if s > threshold]

# def rag_summarize(texts, summarizer, top_k=10):  # increased from 5 to 10
#     if not texts:
#         return "No relevant content to summarize."
#     vect = TfidfVectorizer()
#     m = vect.fit_transform(texts)
#     mean_vec = m.mean(axis=0).A
#     scores = cosine_similarity(mean_vec, m).flatten()
#     top_indices = scores.argsort()[::-1][:top_k]
#     ctx = "\n".join(texts[i] for i in top_indices)
#     prompt = "summarize: " + ctx[:1024]
#     out = summarizer(prompt, max_length=150, min_length=80, do_sample=False)  # updated length
#     return out[0]['summary_text']

# def extract_month_year(q):
#     month_map = {m: i for i, m in enumerate(
#         ["january", "february", "march", "april", "may", "june",
#          "july", "august", "september", "october", "november", "december"], 1)}
#     ql = q.lower()
#     mon = next((v for k, v in month_map.items() if k in ql), None)
#     ym = re.search(r"(19|20)\d{2}", q)
#     yr = int(ym.group()) if ym else None
#     return mon, yr

# def extract_topic_match(query, df):
#     query_lower = query.lower()
#     return df[
#         df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
#         df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
#         df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
#         df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
#     ]

# st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
# st.title("Illinois Legislative Trends Q&A")
# st.markdown("Ask about trends in topics like higher education, funding, etc.")

# df = load_data()
# embed_model, summarizer = load_models()

# query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")

# if query:
#     mon, yr = extract_month_year(query)
#     df2 = extract_topic_match(query, df)

#     if df2.empty:
#         df2 = df
#     if yr:
#         df2 = df2[df2['status_date'].dt.year == yr]
#         if mon:
#             df2 = df2[df2['status_date'].dt.month == mon]
#             st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
#         else:
#             st.info(f" Filtering by year: **{yr}**")

#     if df2.empty:
#         st.warning("No matching records found.")
#     else:
#         texts = df2['summary_insight'].tolist()
#         embs = compute_embeddings(texts, _model=embed_model)
#         res = semantic_search(query, embs, embed_model, threshold=0.5)

#         if not res:
#             st.warning("No relevant insights found.")
#         else:
#             st.subheader("Top Matching Insights")
#             collected = []

#             for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:  # increased to 10
#                 row = df2.iloc[idx]
#                 date = row['status_date'].date()
#                 bill_number = row['bill_number']
#                 full_url = row['url']
#                 cat = row['Category & Subcategory']
#                 cat_std = row['category_&_subcategory_standardized2']
#                 bene= row['Intended Beneficiaries']
#                 bene_std= row['intended_beneficiaries_standardized2']
#                 goal = row['Legislative Goal']
#                 impact = row['Policy Impact Areas']
#                 provision = row['Key Provisions']
#                 intent = row['Intent']
#                 stance = row['Stance']
#                 description = row['description']
#                 summary = row['summary']

#                 trend = clean_text(row['llama_trend_summary'])
#                 insight = clean_text(row['llama_insight'])

#                 st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
#                 st.markdown(f"**Category:** {cat}")
#                 # st.markdown(f"**Category Std:** {cat_std}")
#                 st.markdown(f"**Intended Beneficiaries:** {bene}")
#                 # st.markdown(f"**Intended Beneficiaries STD:** {bene_std}")
#                 st.markdown(f"**Goal:** {goal}")
#                 st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
#                 st.markdown(f"**Policy Impact Area:** {impact}")
#                 st.markdown(f"**Key Provision:** {provision}")
#                 st.markdown(f"**Description:** {description}")
#                 # st.markdown(f"**Summary:** {summary}")
#                 st.markdown(f"**Trend Summary:** {trend}")
#                 st.markdown(f"**Actionable Insight:** {insight}")
#                 st.markdown(f"[View Full Bill Text]({full_url})\n")
#                 st.divider()

#                 collected.append(row['summary_insight'])

#             st.subheader("RAG-Generated Overall Summary")
#             summary = rag_summarize(collected, summarizer)
#             st.success(summary)

# 

# Earlier version (commented out): adds the bill description to the embedded and summarized text
# import streamlit as st
# import pandas as pd
# import re
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# from datetime import datetime

# def clean_text(text):
#     text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
#     text = re.sub(r"(?i)let me know if you'd like.*", "", text)
#     text = re.sub(r"(?i)trend summary[:]*", "", text)
#     text = re.sub(r"(?i)actionable insight[:]*", "", text)
#     return text.strip()

# @st.cache_data
# def load_data():
#     df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2_with_std2FV1.csv")
#     df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
#     df = df.dropna(subset=['status_date'])
#     df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
#     df["llama_insight"] = df["llama_insight"].fillna("")
#     df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
#     return df

# @st.cache_resource
# def load_models():
#     embed_model = SentenceTransformer('all-MiniLM-L6-v2')
#     summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
#     return embed_model, summarizer

# @st.cache_data
# def compute_embeddings(texts, _model):
#     return _model.encode(texts, show_progress_bar=True)

# def semantic_search(query, embeddings, model, threshold=0.5):  
#     query_embedding = model.encode([query])
#     sims = cosine_similarity(query_embedding, embeddings)[0]
#     return [(i, s) for i, s in enumerate(sims) if s > threshold]

# def rag_summarize(texts, summarizer, top_k=10):
#     if not texts:
#         return "No relevant content to summarize."
#     vect = TfidfVectorizer()
#     m = vect.fit_transform(texts)
#     mean_vec = m.mean(axis=0).A
#     scores = cosine_similarity(mean_vec, m).flatten()
#     top_indices = scores.argsort()[::-1][:top_k]
#     ctx = "\n".join(texts[i] for i in top_indices)
#     prompt = "summarize: " + ctx[:1024]
#     out = summarizer(prompt, max_length=200, min_length=80, do_sample=False)  
#     return out[0]['summary_text']

# def extract_month_year(q):
#     month_map = {m: i for i, m in enumerate(
#         ["january", "february", "march", "april", "may", "june",
#          "july", "august", "september", "october", "november", "december"], 1)}
#     ql = q.lower()
#     mon = next((v for k, v in month_map.items() if k in ql), None)
#     ym = re.search(r"(19|20)\d{2}", q)
#     yr = int(ym.group()) if ym else None
#     return mon, yr

# def extract_date_range(query):
#     month_map = {
#         "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
#         "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
#     }

#     patterns = [
#         r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})",
#     ]

#     for pattern in patterns:
#         match = re.search(pattern, query)
#         if match:
#             start_month_str, start_year = match.group(1).lower(), int(match.group(2))
#             end_month_str, end_year = match.group(3).lower(), int(match.group(4))

#             start_month = month_map.get(start_month_str)
#             end_month = month_map.get(end_month_str)

#             if start_month and end_month:
#                 start_date = datetime(start_year, start_month, 1)
#                 end_date = datetime(end_year, end_month, 28)
#                 return start_date, end_date

#     return None, None

# def extract_topic_match(query, df):
#     query_lower = query.lower()
#     return df[
#         df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
#         df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
#         df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
#         df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
#     ]

# st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
# st.title("Illinois Legislative Trends Q&A")
# st.markdown("Ask about trends in topics like higher education, funding, etc.")

# df = load_data()
# embed_model, summarizer = load_models()

# query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")

# if query:
#     start_date, end_date = extract_date_range(query)
#     df2 = extract_topic_match(query, df)

#     if df2.empty:
#         df2 = df

#     if start_date and end_date:
#         df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
#         st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
#     else:
#         mon, yr = extract_month_year(query)
#         if yr:
#             df2 = df2[df2['status_date'].dt.year == yr]
#             if mon:
#                 df2 = df2[df2['status_date'].dt.month == mon]
#                 st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
#             else:
#                 st.info(f"Filtering by year: **{yr}**")

#     if df2.empty:
#         st.warning("No matching records found.")
#     else:
#         # Include description in embeddings + RAG
#         texts = (df2['description'].fillna('') + "\n" + df2['summary_insight'].fillna('')).tolist()
#         embs = compute_embeddings(texts, _model=embed_model)
#         res = semantic_search(query, embs, embed_model, threshold=0.5)

#         if not res:
#             st.warning("No relevant insights found.")
#         else:
#             st.subheader("Top Matching Insights")
#             collected = []

#             for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: 
#                 row = df2.iloc[idx]
#                 date = row['status_date'].date()
#                 bill_number = row['bill_number']
#                 full_url = row['url']
#                 cat = row['Category & Subcategory']
#                 cat_std = row['category_&_subcategory_standardized2']
#                 bene = row['Intended Beneficiaries']
#                 bene_std = row['intended_beneficiaries_standardized2']
#                 goal = row['Legislative Goal']
#                 impact = row['Policy Impact Areas']
#                 provision = row['Key Provisions']
#                 intent = row['Intent']
#                 stance = row['Stance']
#                 description = row['description']
#                 summary = row['summary']
#                 trend = clean_text(row['llama_trend_summary'])
#                 insight = clean_text(row['llama_insight'])

#                 st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
#                 st.markdown(f"**Category:** {cat}")
#                 st.markdown(f"**Intended Beneficiaries:** {bene}")
#                 st.markdown(f"**Goal:** {goal}")
#                 st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
#                 st.markdown(f"**Policy Impact Area:** {impact}")
#                 st.markdown(f"**Key Provision:** {provision}")
#                 st.markdown(f"**Description:** {description}")
#                 st.markdown(f"**Trend Summary:** {trend}")
#                 st.markdown(f"**Actionable Insight:** {insight}")
#                 st.markdown(f"[View Full Bill Text]({full_url})\n")
#                 st.divider()

#                 collected.append(description + "\n" + row['summary_insight'])

#             st.subheader("RAG-Generated Overall Summary")
#             summary = rag_summarize(collected, summarizer)
#             st.success(summary)


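## Earlier version (commented out): reads the Illinois_Education_Bills CSV and builds a combined_text field; summarizer is still t5-small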

# import streamlit as st
# import pandas as pd
# import re
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# from datetime import datetime

# def clean_text(text):
#     text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
#     text = re.sub(r"(?i)let me know if you'd like.*", "", text)
#     text = re.sub(r"(?i)trend summary[:]*", "", text)
#     text = re.sub(r"(?i)actionable insight[:]*", "", text)
#     return text.strip()

# @st.cache_data
# def load_data():
#     df = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv")
#     df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
#     df = df.dropna(subset=['status_date'])

#     for col in ["Legislative Goal", "Policy Impact Areas", "Key Provisions",
#                 "Intended Beneficiaries", "Potential Impact", "description"]:
#         df[col] = df[col].fillna("")

#     df["combined_text"] = (
#         "Legislative Goal: " + df["Legislative Goal"] + "\n" +
#         "Policy Impact Areas: " + df["Policy Impact Areas"] + "\n" +
#         "Key Provisions: " + df["Key Provisions"] + "\n" +
#         "Intended Beneficiaries: " + df["Intended Beneficiaries"] + "\n" +
#         "Potential Impact: " + df["Potential Impact"] + "\n" +
#         "Description: " + df["description"]
#     )

#     return df

# @st.cache_resource
# def load_models():
#     embed_model = SentenceTransformer('all-MiniLM-L6-v2')
#     summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
#     return embed_model, summarizer

# @st.cache_data
# def compute_embeddings(texts, _model):
#     return _model.encode(texts, show_progress_bar=True)

# def semantic_search(query, embeddings, model, threshold=0.5):  
#     query_embedding = model.encode([query])
#     sims = cosine_similarity(query_embedding, embeddings)[0]
#     return [(i, s) for i, s in enumerate(sims) if s > threshold]

# def rag_summarize(texts, summarizer, top_k=5):
#     if not texts:
#         return "No relevant content to summarize."
#     vect = TfidfVectorizer()
#     m = vect.fit_transform(texts)
#     mean_vec = m.mean(axis=0).A
#     scores = cosine_similarity(mean_vec, m).flatten()
#     top_indices = scores.argsort()[::-1][:top_k]
#     ctx = "\n".join(texts[i] for i in top_indices)
#     prompt = "summarize: " + ctx[:1024]
#     out = summarizer(prompt, max_length=200, min_length=80, do_sample=False)  
#     return out[0]['summary_text']

# def extract_month_year(q):
#     month_map = {m: i for i, m in enumerate(
#         ["january", "february", "march", "april", "may", "june",
#          "july", "august", "september", "october", "november", "december"], 1)}
#     ql = q.lower()
#     mon = next((v for k, v in month_map.items() if k in ql), None)
#     ym = re.search(r"(19|20)\d{2}", q)
#     yr = int(ym.group()) if ym else None
#     return mon, yr

# def extract_date_range(query):
#     month_map = {
#         "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
#         "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
#     }

#     patterns = [
#         r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})",
#     ]

#     for pattern in patterns:
#         match = re.search(pattern, query)
#         if match:
#             start_month_str, start_year = match.group(1).lower(), int(match.group(2))
#             end_month_str, end_year = match.group(3).lower(), int(match.group(4))

#             start_month = month_map.get(start_month_str)
#             end_month = month_map.get(end_month_str)

#             if start_month and end_month:
#                 start_date = datetime(start_year, start_month, 1)
#                 end_date = datetime(end_year, end_month, 28)
#                 return start_date, end_date

#     return None, None


# def extract_topic_match(query, df):
#     query_lower = query.lower()
#     return df[
#         df['Category & Subcategory'].fillna('').str.lower().str.contains(query_lower) |
#         df['Intent'].fillna('').str.lower().str.contains(query_lower) |
#         df['Legislative Goal'].fillna('').str.lower().str.contains(query_lower) |
#         df['Policy Impact Areas'].fillna('').str.lower().str.contains(query_lower) |
#         df['Key Provisions'].fillna('').str.lower().str.contains(query_lower) |
#         df['Potential Impact'].fillna('').str.lower().str.contains(query_lower)
#     ]


# st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
# st.title("Illinois Legislative Trends Q&A")
# st.markdown("Ask about trends in topics like higher education, funding, etc.")

# df = load_data()
# embed_model, summarizer = load_models()

# query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")

# if query:
#     start_date, end_date = extract_date_range(query)
#     df2 = extract_topic_match(query, df)

#     if df2.empty:
#         df2 = df

#     if start_date and end_date:
#         df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
#         st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
#     else:
#         mon, yr = extract_month_year(query)
#         if yr:
#             df2 = df2[df2['status_date'].dt.year == yr]
#             if mon:
#                 df2 = df2[df2['status_date'].dt.month == mon]
#                 st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
#             else:
#                 st.info(f"Filtering by year: **{yr}**")

#     if df2.empty:
#         st.warning("No matching records found.")
#     else:
#         texts = df2['combined_text'].tolist()
#         embs = compute_embeddings(texts, _model=embed_model)
#         res = semantic_search(query, embs, embed_model, threshold=0.5)

#         if not res:
#             st.warning("No relevant insights found.")
#         else:
#             st.subheader("Top Matching Insights")
#             collected = []

#             for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: 
#                 row = df2.iloc[idx]
#                 date = row['status_date'].date()
#                 bill_number = row['bill_number']
#                 full_url = row['url']
#                 cat = row.get('Category & Subcategory', '')
#                 bene = row.get('Intended Beneficiaries', '')
#                 goal = row.get('Legislative Goal', '')
#                 impact = row.get('Policy Impact Areas', '')
#                 provision = row.get('Key Provisions', '')
#                 intent = row.get('Intent', '')
#                 stance = row.get('Stance', '')
#                 description = row.get('description', '')

#                 st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
#                 st.markdown(f"**Category:** {cat}")
#                 st.markdown(f"**Intended Beneficiaries:** {bene}")
#                 st.markdown(f"**Goal:** {goal}")
#                 st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
#                 st.markdown(f"**Policy Impact Area:** {impact}")
#                 st.markdown(f"**Key Provision:** {provision}")
#                 st.markdown(f"**Description:** {description}")
#                 st.markdown(f"[View Full Bill Text]({full_url})\n")
#                 st.divider()

#                 collected.append(row['combined_text'])

#             st.subheader("RAG-Generated Overall Summary")
#             summary = rag_summarize(collected, summarizer)
#             st.success(summary)


# Active version: same pipeline as above, summarizing with facebook/bart-large-cnn (BART)
import streamlit as st
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime

def clean_text(text):
    """Strip LLM boilerplate phrases from a generated summary or insight.

    The following are removed case-insensitively, in this order:

    1. "here is/are the requested output(s):"
    2. "let me know if you'd like ..." through the end of that line
    3. "trend summary:" labels
    4. "actionable insight:" labels

    Args:
        text: Raw generated text. Any non-string value (``None``, or the
            float NaN that pandas yields for an empty CSV cell) is treated
            as empty text.

    Returns:
        The cleaned text with leading/trailing whitespace stripped, or ``""``
        when ``text`` is not a string.
    """
    # re.sub raises TypeError on None/NaN, which would abort rendering of a
    # whole result row; a missing value is simply "nothing to show".
    if not isinstance(text, str):
        return ""

    boilerplate_patterns = (
        r"(?i)(here is|here are) the requested output[s]*[:]*",
        r"(?i)let me know if you'd like.*",
        r"(?i)trend summary[:]*",
        r"(?i)actionable insight[:]*",
    )
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, "", text)
    return text.strip()

@st.cache_data
def load_data():
    """Read the Illinois education bills CSV and prepare it for searching.

    Processing steps:

    * ``status_date`` is parsed as day-month-year; values that do not parse
      become NaT and those rows are dropped.
    * Missing values in the text columns listed below are replaced with "".
    * A ``combined_text`` column is built by joining those columns, each
      prefixed with its label, one per line.

    The result is cached through ``st.cache_data``.

    Returns:
        The prepared DataFrame, including the ``combined_text`` column.
    """
    # (label prefix, source column), in the order they appear in combined_text.
    labelled_columns = (
        ("Legislative Goal: ", "Legislative Goal"),
        ("Policy Impact Areas: ", "Policy Impact Areas"),
        ("Key Provisions: ", "Key Provisions"),
        ("Intended Beneficiaries: ", "Intended Beneficiaries"),
        ("Potential Impact: ", "Potential Impact"),
        ("Description: ", "description"),
    )

    bills = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv")
    parsed_dates = pd.to_datetime(bills['status_date'], format='%d-%m-%Y', errors='coerce')
    bills['status_date'] = parsed_dates
    bills = bills.dropna(subset=['status_date'])

    # Blank out missing text so the string concatenation below never sees NaN.
    for _, column in labelled_columns:
        bills[column] = bills[column].fillna("")

    sections = [prefix + bills[column] for prefix, column in labelled_columns]
    combined = sections[0]
    for section in sections[1:]:
        combined = combined + "\n" + section
    bills["combined_text"] = combined

    return bills

@st.cache_resource
def load_models():
    """Create the embedding model and the summarization pipeline.

    The pair is cached through ``st.cache_resource``.

    Returns:
        A ``(embed_model, summarizer)`` tuple: the ``all-MiniLM-L6-v2``
        SentenceTransformer and a ``summarization`` pipeline backed by
        ``facebook/bart-large-cnn``.
    """
    # facebook/bart-large-cnn replaced the earlier summarizer for better
    # summary quality; model and tokenizer use the same checkpoint.
    summarizer_checkpoint = "facebook/bart-large-cnn"

    sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')
    bart_summarizer = pipeline(
        "summarization",
        model=summarizer_checkpoint,
        tokenizer=summarizer_checkpoint,
    )
    return sentence_encoder, bart_summarizer

@st.cache_data
def compute_embeddings(texts, _model):
    """Encode ``texts`` with ``_model`` and return the resulting embeddings.

    The result is cached through ``st.cache_data``.

    Args:
        texts: The texts to embed.
        _model: Object exposing ``encode(texts, show_progress_bar=...)``.
            NOTE: the leading underscore is significant to ``st.cache_data``
            (it excludes the argument from cache-key hashing), so this
            parameter must keep its name.

    Returns:
        Whatever ``_model.encode`` returns for ``texts``.
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors

def semantic_search(query, embeddings, model, threshold=0.5):
    """Find the embeddings whose cosine similarity to ``query`` exceeds a cutoff.

    Args:
        query: The user's question; it is encoded with ``model``.
        embeddings: Pre-computed embeddings to compare against, one row per
            candidate text.
        model: Object exposing ``encode`` used to embed the query.
        threshold: Similarity cutoff; only scores strictly greater than this
            are kept.

    Returns:
        A list of ``(row_position, similarity)`` tuples in row order
        (not sorted by score).
    """
    query_vector = model.encode([query])
    # cosine_similarity returns one row per query; there is exactly one query.
    similarity_row = cosine_similarity(query_vector, embeddings)[0]

    hits = []
    for position, similarity in enumerate(similarity_row):
        if similarity > threshold:
            hits.append((position, similarity))
    return hits

def rag_summarize(texts, summarizer, top_k=5):
    """Summarize the most representative of ``texts`` with ``summarizer``.

    Texts are ranked by cosine similarity between their TF-IDF vector and the
    mean TF-IDF vector of all texts. The ``top_k`` highest-ranked texts are
    joined with newlines, truncated to the first 1024 characters, prefixed
    with "summarize: " and passed to the summarizer.

    Args:
        texts: Sequence of candidate passages.
        summarizer: Callable invoked as
            ``summarizer(prompt, max_length=250, min_length=80, do_sample=False)``
            that returns a list whose first item has a ``'summary_text'`` key.
        top_k: Number of top-ranked texts to include in the prompt.

    Returns:
        The generated summary text, or a fixed message when ``texts`` is empty.
    """
    if not texts:
        return "No relevant content to summarize."

    tfidf_matrix = TfidfVectorizer().fit_transform(texts)
    # .A turns the mean row into a plain 2-D array for cosine_similarity.
    centroid = tfidf_matrix.mean(axis=0).A
    closeness = cosine_similarity(centroid, tfidf_matrix).flatten()

    # Highest similarity first.
    ranked_positions = closeness.argsort()[::-1]
    selected = [texts[position] for position in ranked_positions[:top_k]]

    context = "\n".join(selected)
    summarizer_input = "summarize: " + context[:1024]
    result = summarizer(summarizer_input, max_length=250, min_length=80, do_sample=False)
    return result[0]['summary_text']

def extract_month_year(q):
    """Pull a month and a four-digit year out of a free-text query.

    A month is recognised by its full English name or its three-letter
    abbreviation (plus "sept"), case-insensitively, and only as a whole word,
    so "dismay" or "marching" do not count as May or March. When several
    months are named, the one mentioned first in the query wins.

    A year is a standalone four-digit number starting with 19 or 20; digits
    embedded in a longer number (e.g. "120245") are ignored.

    Args:
        q: The user's query string.

    Returns:
        A ``(month, year)`` tuple. ``month`` is 1-12 or ``None``; ``year`` is
        an int or ``None``.
    """
    month_names = ["january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november", "december"]

    month_lookup = {}
    for number, name in enumerate(month_names, 1):
        month_lookup[name] = number
        month_lookup[name[:3]] = number
    month_lookup["sept"] = 9

    # Longest alternatives first so a full name is preferred over its prefix.
    alternation = "|".join(sorted(month_lookup, key=len, reverse=True))
    month_match = re.search(rf"\b({alternation})\b", q.lower())
    mon = month_lookup[month_match.group(1)] if month_match else None

    # Look-arounds keep the year from matching inside a longer digit run.
    year_match = re.search(r"(?<!\d)(?:19|20)\d{2}(?!\d)", q)
    yr = int(year_match.group()) if year_match else None
    return mon, yr

def extract_date_range(query):
    """Parse a "<month> <year> to <month> <year>" span out of a query.

    Month names are matched case-insensitively as whole words and may be
    written in full or abbreviated ("Jan", "Sept", optionally with a trailing
    period). The two endpoints may be joined by "to", "through", "and", a
    hyphen, or an en/em dash.

    Args:
        query: The user's query string.

    Returns:
        ``(start_date, end_date)`` as ``datetime`` objects: the first day of
        the earlier month and the last calendar day of the later month (both
        at midnight). If the endpoints are given in reverse order they are
        swapped. ``(None, None)`` is returned when no range is found or when
        a year is outside what ``datetime`` supports.
    """
    import calendar  # stdlib; local import keeps this function self-contained

    month_map = {
        "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
        "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
    }

    # Accept three-letter abbreviations (and "sept") alongside full names.
    month_lookup = dict(month_map)
    for name, number in month_map.items():
        month_lookup.setdefault(name[:3], number)
    month_lookup["sept"] = 9

    # Longest alternatives first so full names are preferred over prefixes.
    month_alt = "|".join(sorted(month_lookup, key=len, reverse=True))
    range_pattern = (
        rf"(?i)\b({month_alt})\b\.?\s+(\d{{4}})\s*(?:to|through|and|-|–|—)\s*"
        rf"\b({month_alt})\b\.?\s+(\d{{4}})(?!\d)"
    )

    match = re.search(range_pattern, query)
    if match is None:
        return None, None

    first_name, first_year, second_name, second_year = match.groups()
    first = (int(first_year), month_lookup[first_name.lower()])
    second = (int(second_year), month_lookup[second_name.lower()])
    # Tolerate "from May 2025 to January 2024" by ordering the endpoints.
    if first > second:
        first, second = second, first

    try:
        start_date = datetime(first[0], first[1], 1)
        # monthrange gives the real month length (28-31, leap years
        # included) so records late in the end month are not cut off.
        last_day = calendar.monthrange(second[0], second[1])[1]
        end_date = datetime(second[0], second[1], last_day)
    except ValueError:
        # e.g. year "0000", which datetime cannot represent.
        return None, None

    return start_date, end_date


def extract_topic_match(query, df):
    """Return the rows of ``df`` whose topic columns mention ``query``.

    The lower-cased query is looked up as a literal, case-insensitive
    substring in each topic column; a row is kept if any column contains it.
    The query is deliberately NOT treated as a regular expression: it is
    free text typed by the user, and characters such as "?", "(" or "+"
    would otherwise change the match or raise ``re.error``.

    Args:
        query: The user's query string.
        df: DataFrame containing the topic columns listed below.

    Returns:
        The matching subset of ``df`` (possibly empty).

    Raises:
        KeyError: If ``df`` lacks one of the topic columns.
    """
    topic_columns = [
        'Category & Subcategory',
        'Intent',
        'Legislative Goal',
        'Policy Impact Areas',
        'Key Provisions',
        'Potential Impact',
    ]
    needle = query.lower()

    mask = None
    for column in topic_columns:
        # Missing values become empty strings so they simply never match.
        hits = df[column].fillna('').str.lower().str.contains(needle, regex=False)
        mask = hits if mask is None else mask | hits
    return df[mask]


# ---------------------------------------------------------------------------
# Streamlit page: ask a question, filter the dataset by topic/date, show the
# best semantic matches, then summarize them.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
st.title("Illinois Legislative Trends Q&A")
st.markdown("Ask about trends in topics like higher education, funding, etc.")

df = load_data()
embed_model, summarizer = load_models()

query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")

# Nothing below runs until the user has typed a non-empty query.
if query:
    start_date, end_date = extract_date_range(query)
    # Topic filter: rows whose topic columns contain the query text.
    df2 = extract_topic_match(query, df)

    # No topic hit: fall back to searching the whole dataset.
    if df2.empty:
        df2 = df

    # An explicit "<month> <year> to <month> <year>" range takes priority;
    # otherwise try a single month/year mentioned in the query.
    if start_date and end_date:
        df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
        st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
    else:
        mon, yr = extract_month_year(query)
        # The month filter is only applied together with a year; a month
        # without a year results in no date filtering at all.
        if yr:
            df2 = df2[df2['status_date'].dt.year == yr]
            if mon:
                df2 = df2[df2['status_date'].dt.month == mon]
                st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
            else:
                st.info(f"Filtering by year: **{yr}**")

    if df2.empty:
        st.warning("No matching records found.")
    else:
        # Embeddings are computed over the filtered rows only, so the
        # positions returned by semantic_search index into df2 (via iloc).
        texts = df2['combined_text'].tolist()
        embs = compute_embeddings(texts, _model=embed_model)
        res = semantic_search(query, embs, embed_model, threshold=0.5)

        if not res:
            st.warning("No relevant insights found.")
        else:
            st.subheader("Top Matching Insights")
            # Texts of the displayed rows; these feed the summary below.
            collected = []

            # Show at most the 10 highest-scoring matches, best first.
            for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]: 
                row = df2.iloc[idx]
                date = row['status_date'].date()
                bill_number = row['bill_number']
                full_url = row['url']
                # .get() with a default keeps rendering going if one of
                # these optional fields is absent from the row.
                cat = row.get('Category & Subcategory', '')
                bene = row.get('Intended Beneficiaries', '')
                goal = row.get('Legislative Goal', '')
                impact = row.get('Policy Impact Areas', '')
                provision = row.get('Key Provisions', '')
                intent = row.get('Intent', '')
                stance = row.get('Stance', '')
                description = row.get('description', '')

                st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
                st.markdown(f"**Category:** {cat}")
                st.markdown(f"**Intended Beneficiaries:** {bene}")
                st.markdown(f"**Goal:** {goal}")
                st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
                st.markdown(f"**Policy Impact Area:** {impact}")
                st.markdown(f"**Key Provision:** {provision}")
                st.markdown(f"**Description:** {description}")
                st.markdown(f"[View Full Bill Text]({full_url})\n")
                st.divider()

                collected.append(row['combined_text'])

            # Summarize only the rows that were actually displayed.
            st.subheader("RAG-Generated Overall Summary")
            summary = rag_summarize(collected, summarizer)
            st.success(summary)