Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
| 1 |
-
# app.py
|
| 2 |
-
|
| 3 |
import streamlit as st
|
| 4 |
import pandas as pd
|
| 5 |
import re
|
|
@@ -40,6 +38,8 @@ def semantic_search(query, embeddings, model, threshold=0.4):
|
|
| 40 |
|
| 41 |
# RAG summarization
|
| 42 |
def rag_summarize(group_texts, summarizer, top_k=5):
|
|
|
|
|
|
|
| 43 |
vectorizer = TfidfVectorizer()
|
| 44 |
tfidf_matrix = vectorizer.fit_transform(group_texts)
|
| 45 |
mean_vector = tfidf_matrix.mean(axis=0).A
|
|
@@ -50,92 +50,60 @@ def rag_summarize(group_texts, summarizer, top_k=5):
|
|
| 50 |
result = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
|
| 51 |
return result[0]['summary_text']
|
| 52 |
|
| 53 |
-
#
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
trend_level = None
|
| 57 |
-
if "monthly" in query:
|
| 58 |
-
trend_level = "monthly"
|
| 59 |
-
elif "quarterly" in query:
|
| 60 |
-
trend_level = "quarterly"
|
| 61 |
-
elif "yearly" in query or "annual" in query:
|
| 62 |
-
trend_level = "yearly"
|
| 63 |
-
|
| 64 |
-
year_match = re.search(r"(19|20)\d{2}", query)
|
| 65 |
-
year = int(year_match.group()) if year_match else None
|
| 66 |
-
|
| 67 |
-
return trend_level, year
|
| 68 |
-
|
| 69 |
-
# Group and summarize
|
| 70 |
-
def generate_trend_summary(df_filtered, level, summarizer):
|
| 71 |
-
if level == "monthly":
|
| 72 |
-
df_filtered['month'] = df_filtered['status_date'].dt.to_period('M').apply(lambda r: r.start_time)
|
| 73 |
-
grouped = df_filtered.groupby('month')['summary_insight'].apply(list).reset_index()
|
| 74 |
-
elif level == "quarterly":
|
| 75 |
-
df_filtered['quarter'] = df_filtered['status_date'].dt.to_period('Q').apply(lambda r: r.start_time)
|
| 76 |
-
grouped = df_filtered.groupby('quarter')['summary_insight'].apply(list).reset_index()
|
| 77 |
-
elif level == "yearly":
|
| 78 |
-
df_filtered['year'] = df_filtered['status_date'].dt.year
|
| 79 |
-
grouped = df_filtered.groupby('year')['summary_insight'].apply(list).reset_index()
|
| 80 |
-
else:
|
| 81 |
-
return None # Should not happen
|
| 82 |
-
|
| 83 |
-
summaries = []
|
| 84 |
-
for i, row in grouped.iterrows():
|
| 85 |
-
summary = rag_summarize(row['summary_insight'], summarizer)
|
| 86 |
-
summaries.append((row[0], summary))
|
| 87 |
-
|
| 88 |
-
return summaries
|
| 89 |
|
| 90 |
-
#
|
| 91 |
-
st.set_page_config(page_title="Illinois Legislative Explorer", layout="wide")
|
| 92 |
-
st.title("π Illinois Bill Trends Explorer (Monthly, Quarterly, Yearly)")
|
| 93 |
-
st.markdown("Ask a question like:")
|
| 94 |
-
st.markdown("- *What are the monthly trends in 2024?*")
|
| 95 |
-
st.markdown("- *Give me quarterly updates for 2023*")
|
| 96 |
-
st.markdown("- *Yearly trends in equity bills 2022*")
|
| 97 |
-
st.markdown("- *What are the bills about clean energy?*")
|
| 98 |
-
|
| 99 |
-
# Load
|
| 100 |
df = load_data()
|
| 101 |
embed_model, summarizer = load_models()
|
| 102 |
|
| 103 |
-
#
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
-
|
| 107 |
-
|
|
|
|
| 108 |
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
df_filtered = df
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
st.subheader(f"📅 {period.strftime('%B %Y') if trend_level == 'monthly' else period.strftime('%Y Q%q') if trend_level == 'quarterly' else str(period)}")
|
| 125 |
-
st.success(summary)
|
| 126 |
-
else:
|
| 127 |
-
st.warning("No trends found for the selected timeline.")
|
| 128 |
|
|
|
|
|
|
|
|
|
|
| 129 |
else:
|
| 130 |
-
# No trend level -> semantic search for individual bills
|
| 131 |
-
st.info("No trend level mentioned — showing top relevant bills from the data.")
|
| 132 |
embeddings = compute_embeddings(df_filtered["summary_insight"].tolist(), _model=embed_model)
|
| 133 |
-
results = semantic_search(query, embeddings, embed_model)
|
| 134 |
|
| 135 |
if not results:
|
| 136 |
-
st.warning("No relevant
|
| 137 |
else:
|
|
|
|
|
|
|
| 138 |
for idx, score in sorted(results, key=lambda x: x[1], reverse=True)[:5]:
|
| 139 |
row = df_filtered.iloc[idx]
|
| 140 |
-
st.markdown(f"**📅 Date:** {row['status_date'].date()} | **Score:** {score:.2f}")
|
|
|
|
|
|
|
|
|
|
| 141 |
st.markdown(f"```{row['summary_insight'][:500]}```")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
import re
|
|
|
|
| 38 |
|
| 39 |
# RAG summarization
|
| 40 |
def rag_summarize(group_texts, summarizer, top_k=5):
|
| 41 |
+
if not group_texts:
|
| 42 |
+
return "No relevant content to summarize."
|
| 43 |
vectorizer = TfidfVectorizer()
|
| 44 |
tfidf_matrix = vectorizer.fit_transform(group_texts)
|
| 45 |
mean_vector = tfidf_matrix.mean(axis=0).A
|
|
|
|
| 50 |
result = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
|
| 51 |
return result[0]['summary_text']
|
| 52 |
|
| 53 |
+
# Streamlit UI
|
| 54 |
+
st.set_page_config(page_title="Illinois Trends Q&A", layout="wide")
|
| 55 |
+
st.title("π Illinois Trends Explorer")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
+
# Load data & models
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
df = load_data()
|
| 59 |
embed_model, summarizer = load_models()
|
| 60 |
|
| 61 |
+
# Sidebar filters
|
| 62 |
+
st.sidebar.header("📅 Filter Options")
|
| 63 |
+
years = sorted(df['status_date'].dt.year.unique(), reverse=True)
|
| 64 |
+
months = list(range(1, 13))
|
| 65 |
+
month_names = {i: datetime(2000, i, 1).strftime('%B') for i in months}
|
| 66 |
|
| 67 |
+
selected_year = st.sidebar.selectbox("Select Year", years)
|
| 68 |
+
selected_month = st.sidebar.selectbox("Select Month (optional)", [None] + months, format_func=lambda x: "All" if x is None else month_names[x])
|
| 69 |
+
selected_category = st.sidebar.selectbox("Select Category (optional)", ["All"] + sorted(df['category_&_subcategory_standardized'].dropna().unique()))
|
| 70 |
|
| 71 |
+
# Filter data
|
| 72 |
+
df_filtered = df[df['status_date'].dt.year == selected_year]
|
| 73 |
+
if selected_month:
|
| 74 |
+
df_filtered = df_filtered[df_filtered['status_date'].dt.month == selected_month]
|
| 75 |
+
if selected_category != "All":
|
| 76 |
+
df_filtered = df_filtered[df_filtered['category_&_subcategory_standardized'] == selected_category]
|
|
|
|
| 77 |
|
| 78 |
+
# Summary statistics
|
| 79 |
+
st.markdown(f"### π Top Categories in {month_names.get(selected_month, 'All Months')} {selected_year}")
|
| 80 |
+
top_cats = df_filtered['category_&_subcategory_standardized'].value_counts().head(5)
|
| 81 |
+
st.bar_chart(top_cats)
|
| 82 |
|
| 83 |
+
# Question input
|
| 84 |
+
query = st.text_input("π Ask your question about trends:")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
+
if query:
|
| 87 |
+
if df_filtered.empty:
|
| 88 |
+
st.warning("No data available for this filter.")
|
| 89 |
else:
|
|
|
|
|
|
|
| 90 |
embeddings = compute_embeddings(df_filtered["summary_insight"].tolist(), _model=embed_model)
|
| 91 |
+
results = semantic_search(query, embeddings, embed_model, threshold=0.4)
|
| 92 |
|
| 93 |
if not results:
|
| 94 |
+
st.warning("No relevant insights found.")
|
| 95 |
else:
|
| 96 |
+
top_texts = []
|
| 97 |
+
st.subheader("π Top Matching Insights:")
|
| 98 |
for idx, score in sorted(results, key=lambda x: x[1], reverse=True)[:5]:
|
| 99 |
row = df_filtered.iloc[idx]
|
| 100 |
+
st.markdown(f"**📅 Date:** {row['status_date'].date()} | **🔢 Score:** {score:.2f}")
|
| 101 |
+
st.markdown(f"**π Title:** {row['title']}")
|
| 102 |
+
st.markdown(f"**🏷️ Category:** {row['category_&_subcategory_standardized']} | **🎯 Goal:** {row['legislative_goal_standardized']}")
|
| 103 |
+
st.markdown(f"**🧠 Intent:** {row['intent_standardized']} | **⚖️ Stance:** {row['stance_standardized']}")
|
| 104 |
st.markdown(f"```{row['summary_insight'][:500]}```")
|
| 105 |
+
top_texts.append(row['summary_insight'])
|
| 106 |
+
|
| 107 |
+
st.subheader("🧠 RAG-Generated Summary:")
|
| 108 |
+
summary = rag_summarize(top_texts, summarizer, top_k=5)
|
| 109 |
+
st.success(summary)
|