tjl8 committed on
Commit
c134681
·
verified ·
1 Parent(s): e587b94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -57
app.py CHANGED
@@ -130,7 +130,6 @@
130
  # summary = rag_summarize(collected, summarizer)
131
  # st.success(summary)
132
 
133
-
134
  import streamlit as st
135
  import pandas as pd
136
  import re
@@ -140,36 +139,36 @@ from sklearn.metrics.pairwise import cosine_similarity
140
  from sklearn.feature_extraction.text import TfidfVectorizer
141
  from datetime import datetime
142
 
143
# Load and preprocess the dataset
@st.cache_data
def load_data():
    """Read the Illinois insights CSV and prepare it for searching.

    Returns:
        A DataFrame in which ``status_date`` is parsed as a day-month-year
        date (rows whose date cannot be parsed are dropped), the two LLaMA
        text columns have nulls replaced by empty strings, and a combined
        ``summary_insight`` column joins them with a newline.
    """
    frame = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")

    # Unparseable dates become NaT and are then discarded.
    frame['status_date'] = pd.to_datetime(
        frame['status_date'], format='%d-%m-%Y', errors='coerce'
    )
    frame = frame.dropna(subset=['status_date'])

    # Blank out missing text so the concatenation below never yields NaN.
    for text_column in ("llama_trend_summary", "llama_insight"):
        frame[text_column] = frame[text_column].fillna("")

    frame["summary_insight"] = (
        frame["llama_trend_summary"] + "\n" + frame["llama_insight"]
    )
    return frame
153
 
154
# Load sentence embedding model + summarization model
@st.cache_resource
def load_models():
    """Instantiate the two models used by the app.

    Returns:
        A ``(embed_model, summarizer)`` tuple: the ``all-MiniLM-L6-v2``
        sentence encoder used for semantic search, and a ``t5-small``
        summarization pipeline used for the final summary.
    """
    t5_name = "t5-small"
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    abstractive = pipeline("summarization", model=t5_name, tokenizer=t5_name)
    return encoder, abstractive
160
 
161
# Generate embeddings from a list of texts
@st.cache_data
def compute_embeddings(texts, _model):
    """Encode ``texts`` with ``_model`` and return the resulting embeddings.

    Args:
        texts: The strings to embed.
        _model: Object exposing ``encode(texts, show_progress_bar=...)``.
            NOTE(review): the leading underscore is presumably there so that
            Streamlit's cache does not try to hash the model - confirm
            against the Streamlit caching documentation.
    """
    encode = _model.encode
    return encode(texts, show_progress_bar=True)
165
 
166
# Perform semantic search using cosine similarity
def semantic_search(query, embeddings, model, threshold=0.7):
    """Find the embeddings that are similar enough to ``query``.

    Args:
        query: The user's question as a string.
        embeddings: Pre-computed embeddings, one row per candidate text.
        model: Object exposing ``encode(list_of_str)``; used to embed the query.
        threshold: Minimum cosine similarity (exclusive) for a hit.

    Returns:
        A list of ``(position, similarity)`` tuples, in input order, for
        every row whose similarity to the query is strictly above
        ``threshold``. Empty when there is nothing to search.
    """
    # Nothing to compare against: report "no hits" instead of handing an
    # empty collection to cosine_similarity, whose input validation rejects it.
    if len(embeddings) == 0:
        return []

    query_embedding = model.encode([query])
    # cosine_similarity returns a 1 x N matrix; take its single row.
    sims = cosine_similarity(query_embedding, embeddings)[0]
    return [(i, s) for i, s in enumerate(sims) if s > threshold]
171
 
172
- # Retrieve top matching texts and summarize them (RAG-like approach)
173
  def rag_summarize(texts, summarizer, top_k=5):
174
  if not texts:
175
  return "No relevant content to summarize."
@@ -177,13 +176,13 @@ def rag_summarize(texts, summarizer, top_k=5):
177
  m = vect.fit_transform(texts)
178
  mean_vec = m.mean(axis=0).A
179
  scores = cosine_similarity(mean_vec, m).flatten()
180
- top_indices = scores.argsort()[::-1][:top_k] # Pick top-k similar insights
181
  ctx = "\n".join(texts[i] for i in top_indices)
182
  prompt = "summarize: " + ctx[:1024]
183
  out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
184
  return out[0]['summary_text']
185
 
186
- # Extract month and year from query (e.g., "May 2024")
187
  def extract_month_year(q):
188
  month_map = {m: i for i, m in enumerate(
189
  ["january", "february", "march", "april", "may", "june",
@@ -194,69 +193,58 @@ def extract_month_year(q):
194
  yr = int(ym.group()) if ym else None
195
  return mon, yr
196
 
197
# Try to detect a category mentioned in the query
def extract_category(q, cats):
    """Return the first category that shares a meaningful word with the query.

    A category matches when one of its words (ignoring punctuation and common
    connectives such as "and" / "of") appears in the query at the start of a
    word. Matching on raw substrings, as a naive implementation would, lets
    tokens like "&" or "and" match almost every query.

    Args:
        q: The user's question.
        cats: Iterable of category labels; null entries are skipped.

    Returns:
        The matching category exactly as it appears in ``cats``, or ``None``
        when no category matches.
    """
    ignored = {
        "a", "an", "and", "for", "in", "of", "on", "or", "the", "to", "with",
    }
    ql = q.lower()
    for cat in cats:
        if pd.isna(cat):
            continue
        # Keep only alphanumeric words so punctuation never counts as a hit.
        words = [
            w for w in re.findall(r"[a-z0-9]+", str(cat).lower())
            if w not in ignored
        ]
        # Anchor at a word boundary: "school" matches "schools", but a short
        # token can no longer match in the middle of an unrelated word.
        if any(re.search(r"\b" + re.escape(w), ql) for w in words):
            return cat
    return None
205
-
206
- # ---- Streamlit Interface ---- #
 
 
207
  st.set_page_config(page_title="IL Trends Q&A", layout="wide")
208
  st.title("Illinois Legislative Trends Q&A")
 
209
 
210
- # Load the dataset and models
211
  df = load_data()
212
  embed_model, summarizer = load_models()
213
 
214
- # User enters question
215
- query = st.text_input("Ask a question (e.g., ‘trends in higher education in May 2024’):")
216
 
217
  if query:
218
- # Extract date or category from user question
219
  mon, yr = extract_month_year(query)
220
- cats = df['category_&_subcategory_standardized'].unique()
221
- cat = extract_category(query, cats)
222
 
223
- df2 = df.copy()
224
-
225
- # Filter if query includes "opposed"
226
- if "opposed" in query.lower():
227
- df2 = df2[df2['stance_standardized'].str.lower() == "opposed"]
228
- st.info("🔎 Filtering for bills where stance is **opposed**")
229
-
230
- # Filter by detected category
231
- if cat:
232
- df2 = df2[df2['category_&_subcategory_standardized'] == cat]
233
- st.info(f"🔎 Filtering by category: **{cat}**")
234
 
235
- # Filter by year/month if detected
236
  if yr:
237
  df2 = df2[df2['status_date'].dt.year == yr]
238
  if mon:
239
  df2 = df2[df2['status_date'].dt.month == mon]
240
- st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
241
  else:
242
- st.info(f" Filtering by year: **{yr}**")
243
 
244
- # If no data after filtering
245
  if df2.empty:
246
  st.warning("No matching records found.")
247
  else:
248
- # Generate semantic matches
249
  texts = df2['summary_insight'].tolist()
250
  embs = compute_embeddings(texts, _model=embed_model)
251
- res = semantic_search(query, embs, embed_model) # Uses threshold=0.7
252
 
253
  if not res:
254
  st.warning("No relevant insights found.")
255
  else:
256
- st.subheader("Top Matching Insights")
257
  collected = []
258
 
259
- # Display top matches with metadata
260
  for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
261
  row = df2.iloc[idx]
262
  date = row['status_date'].date()
@@ -266,15 +254,15 @@ if query:
266
  stance = row['stance_standardized']
267
  trend_summary = row['llama_trend_summary'].strip()
268
 
269
- st.markdown(f"- **Date:** {date} | **Score:** {score:.2f}")
270
- st.markdown(f" - **Category:** {cat_std}")
271
- st.markdown(f" - **Goal:** {goal}")
272
- st.markdown(f" - **Intent:** {intent} | **Stance:** {stance}")
273
- st.markdown(f" > **Trend Summary:** {trend_summary}")
274
 
275
  collected.append(row['summary_insight'])
276
 
277
- # RAG-generated summary from top matching insights
278
- st.subheader("RAG-Generated Summary")
279
  summary = rag_summarize(collected, summarizer)
280
  st.success(summary)
 
130
  # summary = rag_summarize(collected, summarizer)
131
  # st.success(summary)
132
 
 
133
  import streamlit as st
134
  import pandas as pd
135
  import re
 
139
  from sklearn.feature_extraction.text import TfidfVectorizer
140
  from datetime import datetime
141
 
142
# ------------------ Load Data ------------------ #
@st.cache_data
def load_data():
    """Load the Illinois insights CSV and derive the searchable text column.

    Returns:
        A DataFrame with ``status_date`` parsed from day-month-year strings
        (rows with unparseable dates removed), nulls in the two LLaMA text
        columns replaced by ``""``, and ``summary_insight`` holding the trend
        summary and the insight joined by a newline.
    """
    raw = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")

    # errors='coerce' turns bad dates into NaT so dropna can discard them.
    parsed_dates = pd.to_datetime(
        raw['status_date'], format='%d-%m-%Y', errors='coerce'
    )
    dated = raw.assign(status_date=parsed_dates).dropna(subset=['status_date'])

    trend = dated["llama_trend_summary"].fillna("")
    insight = dated["llama_insight"].fillna("")
    return dated.assign(
        llama_trend_summary=trend,
        llama_insight=insight,
        summary_insight=trend + "\n" + insight,
    )
152
 
153
# ------------------ Load Models ------------------ #
@st.cache_resource
def load_models():
    """Build the embedding model and the summarization pipeline.

    Returns:
        ``(embed_model, summarizer)``: an ``all-MiniLM-L6-v2``
        SentenceTransformer and a ``t5-small`` summarization pipeline.
    """
    return (
        SentenceTransformer('all-MiniLM-L6-v2'),
        pipeline("summarization", model="t5-small", tokenizer="t5-small"),
    )
159
 
160
# ------------------ Compute Embeddings ------------------ #
@st.cache_data
def compute_embeddings(texts, _model):
    """Return the embeddings ``_model`` produces for ``texts``.

    Args:
        texts: The strings to embed.
        _model: Encoder exposing ``encode(texts, show_progress_bar=...)``.
            NOTE(review): the underscore prefix presumably keeps Streamlit's
            cache from hashing the model - confirm in the Streamlit docs.
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors
164
 
165
# ------------------ Semantic Search ------------------ #
def semantic_search(query, embeddings, model, threshold=0.5):
    """Find the embeddings that are similar enough to ``query``.

    Args:
        query: The user's question as a string.
        embeddings: Pre-computed embeddings, one row per candidate text.
        model: Object exposing ``encode(list_of_str)``; used to embed the query.
        threshold: Minimum cosine similarity (exclusive) for a hit.
            Defaults to 0.5.

    Returns:
        A list of ``(position, similarity)`` tuples, in input order, for
        every row whose similarity to the query is strictly above
        ``threshold``. Empty when there is nothing to search.
    """
    # Nothing to compare against: report "no hits" instead of handing an
    # empty collection to cosine_similarity, whose input validation rejects it.
    if len(embeddings) == 0:
        return []

    query_embedding = model.encode([query])
    # cosine_similarity returns a 1 x N matrix; take its single row.
    sims = cosine_similarity(query_embedding, embeddings)[0]
    return [(i, s) for i, s in enumerate(sims) if s > threshold]
170
 
171
+ # ------------------ RAG Summarizer ------------------ #
172
  def rag_summarize(texts, summarizer, top_k=5):
173
  if not texts:
174
  return "No relevant content to summarize."
 
176
  m = vect.fit_transform(texts)
177
  mean_vec = m.mean(axis=0).A
178
  scores = cosine_similarity(mean_vec, m).flatten()
179
+ top_indices = scores.argsort()[::-1][:top_k]
180
  ctx = "\n".join(texts[i] for i in top_indices)
181
  prompt = "summarize: " + ctx[:1024]
182
  out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
183
  return out[0]['summary_text']
184
 
185
+ # ------------------ Extract Month/Year from Query ------------------ #
186
  def extract_month_year(q):
187
  month_map = {m: i for i, m in enumerate(
188
  ["january", "february", "march", "april", "may", "june",
 
193
  yr = int(ym.group()) if ym else None
194
  return mon, yr
195
 
196
# ------------------ Topic-Based Matching ------------------ #
def extract_topic_match(query, df):
    """Return the rows of ``df`` whose topic columns contain the query text.

    The whole query (trimmed, lower-cased) is looked up as a literal,
    case-insensitive substring in the category, intent, legislative-goal and
    policy-impact-area columns; a row is kept when any of them contains it.
    Matching is literal rather than regex-based, so user input containing
    characters such as ``(`` or ``+`` cannot raise a pattern error.

    NOTE(review): because the entire query is matched as one substring, a
    question such as "higher education in 2024" only matches rows containing
    that exact phrase; the caller falls back to the full dataset otherwise.

    Args:
        query: The user's question.
        df: DataFrame containing the four ``*_standardized`` topic columns.

    Returns:
        The subset of ``df`` (original index preserved) with at least one
        matching topic column; empty when nothing matches.
    """
    needle = query.strip().lower()
    topic_columns = (
        'category_&_subcategory_standardized',
        'intent_standardized',
        'legislative_goal_standardized',
        'policy_impact_areas_standardized',
    )

    matches = pd.Series(False, index=df.index, dtype=bool)
    for column in topic_columns:
        # Nulls become "" and everything is coerced to text so .str is safe.
        haystack = df[column].fillna('').astype(str).str.lower()
        matches = matches | haystack.str.contains(needle, regex=False)
    return df[matches]
206
+
207
+ # ------------------ Streamlit UI ------------------ #
208
  st.set_page_config(page_title="IL Trends Q&A", layout="wide")
209
  st.title("Illinois Legislative Trends Q&A")
210
+ st.markdown("Ask about **topics** like education, housing, mental health, higher education, etc.\nAlso supports filtering by **month/year**!")
211
 
 
212
  df = load_data()
213
  embed_model, summarizer = load_models()
214
 
215
+ query = st.text_input("🔍 Ask a question (e.g., ‘Higher education in 2024’):")
 
216
 
217
  if query:
218
+ # Extract filters
219
  mon, yr = extract_month_year(query)
220
+ df2 = extract_topic_match(query, df)
 
221
 
222
+ # Fallback to full dataset if nothing found on topic
223
+ if df2.empty:
224
+ df2 = df
 
 
 
 
 
 
 
 
225
 
226
+ # Apply year/month filters
227
  if yr:
228
  df2 = df2[df2['status_date'].dt.year == yr]
229
  if mon:
230
  df2 = df2[df2['status_date'].dt.month == mon]
231
+ st.info(f"🔎 Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
232
  else:
233
+ st.info(f"🔎 Filtering by year: **{yr}**")
234
 
 
235
  if df2.empty:
236
  st.warning("No matching records found.")
237
  else:
 
238
  texts = df2['summary_insight'].tolist()
239
  embs = compute_embeddings(texts, _model=embed_model)
240
+ res = semantic_search(query, embs, embed_model)
241
 
242
  if not res:
243
  st.warning("No relevant insights found.")
244
  else:
245
+ st.subheader(" Top Matching Insights")
246
  collected = []
247
 
 
248
  for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
249
  row = df2.iloc[idx]
250
  date = row['status_date'].date()
 
254
  stance = row['stance_standardized']
255
  trend_summary = row['llama_trend_summary'].strip()
256
 
257
+ st.markdown(f"- ** Date:** {date} | **🔗 Score:** {score:.2f}")
258
+ st.markdown(f" - ** Category:** {cat_std}")
259
+ st.markdown(f" - ** Goal:** {goal}")
260
+ st.markdown(f" - ** Intent:** {intent} | **⚖️ Stance:** {stance}")
261
+ st.markdown(f" > ** Trend Summary:** {trend_summary}")
262
 
263
  collected.append(row['summary_insight'])
264
 
265
+ # RAG Summary
266
+ st.subheader(" RAG-Generated Summary")
267
  summary = rag_summarize(collected, summarizer)
268
  st.success(summary)