tjl8 committed on
Commit
2845580
·
verified ·
1 Parent(s): 60bbe7d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +222 -82
app.py CHANGED
@@ -1,3 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # import streamlit as st
2
  # import pandas as pd
3
  # import re
@@ -7,7 +139,7 @@
7
  # from sklearn.feature_extraction.text import TfidfVectorizer
8
  # from datetime import datetime
9
 
10
- # # Load data
11
  # @st.cache_data
12
  # def load_data():
13
  # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
@@ -18,25 +150,22 @@
18
  # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
19
  # return df
20
 
21
- # # Load models
22
  # @st.cache_resource
23
  # def load_models():
24
  # embed_model = SentenceTransformer('all-MiniLM-L6-v2')
25
  # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
26
  # return embed_model, summarizer
27
 
28
- # # Compute embeddings
29
  # @st.cache_data
30
  # def compute_embeddings(texts, _model):
31
  # return _model.encode(texts, show_progress_bar=True)
32
 
33
- # # Semantic search
34
- # def semantic_search(query, embeddings, model, threshold=0.4):
35
  # query_embedding = model.encode([query])
36
  # sims = cosine_similarity(query_embedding, embeddings)[0]
37
  # return [(i, s) for i, s in enumerate(sims) if s > threshold]
38
 
39
- # # RAG summarization
40
  # def rag_summarize(texts, summarizer, top_k=5):
41
  # if not texts:
42
  # return "No relevant content to summarize."
@@ -50,7 +179,6 @@
50
  # out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
51
  # return out[0]['summary_text']
52
 
53
- # # Parse month/year
54
  # def extract_month_year(q):
55
  # month_map = {m: i for i, m in enumerate(
56
  # ["january", "february", "march", "april", "may", "june",
@@ -61,40 +189,39 @@
61
  # yr = int(ym.group()) if ym else None
62
  # return mon, yr
63
 
64
- # # Auto-detect category
65
- # def extract_category(q, cats):
66
- # ql = q.lower()
67
- # for cat in cats:
68
- # if pd.isna(cat): continue
69
- # if any(tok in ql for tok in cat.lower().split()):
70
- # return cat
71
- # return None
 
 
72
 
73
- # # Streamlit UI
74
  # st.set_page_config(page_title="IL Trends Q&A", layout="wide")
75
  # st.title("Illinois Legislative Trends Q&A")
 
76
 
77
  # df = load_data()
78
  # embed_model, summarizer = load_models()
79
 
80
- # query = st.text_input("Ask a question (e.g., ‘education in May 2024’):")
81
 
82
  # if query:
83
  # mon, yr = extract_month_year(query)
84
- # cats = df['category_&_subcategory_standardized'].unique()
85
- # cat = extract_category(query, cats)
86
 
87
- # df2 = df.copy()
88
- # if cat:
89
- # df2 = df2[df2['category_&_subcategory_standardized'] == cat]
90
- # st.info(f"Filtering by category: **{cat}**")
91
  # if yr:
92
  # df2 = df2[df2['status_date'].dt.year == yr]
93
  # if mon:
94
  # df2 = df2[df2['status_date'].dt.month == mon]
95
  # st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
96
  # else:
97
- # st.info(f" Filtering by year: **{yr}**")
98
 
99
  # if df2.empty:
100
  # st.warning("No matching records found.")
@@ -106,7 +233,7 @@
106
  # if not res:
107
  # st.warning("No relevant insights found.")
108
  # else:
109
- # st.subheader("Top Matching Insights")
110
  # collected = []
111
 
112
  # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
@@ -118,11 +245,11 @@
118
  # stance = row['stance_standardized']
119
  # trend_summary = row['llama_trend_summary'].strip()
120
 
121
- # st.markdown(f"- **Date:** {date} | **Score:** {score:.2f}")
122
- # st.markdown(f" - **Category:** {cat_std}")
123
- # st.markdown(f" - **Goal:** {goal}")
124
- # st.markdown(f" - **Intent:** {intent} | **Stance:** {stance}")
125
- # st.markdown(f" > **Trend Summary:** {trend_summary}")
126
 
127
  # collected.append(row['summary_insight'])
128
 
@@ -130,16 +257,19 @@
130
  # summary = rag_summarize(collected, summarizer)
131
  # st.success(summary)
132
 
 
133
  import streamlit as st
134
  import pandas as pd
135
  import re
 
136
  from sentence_transformers import SentenceTransformer
137
  from transformers import pipeline
138
  from sklearn.metrics.pairwise import cosine_similarity
139
  from sklearn.feature_extraction.text import TfidfVectorizer
140
  from datetime import datetime
 
141
 
142
- # loading data
143
  @st.cache_data
144
  def load_data():
145
  df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
@@ -150,22 +280,25 @@ def load_data():
150
  df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
151
  return df
152
 
 
153
  @st.cache_resource
154
  def load_models():
155
  embed_model = SentenceTransformer('all-MiniLM-L6-v2')
156
  summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
157
  return embed_model, summarizer
158
 
 
159
  @st.cache_data
160
  def compute_embeddings(texts, _model):
161
  return _model.encode(texts, show_progress_bar=True)
162
 
163
- def semantic_search(query, embeddings, model, threshold=0.5):
 
164
  query_embedding = model.encode([query])
165
  sims = cosine_similarity(query_embedding, embeddings)[0]
166
  return [(i, s) for i, s in enumerate(sims) if s > threshold]
167
 
168
-
169
  def rag_summarize(texts, summarizer, top_k=5):
170
  if not texts:
171
  return "No relevant content to summarize."
@@ -179,80 +312,87 @@ def rag_summarize(texts, summarizer, top_k=5):
179
  out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
180
  return out[0]['summary_text']
181
 
182
- def extract_month_year(q):
 
 
 
 
 
183
  month_map = {m: i for i, m in enumerate(
184
  ["january", "february", "march", "april", "may", "june",
185
  "july", "august", "september", "october", "november", "december"], 1)}
186
- ql = q.lower()
187
  mon = next((v for k, v in month_map.items() if k in ql), None)
188
- ym = re.search(r"(19|20)\d{2}", q)
189
  yr = int(ym.group()) if ym else None
190
- return mon, yr
191
 
192
- def extract_topic_match(query, df):
193
- query_lower = query.lower()
194
- matched_rows = df[
195
- df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
196
- df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
197
- df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
198
- df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
199
- ]
200
- return matched_rows
201
 
 
 
 
202
 
203
- st.set_page_config(page_title="IL Trends Q&A", layout="wide")
204
- st.title("Illinois Legislative Trends Q&A")
205
- st.markdown("Ask about trends in **topics** like education, higher education, etc!")
206
-
207
  df = load_data()
208
  embed_model, summarizer = load_models()
209
 
210
- query = st.text_input(" Ask a question (e.g., ‘trends in Higher education in 2024’):")
211
 
212
  if query:
213
- mon, yr = extract_month_year(query)
214
- df2 = extract_topic_match(query, df)
215
-
216
- if df2.empty:
217
- df2 = df
218
- if yr:
219
- df2 = df2[df2['status_date'].dt.year == yr]
220
- if mon:
221
- df2 = df2[df2['status_date'].dt.month == mon]
222
- st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
223
- else:
224
- st.info(f"Filtering by year: **{yr}**")
225
 
226
- if df2.empty:
227
- st.warning("No matching records found.")
228
  else:
229
- texts = df2['summary_insight'].tolist()
 
230
  embs = compute_embeddings(texts, _model=embed_model)
231
- res = semantic_search(query, embs, embed_model)
232
 
233
- if not res:
 
 
 
234
  st.warning("No relevant insights found.")
235
  else:
236
- st.subheader(" Top Matching Insights")
237
- collected = []
 
238
 
239
- for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
240
- row = df2.iloc[idx]
241
  date = row['status_date'].date()
242
- cat_std = row['category_&_subcategory_standardized']
243
  goal = row['legislative_goal_standardized']
244
  intent = row['intent_standardized']
245
  stance = row['stance_standardized']
246
  trend_summary = row['llama_trend_summary'].strip()
 
 
 
 
 
 
 
 
 
 
247
 
248
- st.markdown(f"- ** Date:** {date} | ** Score:** {score:.2f}")
249
- st.markdown(f" - ** Category:** {cat_std}")
250
- st.markdown(f" - ** Goal:** {goal}")
251
- st.markdown(f" - ** Intent:** {intent} | ** Stance:** {stance}")
252
- st.markdown(f" > ** Trend Summary:** {trend_summary}")
253
 
254
- collected.append(row['summary_insight'])
 
 
 
255
 
256
- st.subheader(" RAG-Generated Summary")
257
- summary = rag_summarize(collected, summarizer)
258
- st.success(summary)
 
1
+ # # import streamlit as st
2
+ # # import pandas as pd
3
+ # # import re
4
+ # # from sentence_transformers import SentenceTransformer
5
+ # # from transformers import pipeline
6
+ # # from sklearn.metrics.pairwise import cosine_similarity
7
+ # # from sklearn.feature_extraction.text import TfidfVectorizer
8
+ # # from datetime import datetime
9
+
10
+ # # # Load data
11
+ # # @st.cache_data
12
+ # # def load_data():
13
+ # # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
14
+ # # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
15
+ # # df = df.dropna(subset=['status_date'])
16
+ # # df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
17
+ # # df["llama_insight"] = df["llama_insight"].fillna("")
18
+ # # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
19
+ # # return df
20
+
21
+ # # # Load models
22
+ # # @st.cache_resource
23
+ # # def load_models():
24
+ # # embed_model = SentenceTransformer('all-MiniLM-L6-v2')
25
+ # # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
26
+ # # return embed_model, summarizer
27
+
28
+ # # # Compute embeddings
29
+ # # @st.cache_data
30
+ # # def compute_embeddings(texts, _model):
31
+ # # return _model.encode(texts, show_progress_bar=True)
32
+
33
+ # # # Semantic search
34
+ # # def semantic_search(query, embeddings, model, threshold=0.4):
35
+ # # query_embedding = model.encode([query])
36
+ # # sims = cosine_similarity(query_embedding, embeddings)[0]
37
+ # # return [(i, s) for i, s in enumerate(sims) if s > threshold]
38
+
39
+ # # # RAG summarization
40
+ # # def rag_summarize(texts, summarizer, top_k=5):
41
+ # # if not texts:
42
+ # # return "No relevant content to summarize."
43
+ # # vect = TfidfVectorizer()
44
+ # # m = vect.fit_transform(texts)
45
+ # # mean_vec = m.mean(axis=0).A
46
+ # # scores = cosine_similarity(mean_vec, m).flatten()
47
+ # # top_indices = scores.argsort()[::-1][:top_k]
48
+ # # ctx = "\n".join(texts[i] for i in top_indices)
49
+ # # prompt = "summarize: " + ctx[:1024]
50
+ # # out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
51
+ # # return out[0]['summary_text']
52
+
53
+ # # # Parse month/year
54
+ # # def extract_month_year(q):
55
+ # # month_map = {m: i for i, m in enumerate(
56
+ # # ["january", "february", "march", "april", "may", "june",
57
+ # # "july", "august", "september", "october", "november", "december"], 1)}
58
+ # # ql = q.lower()
59
+ # # mon = next((v for k, v in month_map.items() if k in ql), None)
60
+ # # ym = re.search(r"(19|20)\d{2}", q)
61
+ # # yr = int(ym.group()) if ym else None
62
+ # # return mon, yr
63
+
64
+ # # # Auto-detect category
65
+ # # def extract_category(q, cats):
66
+ # # ql = q.lower()
67
+ # # for cat in cats:
68
+ # # if pd.isna(cat): continue
69
+ # # if any(tok in ql for tok in cat.lower().split()):
70
+ # # return cat
71
+ # # return None
72
+
73
+ # # # Streamlit UI
74
+ # # st.set_page_config(page_title="IL Trends Q&A", layout="wide")
75
+ # # st.title("Illinois Legislative Trends Q&A")
76
+
77
+ # # df = load_data()
78
+ # # embed_model, summarizer = load_models()
79
+
80
+ # # query = st.text_input("Ask a question (e.g., ‘education in May 2024’):")
81
+
82
+ # # if query:
83
+ # # mon, yr = extract_month_year(query)
84
+ # # cats = df['category_&_subcategory_standardized'].unique()
85
+ # # cat = extract_category(query, cats)
86
+
87
+ # # df2 = df.copy()
88
+ # # if cat:
89
+ # # df2 = df2[df2['category_&_subcategory_standardized'] == cat]
90
+ # # st.info(f"Filtering by category: **{cat}**")
91
+ # # if yr:
92
+ # # df2 = df2[df2['status_date'].dt.year == yr]
93
+ # # if mon:
94
+ # # df2 = df2[df2['status_date'].dt.month == mon]
95
+ # # st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
96
+ # # else:
97
+ # # st.info(f" Filtering by year: **{yr}**")
98
+
99
+ # # if df2.empty:
100
+ # # st.warning("No matching records found.")
101
+ # # else:
102
+ # # texts = df2['summary_insight'].tolist()
103
+ # # embs = compute_embeddings(texts, _model=embed_model)
104
+ # # res = semantic_search(query, embs, embed_model)
105
+
106
+ # # if not res:
107
+ # # st.warning("No relevant insights found.")
108
+ # # else:
109
+ # # st.subheader("Top Matching Insights")
110
+ # # collected = []
111
+
112
+ # # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
113
+ # # row = df2.iloc[idx]
114
+ # # date = row['status_date'].date()
115
+ # # cat_std = row['category_&_subcategory_standardized']
116
+ # # goal = row['legislative_goal_standardized']
117
+ # # intent = row['intent_standardized']
118
+ # # stance = row['stance_standardized']
119
+ # # trend_summary = row['llama_trend_summary'].strip()
120
+
121
+ # # st.markdown(f"- **Date:** {date} | **Score:** {score:.2f}")
122
+ # # st.markdown(f" - **Category:** {cat_std}")
123
+ # # st.markdown(f" - **Goal:** {goal}")
124
+ # # st.markdown(f" - **Intent:** {intent} | **Stance:** {stance}")
125
+ # # st.markdown(f" > **Trend Summary:** {trend_summary}")
126
+
127
+ # # collected.append(row['summary_insight'])
128
+
129
+ # # st.subheader(" RAG-Generated Summary")
130
+ # # summary = rag_summarize(collected, summarizer)
131
+ # # st.success(summary)
132
+
133
  # import streamlit as st
134
  # import pandas as pd
135
  # import re
 
139
  # from sklearn.feature_extraction.text import TfidfVectorizer
140
  # from datetime import datetime
141
 
142
+ # # loading data
143
  # @st.cache_data
144
  # def load_data():
145
  # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
 
150
  # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
151
  # return df
152
 
 
153
  # @st.cache_resource
154
  # def load_models():
155
  # embed_model = SentenceTransformer('all-MiniLM-L6-v2')
156
  # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
157
  # return embed_model, summarizer
158
 
 
159
  # @st.cache_data
160
  # def compute_embeddings(texts, _model):
161
  # return _model.encode(texts, show_progress_bar=True)
162
 
163
+ # def semantic_search(query, embeddings, model, threshold=0.5):
 
164
  # query_embedding = model.encode([query])
165
  # sims = cosine_similarity(query_embedding, embeddings)[0]
166
  # return [(i, s) for i, s in enumerate(sims) if s > threshold]
167
 
168
+
169
  # def rag_summarize(texts, summarizer, top_k=5):
170
  # if not texts:
171
  # return "No relevant content to summarize."
 
179
  # out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
180
  # return out[0]['summary_text']
181
 
 
182
  # def extract_month_year(q):
183
  # month_map = {m: i for i, m in enumerate(
184
  # ["january", "february", "march", "april", "may", "june",
 
189
  # yr = int(ym.group()) if ym else None
190
  # return mon, yr
191
 
192
+ # def extract_topic_match(query, df):
193
+ # query_lower = query.lower()
194
+ # matched_rows = df[
195
+ # df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
196
+ # df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
197
+ # df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
198
+ # df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
199
+ # ]
200
+ # return matched_rows
201
+
202
 
 
203
  # st.set_page_config(page_title="IL Trends Q&A", layout="wide")
204
  # st.title("Illinois Legislative Trends Q&A")
205
+ # st.markdown("Ask about trends in **topics** like education, higher education, etc!")
206
 
207
  # df = load_data()
208
  # embed_model, summarizer = load_models()
209
 
210
+ # query = st.text_input(" Ask a question (e.g., ‘trends in Higher education in 2024’):")
211
 
212
  # if query:
213
  # mon, yr = extract_month_year(query)
214
+ # df2 = extract_topic_match(query, df)
 
215
 
216
+ # if df2.empty:
217
+ # df2 = df
 
 
218
  # if yr:
219
  # df2 = df2[df2['status_date'].dt.year == yr]
220
  # if mon:
221
  # df2 = df2[df2['status_date'].dt.month == mon]
222
  # st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
223
  # else:
224
+ # st.info(f"Filtering by year: **{yr}**")
225
 
226
  # if df2.empty:
227
  # st.warning("No matching records found.")
 
233
  # if not res:
234
  # st.warning("No relevant insights found.")
235
  # else:
236
+ # st.subheader(" Top Matching Insights")
237
  # collected = []
238
 
239
  # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
 
245
  # stance = row['stance_standardized']
246
  # trend_summary = row['llama_trend_summary'].strip()
247
 
248
+ # st.markdown(f"- ** Date:** {date} | ** Score:** {score:.2f}")
249
+ # st.markdown(f" - ** Category:** {cat_std}")
250
+ # st.markdown(f" - ** Goal:** {goal}")
251
+ # st.markdown(f" - ** Intent:** {intent} | ** Stance:** {stance}")
252
+ # st.markdown(f" > ** Trend Summary:** {trend_summary}")
253
 
254
  # collected.append(row['summary_insight'])
255
 
 
257
  # summary = rag_summarize(collected, summarizer)
258
  # st.success(summary)
259
 
260
+
261
  import streamlit as st
262
  import pandas as pd
263
  import re
264
+ import dateparser # for natural language date parsing
265
  from sentence_transformers import SentenceTransformer
266
  from transformers import pipeline
267
  from sklearn.metrics.pairwise import cosine_similarity
268
  from sklearn.feature_extraction.text import TfidfVectorizer
269
  from datetime import datetime
270
+ from io import StringIO
271
 
272
+ # Load data
273
  @st.cache_data
274
  def load_data():
275
  df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
 
280
  df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
281
  return df
282
 
283
# Load models
@st.cache_resource
def load_models():
    """Create the two models the app relies on.

    Returns:
        A ``(embed_model, summarizer)`` tuple: the ``all-MiniLM-L6-v2``
        SentenceTransformer used for embeddings, and a Hugging Face
        ``summarization`` pipeline that uses ``t5-small`` for both the
        model and the tokenizer.
    """
    # The same checkpoint name serves as both model and tokenizer.
    summarization_checkpoint = "t5-small"
    return (
        SentenceTransformer('all-MiniLM-L6-v2'),
        pipeline(
            "summarization",
            model=summarization_checkpoint,
            tokenizer=summarization_checkpoint,
        ),
    )
289
 
290
# Compute embeddings
@st.cache_data
def compute_embeddings(texts, _model):
    """Encode ``texts`` with ``_model`` and return the resulting embeddings.

    Args:
        texts: The strings to embed.
        _model: Object exposing ``encode(texts, show_progress_bar=...)``.
            NOTE(review): the leading underscore presumably follows
            Streamlit's convention for excluding an argument from the cache
            key -- confirm against the ``st.cache_data`` documentation.

    Returns:
        Whatever ``_model.encode`` returns for ``texts``.
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors
294
 
295
# Semantic search
def semantic_search(query, embeddings, model, threshold=0.7):
    """Return the entries of ``embeddings`` that are similar enough to ``query``.

    Args:
        query: Free-text query to embed with ``model``.
        embeddings: Sequence/array of pre-computed embeddings, one per
            candidate text.
        model: Object exposing ``encode(list_of_str)`` (the embedding model).
        threshold: Minimum cosine similarity (exclusive) for a candidate to
            be kept.

    Returns:
        A list of ``(index, similarity)`` pairs, in the original candidate
        order, for every candidate whose similarity exceeds ``threshold``.
        An empty list when there are no candidates.
    """
    # cosine_similarity raises on an empty candidate matrix; an empty search
    # space simply has no matches, so answer without touching the model.
    if len(embeddings) == 0:
        return []

    query_embedding = model.encode([query])
    # One query row was encoded, so row 0 holds its similarity to each candidate.
    sims = cosine_similarity(query_embedding, embeddings)[0]
    return [(i, s) for i, s in enumerate(sims) if s > threshold]
300
 
301
+ # RAG summarization
302
  def rag_summarize(texts, summarizer, top_k=5):
303
  if not texts:
304
  return "No relevant content to summarize."
 
312
  out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
313
  return out[0]['summary_text']
314
 
315
# Date parsing: explicit years / month names first, dateparser as a fallback
def parse_date_from_query(query):
    """Extract a ``(year, month)`` filter from a free-text query.

    Explicit tokens are looked for first: a four-digit year starting with
    19 or 20, and a full English month name, both matched as whole words.
    Only when neither is present is the whole query handed to
    ``dateparser.parse`` so relative expressions can still be resolved.

    Trying ``dateparser`` first is avoided on purpose: it returns a complete
    datetime, so a year-only input would come back with a month attached
    and the caller would wrongly filter on that month as well.

    Args:
        query: The user's question.

    Returns:
        ``(year, month)`` where each element is an ``int`` or ``None`` when
        that part could not be determined.
    """
    month_names = (
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
    )
    ql = query.lower()

    # Whole-word match so e.g. "marching" is not read as March.
    # NOTE: the modal verb "may" is still indistinguishable from the month.
    mon = next(
        (number for number, name in enumerate(month_names, 1)
         if re.search(rf"\b{name}\b", ql)),
        None,
    )

    # Word boundaries keep digits embedded in longer numbers from being
    # mistaken for a year.
    ym = re.search(r"\b(?:19|20)\d{2}\b", query)
    yr = int(ym.group()) if ym else None

    if yr is not None or mon is not None:
        return yr, mon

    # Fallback for queries without an explicit year or month name.
    dt = dateparser.parse(query, settings={'PREFER_DATES_FROM': 'past'})
    if dt:
        return dt.year, dt.month
    return None, None
329
 
330
# Simple keyword highlighter
def highlight_keywords(text, keywords):
    """Wrap every case-insensitive occurrence of a keyword in Markdown bold.

    All keywords are combined into one regular expression and applied in a
    single pass. Substituting keyword by keyword would re-match text that an
    earlier substitution had already wrapped (duplicate or overlapping
    keywords would produce ``****word****``), and an empty keyword would
    insert markers between every character.

    Args:
        text: The text to decorate.
        keywords: Iterable of literal strings to highlight; empty strings
            and duplicates are ignored.

    Returns:
        ``text`` with each match surrounded by ``**``; the original casing
        of the matched text is preserved.
    """
    # Longest first so that, where keywords overlap, the alternation picks
    # the most specific one; the secondary key keeps the order deterministic.
    unique_keywords = sorted(
        {kw for kw in keywords if kw},
        key=lambda kw: (-len(kw), kw),
    )
    if not unique_keywords:
        return text

    pattern = re.compile(
        "|".join(re.escape(kw) for kw in unique_keywords),
        re.IGNORECASE,
    )
    return pattern.sub(lambda match: f"**{match.group(0)}**", text)
 
 
 
 
335
 
336
# Streamlit UI
st.set_page_config(page_title="IL Trends Q&A Enhanced", layout="wide")
st.title("Illinois Legislative Trends Q&A with Extras")

# Dataset and models come from the decorated loader functions.
df = load_data()
embed_model, summarizer = load_models()

query = st.text_input("Ask a question (e.g., ‘education in May 2024’, ‘Opposed bills on healthcare’):")

if query:
    year, month = parse_date_from_query(query)

    # Narrow the data to the requested period; each part of the date is
    # applied only when it was detected in the query.
    subset = df.copy()
    for date_part, wanted in (("year", year), ("month", month)):
        if wanted:
            subset = subset[getattr(subset['status_date'].dt, date_part) == wanted]

    if not subset.empty:
        # Embed only the rows that survived the date filter.
        corpus = subset['summary_insight'].tolist()
        corpus_embeddings = compute_embeddings(corpus, _model=embed_model)

        # Keep only candidates above the similarity threshold.
        hits = semantic_search(query, corpus_embeddings, embed_model, threshold=0.7)

        if hits:
            st.subheader("Top Matching Insights")
            shown_texts = []
            keywords = query.lower().split()

            # Best five matches, highest similarity first.
            best_hits = sorted(hits, key=lambda hit: hit[1], reverse=True)[:5]
            for position, similarity in best_hits:
                # `position` indexes into `subset`, whose row order matches `corpus`.
                record = subset.iloc[position]
                insight = record['summary_insight']

                # Build every line before rendering so a bad row fails
                # before anything is written to the page.
                card_lines = (
                    f"- **Date:** {record['status_date'].date()} | **Score:** {similarity:.2f}",
                    f" - **Category:** {record['category_&_subcategory_standardized']}",
                    f" - **Goal:** {record['legislative_goal_standardized']}",
                    f" - **Intent:** {record['intent_standardized']} | **Stance:** {record['stance_standardized']}",
                    f" > **Trend Summary:** {record['llama_trend_summary'].strip()}",
                    f" > **Summary Insight:** {highlight_keywords(insight, keywords)}",
                )
                for card_line in card_lines:
                    st.markdown(card_line)

                shown_texts.append(insight)

            # Summarize the displayed matches.
            st.subheader("RAG-Generated Summary")
            st.success(rag_summarize(shown_texts, summarizer))
        else:
            st.warning("No relevant insights found.")
    else:
        st.warning("No data found for the specified time period.")
398