tjl8 committed on
Commit
e413cee
·
verified ·
1 Parent(s): b37c858

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +174 -12
app.py CHANGED
@@ -1,3 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import re
@@ -21,7 +166,11 @@ def load_data():
21
  df = df.dropna(subset=['status_date'])
22
  df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
23
  df["llama_insight"] = df["llama_insight"].fillna("")
24
- df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
 
 
 
 
25
  return df
26
 
27
  @st.cache_resource
@@ -66,6 +215,7 @@ def extract_topic_match(query, df):
66
  query_lower = query.lower()
67
  return df[
68
  df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
 
69
  df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
70
  df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
71
  df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
@@ -90,21 +240,29 @@ if query:
90
  df2 = df2[df2['status_date'].dt.year == yr]
91
  if mon:
92
  df2 = df2[df2['status_date'].dt.month == mon]
93
- st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
94
  else:
95
- st.info(f" Filtering by year: **{yr}**")
96
 
97
  if df2.empty:
98
  st.warning("No matching records found.")
99
  else:
100
- texts = df2['summary_insight'].tolist()
 
 
 
 
 
 
 
 
101
  embs = compute_embeddings(texts, _model=embed_model)
102
  res = semantic_search(query, embs, embed_model, threshold=0.5)
103
 
104
  if not res:
105
  st.warning("No relevant insights found.")
106
  else:
107
- st.subheader(" Top Matching Insights")
108
  collected = []
109
 
110
  for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
@@ -120,7 +278,6 @@ if query:
120
  stance = row['Stance']
121
  description = row['description']
122
  summary = row['summary']
123
-
124
  trend = clean_text(row['llama_trend_summary'])
125
  insight = clean_text(row['llama_insight'])
126
 
@@ -128,19 +285,24 @@ if query:
128
  st.markdown(f"**Category:** {cat_std}")
129
  st.markdown(f"**Goal:** {goal}")
130
  st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
131
- st.markdown(f"**Policy Impacy Area:** {impact}")
132
  st.markdown(f"**Key Provision:** {provision}")
133
  st.markdown(f"**Description:** {description}")
134
  st.markdown(f"**Summary:** {summary}")
135
- st.markdown(f"Trend Summary:{trend}")
136
- st.markdown(f"Actionable Insight:{insight}")
137
- st.markdown(f"[View Full Bill Text]({full_url})\n")
138
  st.divider()
139
 
140
- collected.append(row['summary_insight'])
 
 
 
 
 
 
141
 
142
  st.subheader("RAG-Generated Overall Summary")
143
  summary = rag_summarize(collected, summarizer)
144
  st.success(summary)
145
 
146
-
 
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # import re
4
+ # from sentence_transformers import SentenceTransformer
5
+ # from transformers import pipeline
6
+ # from sklearn.metrics.pairwise import cosine_similarity
7
+ # from sklearn.feature_extraction.text import TfidfVectorizer
8
+ # from datetime import datetime
9
+
10
+ # def clean_text(text):
11
+ # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
12
+ # text = re.sub(r"(?i)let me know if you'd like.*", "", text)
13
+ # text = re.sub(r"(?i)trend summary[:]*", "", text)
14
+ # text = re.sub(r"(?i)actionable insight[:]*", "", text)
15
+ # return text.strip()
16
+
17
+ # @st.cache_data
18
+ # def load_data():
19
+ # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
20
+ # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
21
+ # df = df.dropna(subset=['status_date'])
22
+ # df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
23
+ # df["llama_insight"] = df["llama_insight"].fillna("")
24
+ # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
25
+ # return df
26
+
27
+ # @st.cache_resource
28
+ # def load_models():
29
+ # embed_model = SentenceTransformer('all-MiniLM-L6-v2')
30
+ # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
31
+ # return embed_model, summarizer
32
+
33
+ # @st.cache_data
34
+ # def compute_embeddings(texts, _model):
35
+ # return _model.encode(texts, show_progress_bar=True)
36
+
37
+ # def semantic_search(query, embeddings, model, threshold=0.5):
38
+ # query_embedding = model.encode([query])
39
+ # sims = cosine_similarity(query_embedding, embeddings)[0]
40
+ # return [(i, s) for i, s in enumerate(sims) if s > threshold]
41
+
42
+ # def rag_summarize(texts, summarizer, top_k=5):
43
+ # if not texts:
44
+ # return "No relevant content to summarize."
45
+ # vect = TfidfVectorizer()
46
+ # m = vect.fit_transform(texts)
47
+ # mean_vec = m.mean(axis=0).A
48
+ # scores = cosine_similarity(mean_vec, m).flatten()
49
+ # top_indices = scores.argsort()[::-1][:top_k]
50
+ # ctx = "\n".join(texts[i] for i in top_indices)
51
+ # prompt = "summarize: " + ctx[:1024]
52
+ # out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
53
+ # return out[0]['summary_text']
54
+
55
+ # def extract_month_year(q):
56
+ # month_map = {m: i for i, m in enumerate(
57
+ # ["january", "february", "march", "april", "may", "june",
58
+ # "july", "august", "september", "october", "november", "december"], 1)}
59
+ # ql = q.lower()
60
+ # mon = next((v for k, v in month_map.items() if k in ql), None)
61
+ # ym = re.search(r"(19|20)\d{2}", q)
62
+ # yr = int(ym.group()) if ym else None
63
+ # return mon, yr
64
+
65
+ # def extract_topic_match(query, df):
66
+ # query_lower = query.lower()
67
+ # return df[
68
+ # df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
69
+ # df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
70
+ # df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
71
+ # df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
72
+ # ]
73
+
74
+ # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
75
+ # st.title("Illinois Legislative Trends Q&A")
76
+ # st.markdown("Ask about trends in topics like higher education, funding, etc.")
77
+
78
+ # df = load_data()
79
+ # embed_model, summarizer = load_models()
80
+
81
+ # query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
82
+
83
+ # if query:
84
+ # mon, yr = extract_month_year(query)
85
+ # df2 = extract_topic_match(query, df)
86
+
87
+ # if df2.empty:
88
+ # df2 = df
89
+ # if yr:
90
+ # df2 = df2[df2['status_date'].dt.year == yr]
91
+ # if mon:
92
+ # df2 = df2[df2['status_date'].dt.month == mon]
93
+ # st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
94
+ # else:
95
+ # st.info(f" Filtering by year: **{yr}**")
96
+
97
+ # if df2.empty:
98
+ # st.warning("No matching records found.")
99
+ # else:
100
+ # texts = df2['summary_insight'].tolist()
101
+ # embs = compute_embeddings(texts, _model=embed_model)
102
+ # res = semantic_search(query, embs, embed_model, threshold=0.5)
103
+
104
+ # if not res:
105
+ # st.warning("No relevant insights found.")
106
+ # else:
107
+ # st.subheader(" Top Matching Insights")
108
+ # collected = []
109
+
110
+ # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
111
+ # row = df2.iloc[idx]
112
+ # date = row['status_date'].date()
113
+ # bill_number = row['bill_number']
114
+ # full_url = row['url']
115
+ # cat_std = row['Category & Subcategory']
116
+ # goal = row['Legislative Goal']
117
+ # impact = row['Policy Impact Areas']
118
+ # provision = row['Key Provisions']
119
+ # intent = row['Intent']
120
+ # stance = row['Stance']
121
+ # description = row['description']
122
+ # summary = row['summary']
123
+
124
+ # trend = clean_text(row['llama_trend_summary'])
125
+ # insight = clean_text(row['llama_insight'])
126
+
127
+ # st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
128
+ # st.markdown(f"**Category:** {cat_std}")
129
+ # st.markdown(f"**Goal:** {goal}")
130
+ # st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
131
+ # st.markdown(f"**Policy Impacy Area:** {impact}")
132
+ # st.markdown(f"**Key Provision:** {provision}")
133
+ # st.markdown(f"**Description:** {description}")
134
+ # st.markdown(f"**Summary:** {summary}")
135
+ # st.markdown(f"Trend Summary:{trend}")
136
+ # st.markdown(f"Actionable Insight:{insight}")
137
+ # st.markdown(f"[View Full Bill Text]({full_url})\n")
138
+ # st.divider()
139
+
140
+ # collected.append(row['summary_insight'])
141
+
142
+ # st.subheader("RAG-Generated Overall Summary")
143
+ # summary = rag_summarize(collected, summarizer)
144
+ # st.success(summary)
145
+
146
  import streamlit as st
147
  import pandas as pd
148
  import re
 
166
  df = df.dropna(subset=['status_date'])
167
  df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
168
  df["llama_insight"] = df["llama_insight"].fillna("")
169
+ df["description"] = df["description"].fillna("")
170
+ df["summary"] = df["summary"].fillna("")
171
+ df["Policy Impact Areas"] = df["Policy Impact Areas"].fillna("")
172
+ df["Key Provisions"] = df["Key Provisions"].fillna("")
173
+ # Optional: Add more preprocessing if needed
174
  return df
175
 
176
  @st.cache_resource
 
215
  query_lower = query.lower()
216
  return df[
217
  df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
218
+ df['category_granular_standardized'].fillna('').str.lower().str.contains(query_lower) |
219
  df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
220
  df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
221
  df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
 
240
  df2 = df2[df2['status_date'].dt.year == yr]
241
  if mon:
242
  df2 = df2[df2['status_date'].dt.month == mon]
243
+ st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
244
  else:
245
+ st.info(f"Filtering by year: **{yr}**")
246
 
247
  if df2.empty:
248
  st.warning("No matching records found.")
249
  else:
250
+ texts = df2.apply(lambda row: "\n".join([
251
+ clean_text(str(row.get('llama_trend_summary', ''))),
252
+ clean_text(str(row.get('llama_insight', ''))),
253
+ str(row.get('summary', '')),
254
+ str(row.get('description', '')),
255
+ str(row.get('Policy Impact Areas', '')),
256
+ str(row.get('Key Provisions', ''))
257
+ ]), axis=1).tolist()
258
+
259
  embs = compute_embeddings(texts, _model=embed_model)
260
  res = semantic_search(query, embs, embed_model, threshold=0.5)
261
 
262
  if not res:
263
  st.warning("No relevant insights found.")
264
  else:
265
+ st.subheader("Top Matching Insights")
266
  collected = []
267
 
268
  for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
 
278
  stance = row['Stance']
279
  description = row['description']
280
  summary = row['summary']
 
281
  trend = clean_text(row['llama_trend_summary'])
282
  insight = clean_text(row['llama_insight'])
283
 
 
285
  st.markdown(f"**Category:** {cat_std}")
286
  st.markdown(f"**Goal:** {goal}")
287
  st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
288
+ st.markdown(f"**Policy Impact Area:** {impact}")
289
  st.markdown(f"**Key Provision:** {provision}")
290
  st.markdown(f"**Description:** {description}")
291
  st.markdown(f"**Summary:** {summary}")
292
+ st.markdown(f"Trend Summary: {trend}")
293
+ st.markdown(f"Actionable Insight: {insight}")
294
+ st.markdown(f"[View Full Bill Text]({full_url})")
295
  st.divider()
296
 
297
+ collected.append("\n".join([
298
+ trend, insight, summary, description, impact, provision
299
+ ]))
300
+
301
+ with st.expander("View summarized insights context"):
302
+ for i, t in enumerate(collected[:5]):
303
+ st.markdown(f"**Insight #{i+1}:**\n{t}\n")
304
 
305
  st.subheader("RAG-Generated Overall Summary")
306
  summary = rag_summarize(collected, summarizer)
307
  st.success(summary)
308