tjl8 committed on
Commit
a0b33c4
·
verified ·
1 Parent(s): e413cee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -174
app.py CHANGED
@@ -1,148 +1,3 @@
1
- # import streamlit as st
2
- # import pandas as pd
3
- # import re
4
- # from sentence_transformers import SentenceTransformer
5
- # from transformers import pipeline
6
- # from sklearn.metrics.pairwise import cosine_similarity
7
- # from sklearn.feature_extraction.text import TfidfVectorizer
8
- # from datetime import datetime
9
-
10
- # def clean_text(text):
11
- # text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
12
- # text = re.sub(r"(?i)let me know if you'd like.*", "", text)
13
- # text = re.sub(r"(?i)trend summary[:]*", "", text)
14
- # text = re.sub(r"(?i)actionable insight[:]*", "", text)
15
- # return text.strip()
16
-
17
- # @st.cache_data
18
- # def load_data():
19
- # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
20
- # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
21
- # df = df.dropna(subset=['status_date'])
22
- # df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
23
- # df["llama_insight"] = df["llama_insight"].fillna("")
24
- # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
25
- # return df
26
-
27
- # @st.cache_resource
28
- # def load_models():
29
- # embed_model = SentenceTransformer('all-MiniLM-L6-v2')
30
- # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
31
- # return embed_model, summarizer
32
-
33
- # @st.cache_data
34
- # def compute_embeddings(texts, _model):
35
- # return _model.encode(texts, show_progress_bar=True)
36
-
37
- # def semantic_search(query, embeddings, model, threshold=0.5):
38
- # query_embedding = model.encode([query])
39
- # sims = cosine_similarity(query_embedding, embeddings)[0]
40
- # return [(i, s) for i, s in enumerate(sims) if s > threshold]
41
-
42
- # def rag_summarize(texts, summarizer, top_k=5):
43
- # if not texts:
44
- # return "No relevant content to summarize."
45
- # vect = TfidfVectorizer()
46
- # m = vect.fit_transform(texts)
47
- # mean_vec = m.mean(axis=0).A
48
- # scores = cosine_similarity(mean_vec, m).flatten()
49
- # top_indices = scores.argsort()[::-1][:top_k]
50
- # ctx = "\n".join(texts[i] for i in top_indices)
51
- # prompt = "summarize: " + ctx[:1024]
52
- # out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
53
- # return out[0]['summary_text']
54
-
55
- # def extract_month_year(q):
56
- # month_map = {m: i for i, m in enumerate(
57
- # ["january", "february", "march", "april", "may", "june",
58
- # "july", "august", "september", "october", "november", "december"], 1)}
59
- # ql = q.lower()
60
- # mon = next((v for k, v in month_map.items() if k in ql), None)
61
- # ym = re.search(r"(19|20)\d{2}", q)
62
- # yr = int(ym.group()) if ym else None
63
- # return mon, yr
64
-
65
- # def extract_topic_match(query, df):
66
- # query_lower = query.lower()
67
- # return df[
68
- # df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
69
- # df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
70
- # df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
71
- # df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
72
- # ]
73
-
74
- # st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
75
- # st.title("Illinois Legislative Trends Q&A")
76
- # st.markdown("Ask about trends in topics like higher education, funding, etc.")
77
-
78
- # df = load_data()
79
- # embed_model, summarizer = load_models()
80
-
81
- # query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
82
-
83
- # if query:
84
- # mon, yr = extract_month_year(query)
85
- # df2 = extract_topic_match(query, df)
86
-
87
- # if df2.empty:
88
- # df2 = df
89
- # if yr:
90
- # df2 = df2[df2['status_date'].dt.year == yr]
91
- # if mon:
92
- # df2 = df2[df2['status_date'].dt.month == mon]
93
- # st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
94
- # else:
95
- # st.info(f" Filtering by year: **{yr}**")
96
-
97
- # if df2.empty:
98
- # st.warning("No matching records found.")
99
- # else:
100
- # texts = df2['summary_insight'].tolist()
101
- # embs = compute_embeddings(texts, _model=embed_model)
102
- # res = semantic_search(query, embs, embed_model, threshold=0.5)
103
-
104
- # if not res:
105
- # st.warning("No relevant insights found.")
106
- # else:
107
- # st.subheader(" Top Matching Insights")
108
- # collected = []
109
-
110
- # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
111
- # row = df2.iloc[idx]
112
- # date = row['status_date'].date()
113
- # bill_number = row['bill_number']
114
- # full_url = row['url']
115
- # cat_std = row['Category & Subcategory']
116
- # goal = row['Legislative Goal']
117
- # impact = row['Policy Impact Areas']
118
- # provision = row['Key Provisions']
119
- # intent = row['Intent']
120
- # stance = row['Stance']
121
- # description = row['description']
122
- # summary = row['summary']
123
-
124
- # trend = clean_text(row['llama_trend_summary'])
125
- # insight = clean_text(row['llama_insight'])
126
-
127
- # st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
128
- # st.markdown(f"**Category:** {cat_std}")
129
- # st.markdown(f"**Goal:** {goal}")
130
- # st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
131
- # st.markdown(f"**Policy Impacy Area:** {impact}")
132
- # st.markdown(f"**Key Provision:** {provision}")
133
- # st.markdown(f"**Description:** {description}")
134
- # st.markdown(f"**Summary:** {summary}")
135
- # st.markdown(f"Trend Summary:{trend}")
136
- # st.markdown(f"Actionable Insight:{insight}")
137
- # st.markdown(f"[View Full Bill Text]({full_url})\n")
138
- # st.divider()
139
-
140
- # collected.append(row['summary_insight'])
141
-
142
- # st.subheader("RAG-Generated Overall Summary")
143
- # summary = rag_summarize(collected, summarizer)
144
- # st.success(summary)
145
-
146
  import streamlit as st
147
  import pandas as pd
148
  import re
@@ -166,11 +21,7 @@ def load_data():
166
  df = df.dropna(subset=['status_date'])
167
  df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
168
  df["llama_insight"] = df["llama_insight"].fillna("")
169
- df["description"] = df["description"].fillna("")
170
- df["summary"] = df["summary"].fillna("")
171
- df["Policy Impact Areas"] = df["Policy Impact Areas"].fillna("")
172
- df["Key Provisions"] = df["Key Provisions"].fillna("")
173
- # Optional: Add more preprocessing if needed
174
  return df
175
 
176
  @st.cache_resource
@@ -215,7 +66,6 @@ def extract_topic_match(query, df):
215
  query_lower = query.lower()
216
  return df[
217
  df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
218
- df['category_granular_standardized'].fillna('').str.lower().str.contains(query_lower) |
219
  df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
220
  df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
221
  df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
@@ -240,29 +90,21 @@ if query:
240
  df2 = df2[df2['status_date'].dt.year == yr]
241
  if mon:
242
  df2 = df2[df2['status_date'].dt.month == mon]
243
- st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
244
  else:
245
- st.info(f"Filtering by year: **{yr}**")
246
 
247
  if df2.empty:
248
  st.warning("No matching records found.")
249
  else:
250
- texts = df2.apply(lambda row: "\n".join([
251
- clean_text(str(row.get('llama_trend_summary', ''))),
252
- clean_text(str(row.get('llama_insight', ''))),
253
- str(row.get('summary', '')),
254
- str(row.get('description', '')),
255
- str(row.get('Policy Impact Areas', '')),
256
- str(row.get('Key Provisions', ''))
257
- ]), axis=1).tolist()
258
-
259
  embs = compute_embeddings(texts, _model=embed_model)
260
  res = semantic_search(query, embs, embed_model, threshold=0.5)
261
 
262
  if not res:
263
  st.warning("No relevant insights found.")
264
  else:
265
- st.subheader("Top Matching Insights")
266
  collected = []
267
 
268
  for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
@@ -278,6 +120,7 @@ if query:
278
  stance = row['Stance']
279
  description = row['description']
280
  summary = row['summary']
 
281
  trend = clean_text(row['llama_trend_summary'])
282
  insight = clean_text(row['llama_insight'])
283
 
@@ -285,24 +128,19 @@ if query:
285
  st.markdown(f"**Category:** {cat_std}")
286
  st.markdown(f"**Goal:** {goal}")
287
  st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
288
- st.markdown(f"**Policy Impact Area:** {impact}")
289
  st.markdown(f"**Key Provision:** {provision}")
290
  st.markdown(f"**Description:** {description}")
291
  st.markdown(f"**Summary:** {summary}")
292
- st.markdown(f"Trend Summary: {trend}")
293
- st.markdown(f"Actionable Insight: {insight}")
294
- st.markdown(f"[View Full Bill Text]({full_url})")
295
  st.divider()
296
 
297
- collected.append("\n".join([
298
- trend, insight, summary, description, impact, provision
299
- ]))
300
-
301
- with st.expander("View summarized insights context"):
302
- for i, t in enumerate(collected[:5]):
303
- st.markdown(f"**Insight #{i+1}:**\n{t}\n")
304
 
305
  st.subheader("RAG-Generated Overall Summary")
306
  summary = rag_summarize(collected, summarizer)
307
  st.success(summary)
308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import re
 
21
  df = df.dropna(subset=['status_date'])
22
  df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
23
  df["llama_insight"] = df["llama_insight"].fillna("")
24
+ df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
 
 
 
 
25
  return df
26
 
27
  @st.cache_resource
 
66
  query_lower = query.lower()
67
  return df[
68
  df['category_&_subcategory_standardized'].fillna('').str.lower().str.contains(query_lower) |
 
69
  df['intent_standardized'].fillna('').str.lower().str.contains(query_lower) |
70
  df['legislative_goal_standardized'].fillna('').str.lower().str.contains(query_lower) |
71
  df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
 
90
  df2 = df2[df2['status_date'].dt.year == yr]
91
  if mon:
92
  df2 = df2[df2['status_date'].dt.month == mon]
93
+ st.info(f" Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
94
  else:
95
+ st.info(f" Filtering by year: **{yr}**")
96
 
97
  if df2.empty:
98
  st.warning("No matching records found.")
99
  else:
100
+ texts = df2['summary_insight'].tolist()
 
 
 
 
 
 
 
 
101
  embs = compute_embeddings(texts, _model=embed_model)
102
  res = semantic_search(query, embs, embed_model, threshold=0.5)
103
 
104
  if not res:
105
  st.warning("No relevant insights found.")
106
  else:
107
+ st.subheader(" Top Matching Insights")
108
  collected = []
109
 
110
  for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
 
120
  stance = row['Stance']
121
  description = row['description']
122
  summary = row['summary']
123
+
124
  trend = clean_text(row['llama_trend_summary'])
125
  insight = clean_text(row['llama_insight'])
126
 
 
128
  st.markdown(f"**Category:** {cat_std}")
129
  st.markdown(f"**Goal:** {goal}")
130
  st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
131
+ st.markdown(f"**Policy Impacy Area:** {impact}")
132
  st.markdown(f"**Key Provision:** {provision}")
133
  st.markdown(f"**Description:** {description}")
134
  st.markdown(f"**Summary:** {summary}")
135
+ st.markdown(f"Trend Summary:{trend}")
136
+ st.markdown(f"Actionable Insight:{insight}")
137
+ st.markdown(f"[View Full Bill Text]({full_url})\n")
138
  st.divider()
139
 
140
+ collected.append(row['summary_insight'])
 
 
 
 
 
 
141
 
142
  st.subheader("RAG-Generated Overall Summary")
143
  summary = rag_summarize(collected, summarizer)
144
  st.success(summary)
145
 
146
+