tjl8 committed on
Commit
e587b94
·
verified ·
1 Parent(s): cfce783

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -21
app.py CHANGED
@@ -1,3 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import re
@@ -7,36 +140,36 @@ from sklearn.metrics.pairwise import cosine_similarity
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from datetime import datetime
9
 
10
- # Load data
11
  @st.cache_data
12
  def load_data():
13
  df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
14
- df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
15
- df = df.dropna(subset=['status_date'])
16
- df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
17
  df["llama_insight"] = df["llama_insight"].fillna("")
18
- df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
19
  return df
20
 
21
- # Load models
22
  @st.cache_resource
23
  def load_models():
24
- embed_model = SentenceTransformer('all-MiniLM-L6-v2')
25
- summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
26
  return embed_model, summarizer
27
 
28
- # Compute embeddings
29
  @st.cache_data
30
  def compute_embeddings(texts, _model):
31
  return _model.encode(texts, show_progress_bar=True)
32
 
33
- # Semantic search
34
- def semantic_search(query, embeddings, model, threshold=0.4):
35
  query_embedding = model.encode([query])
36
  sims = cosine_similarity(query_embedding, embeddings)[0]
37
  return [(i, s) for i, s in enumerate(sims) if s > threshold]
38
 
39
- # RAG summarization
40
  def rag_summarize(texts, summarizer, top_k=5):
41
  if not texts:
42
  return "No relevant content to summarize."
@@ -44,13 +177,13 @@ def rag_summarize(texts, summarizer, top_k=5):
44
  m = vect.fit_transform(texts)
45
  mean_vec = m.mean(axis=0).A
46
  scores = cosine_similarity(mean_vec, m).flatten()
47
- top_indices = scores.argsort()[::-1][:top_k]
48
  ctx = "\n".join(texts[i] for i in top_indices)
49
  prompt = "summarize: " + ctx[:1024]
50
  out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
51
  return out[0]['summary_text']
52
 
53
- # Parse month/year
54
  def extract_month_year(q):
55
  month_map = {m: i for i, m in enumerate(
56
  ["january", "february", "march", "april", "may", "june",
@@ -61,7 +194,7 @@ def extract_month_year(q):
61
  yr = int(ym.group()) if ym else None
62
  return mon, yr
63
 
64
- # Auto-detect category
65
  def extract_category(q, cats):
66
  ql = q.lower()
67
  for cat in cats:
@@ -70,38 +203,52 @@ def extract_category(q, cats):
70
  return cat
71
  return None
72
 
73
- # Streamlit UI
74
  st.set_page_config(page_title="IL Trends Q&A", layout="wide")
75
  st.title("Illinois Legislative Trends Q&A")
76
 
 
77
  df = load_data()
78
  embed_model, summarizer = load_models()
79
 
80
- query = st.text_input("Ask a question (e.g., ‘education in May 2024’):")
 
81
 
82
  if query:
 
83
  mon, yr = extract_month_year(query)
84
  cats = df['category_&_subcategory_standardized'].unique()
85
  cat = extract_category(query, cats)
86
 
87
  df2 = df.copy()
 
 
 
 
 
 
 
88
  if cat:
89
  df2 = df2[df2['category_&_subcategory_standardized'] == cat]
90
  st.info(f"🔎 Filtering by category: **{cat}**")
 
 
91
  if yr:
92
  df2 = df2[df2['status_date'].dt.year == yr]
93
  if mon:
94
  df2 = df2[df2['status_date'].dt.month == mon]
95
- st.info(f"🔎 Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
96
  else:
97
- st.info(f"🔎 Filtering by year: **{yr}**")
98
 
 
99
  if df2.empty:
100
  st.warning("No matching records found.")
101
  else:
 
102
  texts = df2['summary_insight'].tolist()
103
  embs = compute_embeddings(texts, _model=embed_model)
104
- res = semantic_search(query, embs, embed_model)
105
 
106
  if not res:
107
  st.warning("No relevant insights found.")
@@ -109,6 +256,7 @@ if query:
109
  st.subheader("Top Matching Insights")
110
  collected = []
111
 
 
112
  for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
113
  row = df2.iloc[idx]
114
  date = row['status_date'].date()
@@ -126,6 +274,7 @@ if query:
126
 
127
  collected.append(row['summary_insight'])
128
 
129
- st.subheader(" RAG-Generated Summary")
 
130
  summary = rag_summarize(collected, summarizer)
131
  st.success(summary)
 
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # import re
4
+ # from sentence_transformers import SentenceTransformer
5
+ # from transformers import pipeline
6
+ # from sklearn.metrics.pairwise import cosine_similarity
7
+ # from sklearn.feature_extraction.text import TfidfVectorizer
8
+ # from datetime import datetime
9
+
10
+ # # Load data
11
+ # @st.cache_data
12
+ # def load_data():
13
+ # df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
14
+ # df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
15
+ # df = df.dropna(subset=['status_date'])
16
+ # df["llama_trend_summary"] = df["llama_trend_summary"].fillna("")
17
+ # df["llama_insight"] = df["llama_insight"].fillna("")
18
+ # df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
19
+ # return df
20
+
21
+ # # Load models
22
+ # @st.cache_resource
23
+ # def load_models():
24
+ # embed_model = SentenceTransformer('all-MiniLM-L6-v2')
25
+ # summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
26
+ # return embed_model, summarizer
27
+
28
+ # # Compute embeddings
29
+ # @st.cache_data
30
+ # def compute_embeddings(texts, _model):
31
+ # return _model.encode(texts, show_progress_bar=True)
32
+
33
+ # # Semantic search
34
+ # def semantic_search(query, embeddings, model, threshold=0.4):
35
+ # query_embedding = model.encode([query])
36
+ # sims = cosine_similarity(query_embedding, embeddings)[0]
37
+ # return [(i, s) for i, s in enumerate(sims) if s > threshold]
38
+
39
+ # # RAG summarization
40
+ # def rag_summarize(texts, summarizer, top_k=5):
41
+ # if not texts:
42
+ # return "No relevant content to summarize."
43
+ # vect = TfidfVectorizer()
44
+ # m = vect.fit_transform(texts)
45
+ # mean_vec = m.mean(axis=0).A
46
+ # scores = cosine_similarity(mean_vec, m).flatten()
47
+ # top_indices = scores.argsort()[::-1][:top_k]
48
+ # ctx = "\n".join(texts[i] for i in top_indices)
49
+ # prompt = "summarize: " + ctx[:1024]
50
+ # out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
51
+ # return out[0]['summary_text']
52
+
53
+ # # Parse month/year
54
+ # def extract_month_year(q):
55
+ # month_map = {m: i for i, m in enumerate(
56
+ # ["january", "february", "march", "april", "may", "june",
57
+ # "july", "august", "september", "october", "november", "december"], 1)}
58
+ # ql = q.lower()
59
+ # mon = next((v for k, v in month_map.items() if k in ql), None)
60
+ # ym = re.search(r"(19|20)\d{2}", q)
61
+ # yr = int(ym.group()) if ym else None
62
+ # return mon, yr
63
+
64
+ # # Auto-detect category
65
+ # def extract_category(q, cats):
66
+ # ql = q.lower()
67
+ # for cat in cats:
68
+ # if pd.isna(cat): continue
69
+ # if any(tok in ql for tok in cat.lower().split()):
70
+ # return cat
71
+ # return None
72
+
73
+ # # Streamlit UI
74
+ # st.set_page_config(page_title="IL Trends Q&A", layout="wide")
75
+ # st.title("Illinois Legislative Trends Q&A")
76
+
77
+ # df = load_data()
78
+ # embed_model, summarizer = load_models()
79
+
80
+ # query = st.text_input("Ask a question (e.g., ‘education in May 2024’):")
81
+
82
+ # if query:
83
+ # mon, yr = extract_month_year(query)
84
+ # cats = df['category_&_subcategory_standardized'].unique()
85
+ # cat = extract_category(query, cats)
86
+
87
+ # df2 = df.copy()
88
+ # if cat:
89
+ # df2 = df2[df2['category_&_subcategory_standardized'] == cat]
90
+ # st.info(f"🔎 Filtering by category: **{cat}**")
91
+ # if yr:
92
+ # df2 = df2[df2['status_date'].dt.year == yr]
93
+ # if mon:
94
+ # df2 = df2[df2['status_date'].dt.month == mon]
95
+ # st.info(f"🔎 Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
96
+ # else:
97
+ # st.info(f"🔎 Filtering by year: **{yr}**")
98
+
99
+ # if df2.empty:
100
+ # st.warning("No matching records found.")
101
+ # else:
102
+ # texts = df2['summary_insight'].tolist()
103
+ # embs = compute_embeddings(texts, _model=embed_model)
104
+ # res = semantic_search(query, embs, embed_model)
105
+
106
+ # if not res:
107
+ # st.warning("No relevant insights found.")
108
+ # else:
109
+ # st.subheader("Top Matching Insights")
110
+ # collected = []
111
+
112
+ # for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
113
+ # row = df2.iloc[idx]
114
+ # date = row['status_date'].date()
115
+ # cat_std = row['category_&_subcategory_standardized']
116
+ # goal = row['legislative_goal_standardized']
117
+ # intent = row['intent_standardized']
118
+ # stance = row['stance_standardized']
119
+ # trend_summary = row['llama_trend_summary'].strip()
120
+
121
+ # st.markdown(f"- **Date:** {date} | **Score:** {score:.2f}")
122
+ # st.markdown(f" - **Category:** {cat_std}")
123
+ # st.markdown(f" - **Goal:** {goal}")
124
+ # st.markdown(f" - **Intent:** {intent} | **Stance:** {stance}")
125
+ # st.markdown(f" > **Trend Summary:** {trend_summary}")
126
+
127
+ # collected.append(row['summary_insight'])
128
+
129
+ # st.subheader(" RAG-Generated Summary")
130
+ # summary = rag_summarize(collected, summarizer)
131
+ # st.success(summary)
132
+
133
+
134
  import streamlit as st
135
  import pandas as pd
136
  import re
 
140
  from sklearn.feature_extraction.text import TfidfVectorizer
141
  from datetime import datetime
142
 
143
# Load and preprocess the dataset
@st.cache_data
def load_data():
    """Read the Illinois insights CSV and prepare it for searching.

    Rows whose ``status_date`` cannot be parsed as day-month-year are
    dropped, missing summary/insight text is replaced with an empty
    string, and a combined ``summary_insight`` column is added.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    frame = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")

    # errors='coerce' turns unparseable dates into NaT so they can be dropped.
    parsed_dates = pd.to_datetime(frame['status_date'], format='%d-%m-%Y', errors='coerce')
    frame['status_date'] = parsed_dates
    frame = frame.dropna(subset=['status_date'])

    # Empty strings keep the concatenation below from producing NaN.
    for text_col in ("llama_trend_summary", "llama_insight"):
        frame[text_col] = frame[text_col].fillna("")

    frame["summary_insight"] = frame["llama_trend_summary"] + "\n" + frame["llama_insight"]
    return frame
153
 
154
# Load sentence embedding model + summarization model
@st.cache_resource
def load_models():
    """Construct the two models used by the app.

    Returns:
        A ``(embed_model, summarizer)`` tuple: the ``all-MiniLM-L6-v2``
        sentence-transformer used for semantic search, and a ``t5-small``
        summarization pipeline used for the final summary.
    """
    return (
        SentenceTransformer('all-MiniLM-L6-v2'),
        pipeline("summarization", model="t5-small", tokenizer="t5-small"),
    )
160
 
161
# Generate embeddings from a list of texts
@st.cache_data
def compute_embeddings(texts, _model):
    """Encode ``texts`` with ``_model`` and return whatever ``encode`` yields.

    Args:
        texts: The texts to embed.
        _model: Object exposing ``encode(texts, show_progress_bar=...)``.
            NOTE(review): the leading underscore presumably tells the
            Streamlit cache not to hash this argument; confirm against
            the Streamlit caching documentation.

    Returns:
        The result of ``_model.encode`` for ``texts``.
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors
165
 
166
# Perform semantic search using cosine similarity
def semantic_search(query, embeddings, model, threshold=0.7):  # Adjusted threshold to 0.7
    """Find the embeddings whose similarity to ``query`` exceeds ``threshold``.

    Args:
        query: The user's question text.
        embeddings: Precomputed embeddings to compare against.
        model: Object exposing ``encode``; used to embed the query.
        threshold: Similarity a candidate must strictly exceed to be kept.

    Returns:
        A list of ``(index, similarity)`` pairs in the original order of
        ``embeddings``, one per candidate above the threshold.
    """
    query_vec = model.encode([query])
    # Only one query is encoded, so take the single row of the result.
    similarity_row = cosine_similarity(query_vec, embeddings)[0]

    hits = []
    for position, score in enumerate(similarity_row):
        if score > threshold:
            hits.append((position, score))
    return hits
171
 
172
+ # Retrieve top matching texts and summarize them (RAG-like approach)
173
  def rag_summarize(texts, summarizer, top_k=5):
174
  if not texts:
175
  return "No relevant content to summarize."
 
177
  m = vect.fit_transform(texts)
178
  mean_vec = m.mean(axis=0).A
179
  scores = cosine_similarity(mean_vec, m).flatten()
180
+ top_indices = scores.argsort()[::-1][:top_k] # Pick top-k similar insights
181
  ctx = "\n".join(texts[i] for i in top_indices)
182
  prompt = "summarize: " + ctx[:1024]
183
  out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
184
  return out[0]['summary_text']
185
 
186
+ # Extract month and year from query (e.g., "May 2024")
187
  def extract_month_year(q):
188
  month_map = {m: i for i, m in enumerate(
189
  ["january", "february", "march", "april", "may", "june",
 
194
  yr = int(ym.group()) if ym else None
195
  return mon, yr
196
 
197
+ # Try to detect a category mentioned in the query
198
  def extract_category(q, cats):
199
  ql = q.lower()
200
  for cat in cats:
 
203
  return cat
204
  return None
205
 
206
+ # ---- Streamlit Interface ---- #
207
  st.set_page_config(page_title="IL Trends Q&A", layout="wide")
208
  st.title("Illinois Legislative Trends Q&A")
209
 
210
+ # Load the dataset and models
211
  df = load_data()
212
  embed_model, summarizer = load_models()
213
 
214
+ # User enters question
215
+ query = st.text_input("Ask a question (e.g., ‘trends in higher education in May 2024’):")
216
 
217
  if query:
218
+ # Extract date or category from user question
219
  mon, yr = extract_month_year(query)
220
  cats = df['category_&_subcategory_standardized'].unique()
221
  cat = extract_category(query, cats)
222
 
223
  df2 = df.copy()
224
+
225
+ # Filter if query includes "opposed"
226
+ if "opposed" in query.lower():
227
+ df2 = df2[df2['stance_standardized'].str.lower() == "opposed"]
228
+ st.info("🔎 Filtering for bills where stance is **opposed**")
229
+
230
+ # Filter by detected category
231
  if cat:
232
  df2 = df2[df2['category_&_subcategory_standardized'] == cat]
233
  st.info(f"🔎 Filtering by category: **{cat}**")
234
+
235
+ # Filter by year/month if detected
236
  if yr:
237
  df2 = df2[df2['status_date'].dt.year == yr]
238
  if mon:
239
  df2 = df2[df2['status_date'].dt.month == mon]
240
+ st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
241
  else:
242
+ st.info(f" Filtering by year: **{yr}**")
243
 
244
+ # If no data after filtering
245
  if df2.empty:
246
  st.warning("No matching records found.")
247
  else:
248
+ # Generate semantic matches
249
  texts = df2['summary_insight'].tolist()
250
  embs = compute_embeddings(texts, _model=embed_model)
251
+ res = semantic_search(query, embs, embed_model) # Uses threshold=0.7
252
 
253
  if not res:
254
  st.warning("No relevant insights found.")
 
256
  st.subheader("Top Matching Insights")
257
  collected = []
258
 
259
+ # Display top matches with metadata
260
  for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
261
  row = df2.iloc[idx]
262
  date = row['status_date'].date()
 
274
 
275
  collected.append(row['summary_insight'])
276
 
277
+ # RAG-generated summary from top matching insights
278
+ st.subheader("RAG-Generated Summary")
279
  summary = rag_summarize(collected, summarizer)
280
  st.success(summary)