tjl8 committed on
Commit
cfce783
·
verified ·
1 Parent(s): 7c62b1c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -73
app.py CHANGED
@@ -33,94 +33,99 @@ def compute_embeddings(texts, _model):
33
  # Semantic search
34
  def semantic_search(query, embeddings, model, threshold=0.4):
35
  query_embedding = model.encode([query])
36
- similarities = cosine_similarity(query_embedding, embeddings)[0]
37
- return [(i, score) for i, score in enumerate(similarities) if score > threshold]
38
 
39
  # RAG summarization
40
- def rag_summarize(group_texts, summarizer, top_k=5):
41
- if not group_texts:
42
  return "No relevant content to summarize."
43
- vectorizer = TfidfVectorizer()
44
- tfidf_matrix = vectorizer.fit_transform(group_texts)
45
- mean_vector = tfidf_matrix.mean(axis=0).A
46
- sim_scores = cosine_similarity(mean_vector, tfidf_matrix).flatten()
47
- top_indices = sim_scores.argsort()[::-1][:top_k]
48
- context = "\n".join([group_texts[i] for i in top_indices])
49
- prompt = "summarize: " + context[:1024]
50
- result = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
51
- return result[0]['summary_text']
52
-
53
- # Extract month/year from query
54
- def extract_month_year(query):
55
- month_map = {
56
- "january": 1, "february": 2, "march": 3, "april": 4,
57
- "may": 5, "june": 6, "july": 7, "august": 8,
58
- "september": 9, "october": 10, "november": 11, "december": 12
59
- }
60
- query_lower = query.lower()
61
- month = next((month_map[m] for m in month_map if m in query_lower), None)
62
- year_match = re.search(r"(19|20)\d{2}", query)
63
- year = int(year_match.group()) if year_match else None
64
- return month, year
65
-
66
- # Extract category from query using simple keyword match
67
- def extract_category_from_query(query, categories):
68
- query = query.lower()
69
- for cat in categories:
70
- if pd.isna(cat):
71
- continue
72
- if any(word in query for word in cat.lower().split()):
73
  return cat
74
  return None
75
 
76
- # UI
77
- st.set_page_config(page_title="Illinois Legislative Q&A", layout="wide")
78
- st.title("📚 Illinois Legislative Trends Q&A")
79
 
80
  df = load_data()
81
  embed_model, summarizer = load_models()
82
 
83
- query = st.text_input("🔍 Ask a question about a topic, category, or time period (e.g., education in May 2024)")
84
 
85
  if query:
86
- month, year = extract_month_year(query)
87
- all_categories = df['category_&_subcategory_standardized'].unique()
88
- detected_category = extract_category_from_query(query, all_categories)
89
-
90
- df_filtered = df.copy()
91
-
92
- if detected_category:
93
- df_filtered = df_filtered[df_filtered['category_&_subcategory_standardized'] == detected_category]
94
- st.info(f"Filtering by category: **{detected_category}**")
95
-
96
- if year:
97
- df_filtered = df_filtered[df_filtered['status_date'].dt.year == year]
98
- if month:
99
- df_filtered = df_filtered[df_filtered['status_date'].dt.month == month]
100
- st.info(f"Filtering by time: **{datetime(year, month, 1).strftime('%B %Y')}**")
101
  else:
102
- st.info(f"Filtering by year: **{year}**")
103
 
104
- if df_filtered.empty:
105
- st.warning("No matching records found for your query.")
106
  else:
107
- embeddings = compute_embeddings(df_filtered['summary_insight'].tolist(), _model=embed_model)
108
- results = semantic_search(query, embeddings, embed_model, threshold=0.4)
 
109
 
110
- if not results:
111
- st.warning("No relevant summaries found.")
112
  else:
113
- st.subheader("📘 Top Matching Insights:")
114
- top_texts = []
115
- for idx, score in sorted(results, key=lambda x: x[1], reverse=True)[:5]:
116
- row = df_filtered.iloc[idx]
117
- st.markdown(f"**🗓️ Date:** {row['status_date'].date()} | **Score:** {score:.2f}")
118
- st.markdown(f"**Title:** {row['title']}")
119
- st.markdown(f"**Category:** {row['category_&_subcategory_standardized']} | **Goal:** {row['legislative_goal_standardized']}")
120
- st.markdown(f"**Intent:** {row['intent_standardized']} | **Stance:** {row['stance_standardized']}")
121
- st.markdown(f"```{row['summary_insight'][:500]}```")
122
- top_texts.append(row['summary_insight'])
123
-
124
- st.subheader("🧠 RAG-Generated Summary:")
125
- summary = rag_summarize(top_texts, summarizer, top_k=5)
 
 
 
 
 
 
 
 
 
126
  st.success(summary)
 
33
  # Semantic search
34
  def semantic_search(query, embeddings, model, threshold=0.4):
35
  query_embedding = model.encode([query])
36
+ sims = cosine_similarity(query_embedding, embeddings)[0]
37
+ return [(i, s) for i, s in enumerate(sims) if s > threshold]
38
 
39
  # RAG summarization
40
+ def rag_summarize(texts, summarizer, top_k=5):
41
+ if not texts:
42
  return "No relevant content to summarize."
43
+ vect = TfidfVectorizer()
44
+ m = vect.fit_transform(texts)
45
+ mean_vec = m.mean(axis=0).A
46
+ scores = cosine_similarity(mean_vec, m).flatten()
47
+ top_indices = scores.argsort()[::-1][:top_k]
48
+ ctx = "\n".join(texts[i] for i in top_indices)
49
+ prompt = "summarize: " + ctx[:1024]
50
+ out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
51
+ return out[0]['summary_text']
52
+
53
+ # Parse month/year
54
+ def extract_month_year(q):
55
+ month_map = {m: i for i, m in enumerate(
56
+ ["january", "february", "march", "april", "may", "june",
57
+ "july", "august", "september", "october", "november", "december"], 1)}
58
+ ql = q.lower()
59
+ mon = next((v for k, v in month_map.items() if k in ql), None)
60
+ ym = re.search(r"(19|20)\d{2}", q)
61
+ yr = int(ym.group()) if ym else None
62
+ return mon, yr
63
+
64
+ # Auto-detect category
65
+ def extract_category(q, cats):
66
+ ql = q.lower()
67
+ for cat in cats:
68
+ if pd.isna(cat): continue
69
+ if any(tok in ql for tok in cat.lower().split()):
 
 
 
70
  return cat
71
  return None
72
 
73
+ # Streamlit UI
74
+ st.set_page_config(page_title="IL Trends Q&A", layout="wide")
75
+ st.title("Illinois Legislative Trends Q&A")
76
 
77
  df = load_data()
78
  embed_model, summarizer = load_models()
79
 
80
+ query = st.text_input("Ask a question (e.g., ‘education in May 2024’):")
81
 
82
  if query:
83
+ mon, yr = extract_month_year(query)
84
+ cats = df['category_&_subcategory_standardized'].unique()
85
+ cat = extract_category(query, cats)
86
+
87
+ df2 = df.copy()
88
+ if cat:
89
+ df2 = df2[df2['category_&_subcategory_standardized'] == cat]
90
+ st.info(f"🔎 Filtering by category: **{cat}**")
91
+ if yr:
92
+ df2 = df2[df2['status_date'].dt.year == yr]
93
+ if mon:
94
+ df2 = df2[df2['status_date'].dt.month == mon]
95
+ st.info(f"🔎 Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
 
 
96
  else:
97
+ st.info(f"🔎 Filtering by year: **{yr}**")
98
 
99
+ if df2.empty:
100
+ st.warning("No matching records found.")
101
  else:
102
+ texts = df2['summary_insight'].tolist()
103
+ embs = compute_embeddings(texts, _model=embed_model)
104
+ res = semantic_search(query, embs, embed_model)
105
 
106
+ if not res:
107
+ st.warning("No relevant insights found.")
108
  else:
109
+ st.subheader("Top Matching Insights")
110
+ collected = []
111
+
112
+ for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
113
+ row = df2.iloc[idx]
114
+ date = row['status_date'].date()
115
+ cat_std = row['category_&_subcategory_standardized']
116
+ goal = row['legislative_goal_standardized']
117
+ intent = row['intent_standardized']
118
+ stance = row['stance_standardized']
119
+ trend_summary = row['llama_trend_summary'].strip()
120
+
121
+ st.markdown(f"- **Date:** {date} | **Score:** {score:.2f}")
122
+ st.markdown(f" - **Category:** {cat_std}")
123
+ st.markdown(f" - **Goal:** {goal}")
124
+ st.markdown(f" - **Intent:** {intent} | **Stance:** {stance}")
125
+ st.markdown(f" > **Trend Summary:** {trend_summary}")
126
+
127
+ collected.append(row['summary_insight'])
128
+
129
+ st.subheader(" RAG-Generated Summary")
130
+ summary = rag_summarize(collected, summarizer)
131
  st.success(summary)