tjl8 committed on
Commit
34fa400
·
verified ·
1 Parent(s): 780f23d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -74
app.py CHANGED
@@ -1,5 +1,3 @@
1
- # app.py
2
-
3
  import streamlit as st
4
  import pandas as pd
5
  import re
@@ -40,6 +38,8 @@ def semantic_search(query, embeddings, model, threshold=0.4):
40
 
41
  # RAG summarization
42
  def rag_summarize(group_texts, summarizer, top_k=5):
 
 
43
  vectorizer = TfidfVectorizer()
44
  tfidf_matrix = vectorizer.fit_transform(group_texts)
45
  mean_vector = tfidf_matrix.mean(axis=0).A
@@ -50,92 +50,60 @@ def rag_summarize(group_texts, summarizer, top_k=5):
50
  result = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
51
  return result[0]['summary_text']
52
 
53
- # Extract trend level and year
54
- def extract_query_info(query):
55
- query = query.lower()
56
- trend_level = None
57
- if "monthly" in query:
58
- trend_level = "monthly"
59
- elif "quarterly" in query:
60
- trend_level = "quarterly"
61
- elif "yearly" in query or "annual" in query:
62
- trend_level = "yearly"
63
-
64
- year_match = re.search(r"(19|20)\d{2}", query)
65
- year = int(year_match.group()) if year_match else None
66
-
67
- return trend_level, year
68
-
69
- # Group and summarize
70
- def generate_trend_summary(df_filtered, level, summarizer):
71
- if level == "monthly":
72
- df_filtered['month'] = df_filtered['status_date'].dt.to_period('M').apply(lambda r: r.start_time)
73
- grouped = df_filtered.groupby('month')['summary_insight'].apply(list).reset_index()
74
- elif level == "quarterly":
75
- df_filtered['quarter'] = df_filtered['status_date'].dt.to_period('Q').apply(lambda r: r.start_time)
76
- grouped = df_filtered.groupby('quarter')['summary_insight'].apply(list).reset_index()
77
- elif level == "yearly":
78
- df_filtered['year'] = df_filtered['status_date'].dt.year
79
- grouped = df_filtered.groupby('year')['summary_insight'].apply(list).reset_index()
80
- else:
81
- return None # Should not happen
82
-
83
- summaries = []
84
- for i, row in grouped.iterrows():
85
- summary = rag_summarize(row['summary_insight'], summarizer)
86
- summaries.append((row[0], summary))
87
-
88
- return summaries
89
 
90
- # ---------------- Streamlit UI ----------------
91
- st.set_page_config(page_title="Illinois Legislative Explorer", layout="wide")
92
- st.title("πŸ“Š Illinois Bill Trends Explorer (Monthly, Quarterly, Yearly)")
93
- st.markdown("Ask a question like:")
94
- st.markdown("- *What are the monthly trends in 2024?*")
95
- st.markdown("- *Give me quarterly updates for 2023*")
96
- st.markdown("- *Yearly trends in equity bills 2022*")
97
- st.markdown("- *What are the bills about clean energy?*")
98
-
99
- # Load
100
  df = load_data()
101
  embed_model, summarizer = load_models()
102
 
103
- # User query
104
- query = st.text_input("πŸ” Ask your question:")
 
 
 
105
 
106
- if query:
107
- trend_level, year = extract_query_info(query)
 
108
 
109
- # Time-filtering logic
110
- if year:
111
- df_filtered = df[df['status_date'].dt.year == year]
112
- if df_filtered.empty:
113
- st.warning(f"No data found for the year {year}.")
114
- else:
115
- df_filtered = df
116
 
117
- # If trend is specified
118
- if trend_level in ["monthly", "quarterly", "yearly"]:
119
- st.info(f"Generating **{trend_level}** trend summaries" + (f" for {year}" if year else ""))
120
- trend_summaries = generate_trend_summary(df_filtered, trend_level, summarizer)
121
 
122
- if trend_summaries:
123
- for period, summary in trend_summaries:
124
- st.subheader(f"πŸ“… {period.strftime('%B %Y') if trend_level == 'monthly' else period.strftime('%Y Q%q') if trend_level == 'quarterly' else str(period)}")
125
- st.success(summary)
126
- else:
127
- st.warning("No trends found for the selected timeline.")
128
 
 
 
 
129
  else:
130
- # No trend level -> semantic search for individual bills
131
- st.info("No trend level mentioned β€” showing top relevant bills from the data.")
132
  embeddings = compute_embeddings(df_filtered["summary_insight"].tolist(), _model=embed_model)
133
- results = semantic_search(query, embeddings, embed_model)
134
 
135
  if not results:
136
- st.warning("No relevant results found.")
137
  else:
 
 
138
  for idx, score in sorted(results, key=lambda x: x[1], reverse=True)[:5]:
139
  row = df_filtered.iloc[idx]
140
- st.markdown(f"**πŸ“… Date:** {row['status_date'].date()} | **Score:** {score:.2f}")
 
 
 
141
  st.markdown(f"```{row['summary_insight'][:500]}```")
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import re
 
38
 
39
  # RAG summarization
40
  def rag_summarize(group_texts, summarizer, top_k=5):
41
+ if not group_texts:
42
+ return "No relevant content to summarize."
43
  vectorizer = TfidfVectorizer()
44
  tfidf_matrix = vectorizer.fit_transform(group_texts)
45
  mean_vector = tfidf_matrix.mean(axis=0).A
 
50
  result = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
51
  return result[0]['summary_text']
52
 
53
+ # Streamlit UI
54
+ st.set_page_config(page_title="Illinois Trends Q&A", layout="wide")
55
+ st.title("πŸ“Š Illinois Trends Explorer")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
+ # Load data & models
 
 
 
 
 
 
 
 
 
58
  df = load_data()
59
  embed_model, summarizer = load_models()
60
 
61
+ # Sidebar filters
62
+ st.sidebar.header("πŸ“… Filter Options")
63
+ years = sorted(df['status_date'].dt.year.unique(), reverse=True)
64
+ months = list(range(1, 13))
65
+ month_names = {i: datetime(2000, i, 1).strftime('%B') for i in months}
66
 
67
+ selected_year = st.sidebar.selectbox("Select Year", years)
68
+ selected_month = st.sidebar.selectbox("Select Month (optional)", [None] + months, format_func=lambda x: "All" if x is None else month_names[x])
69
+ selected_category = st.sidebar.selectbox("Select Category (optional)", ["All"] + sorted(df['category_&_subcategory_standardized'].dropna().unique()))
70
 
71
+ # Filter data
72
+ df_filtered = df[df['status_date'].dt.year == selected_year]
73
+ if selected_month:
74
+ df_filtered = df_filtered[df_filtered['status_date'].dt.month == selected_month]
75
+ if selected_category != "All":
76
+ df_filtered = df_filtered[df_filtered['category_&_subcategory_standardized'] == selected_category]
 
77
 
78
+ # Summary statistics
79
+ st.markdown(f"### πŸ“ˆ Top Categories in {month_names.get(selected_month, 'All Months')} {selected_year}")
80
+ top_cats = df_filtered['category_&_subcategory_standardized'].value_counts().head(5)
81
+ st.bar_chart(top_cats)
82
 
83
+ # Question input
84
+ query = st.text_input("πŸ” Ask your question about trends:")
 
 
 
 
85
 
86
+ if query:
87
+ if df_filtered.empty:
88
+ st.warning("No data available for this filter.")
89
  else:
 
 
90
  embeddings = compute_embeddings(df_filtered["summary_insight"].tolist(), _model=embed_model)
91
+ results = semantic_search(query, embeddings, embed_model, threshold=0.4)
92
 
93
  if not results:
94
+ st.warning("No relevant insights found.")
95
  else:
96
+ top_texts = []
97
+ st.subheader("πŸ”Ž Top Matching Insights:")
98
  for idx, score in sorted(results, key=lambda x: x[1], reverse=True)[:5]:
99
  row = df_filtered.iloc[idx]
100
+ st.markdown(f"**πŸ“… Date:** {row['status_date'].date()} | **πŸ”’ Score:** {score:.2f}")
101
+ st.markdown(f"**πŸ“Œ Title:** {row['title']}")
102
+ st.markdown(f"**🏷️ Category:** {row['category_&_subcategory_standardized']} | **🎯 Goal:** {row['legislative_goal_standardized']}")
103
+ st.markdown(f"**🧭 Intent:** {row['intent_standardized']} | **βš–οΈ Stance:** {row['stance_standardized']}")
104
  st.markdown(f"```{row['summary_insight'][:500]}```")
105
+ top_texts.append(row['summary_insight'])
106
+
107
+ st.subheader("🧠 RAG-Generated Summary:")
108
+ summary = rag_summarize(top_texts, summarizer, top_k=5)
109
+ st.success(summary)