tjl8 committed on
Commit
7c62b1c
·
verified ·
1 Parent(s): 34fa400

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -34
app.py CHANGED
@@ -7,7 +7,7 @@ from sklearn.metrics.pairwise import cosine_similarity
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from datetime import datetime
9
 
10
- # Load dataset
11
  @st.cache_data
12
  def load_data():
13
  df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
@@ -50,57 +50,74 @@ def rag_summarize(group_texts, summarizer, top_k=5):
50
  result = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
51
  return result[0]['summary_text']
52
 
53
- # Streamlit UI
54
- st.set_page_config(page_title="Illinois Trends Q&A", layout="wide")
55
- st.title("πŸ“Š Illinois Trends Explorer")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
- # Load data & models
58
  df = load_data()
59
  embed_model, summarizer = load_models()
60
 
61
- # Sidebar filters
62
- st.sidebar.header("πŸ“… Filter Options")
63
- years = sorted(df['status_date'].dt.year.unique(), reverse=True)
64
- months = list(range(1, 13))
65
- month_names = {i: datetime(2000, i, 1).strftime('%B') for i in months}
66
 
67
- selected_year = st.sidebar.selectbox("Select Year", years)
68
- selected_month = st.sidebar.selectbox("Select Month (optional)", [None] + months, format_func=lambda x: "All" if x is None else month_names[x])
69
- selected_category = st.sidebar.selectbox("Select Category (optional)", ["All"] + sorted(df['category_&_subcategory_standardized'].dropna().unique()))
 
70
 
71
- # Filter data
72
- df_filtered = df[df['status_date'].dt.year == selected_year]
73
- if selected_month:
74
- df_filtered = df_filtered[df_filtered['status_date'].dt.month == selected_month]
75
- if selected_category != "All":
76
- df_filtered = df_filtered[df_filtered['category_&_subcategory_standardized'] == selected_category]
77
 
78
- # Summary statistics
79
- st.markdown(f"### πŸ“ˆ Top Categories in {month_names.get(selected_month, 'All Months')} {selected_year}")
80
- top_cats = df_filtered['category_&_subcategory_standardized'].value_counts().head(5)
81
- st.bar_chart(top_cats)
82
 
83
- # Question input
84
- query = st.text_input("πŸ” Ask your question about trends:")
 
 
 
 
 
85
 
86
- if query:
87
  if df_filtered.empty:
88
- st.warning("No data available for this filter.")
89
  else:
90
- embeddings = compute_embeddings(df_filtered["summary_insight"].tolist(), _model=embed_model)
91
  results = semantic_search(query, embeddings, embed_model, threshold=0.4)
92
 
93
  if not results:
94
- st.warning("No relevant insights found.")
95
  else:
 
96
  top_texts = []
97
- st.subheader("πŸ”Ž Top Matching Insights:")
98
  for idx, score in sorted(results, key=lambda x: x[1], reverse=True)[:5]:
99
  row = df_filtered.iloc[idx]
100
- st.markdown(f"**πŸ“… Date:** {row['status_date'].date()} | **πŸ”’ Score:** {score:.2f}")
101
- st.markdown(f"**πŸ“Œ Title:** {row['title']}")
102
- st.markdown(f"**🏷️ Category:** {row['category_&_subcategory_standardized']} | **🎯 Goal:** {row['legislative_goal_standardized']}")
103
- st.markdown(f"**🧭 Intent:** {row['intent_standardized']} | **���️ Stance:** {row['stance_standardized']}")
104
  st.markdown(f"```{row['summary_insight'][:500]}```")
105
  top_texts.append(row['summary_insight'])
106
 
 
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from datetime import datetime
9
 
10
+ # Load data
11
  @st.cache_data
12
  def load_data():
13
  df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
 
50
  result = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
51
  return result[0]['summary_text']
52
 
53
+ # Extract month/year from query
54
def extract_month_year(query):
    """Pull an optional month and year out of a free-text question.

    Args:
        query: The user's question, e.g. ``"education in May 2024"``.

    Returns:
        A ``(month, year)`` tuple. ``month`` is 1-12 when a full English
        month name appears in the query as a whole word, otherwise ``None``.
        ``year`` is a four-digit year starting with 19 or 20 that is not
        embedded in a longer run of digits, otherwise ``None``.
    """
    month_map = {
        "january": 1, "february": 2, "march": 3, "april": 4,
        "may": 5, "june": 6, "july": 7, "august": 8,
        "september": 9, "october": 10, "november": 11, "december": 12,
    }

    # Compare against whole alphabetic words rather than raw substrings so
    # that "mayor", "maybe" or "marching" are not mistaken for a month.
    # Splitting on non-letters also lets "May2024" still yield "may".
    # NOTE(review): "may" used as a modal verb ("bills that may pass") is
    # still read as the month; resolving that needs more than keyword
    # matching.
    words = set(re.findall(r"[a-z]+", query.lower()))
    month = next(
        (number for name, number in month_map.items() if name in words),
        None,
    )

    # Digit lookarounds keep the year from matching inside longer numbers
    # (e.g. "120245") while still accepting "May2024".
    year_match = re.search(r"(?<!\d)(?:19|20)\d{2}(?!\d)", query)
    year = int(year_match.group()) if year_match else None

    return month, year
65
+
66
+ # Extract category from query using simple keyword match
67
def extract_category_from_query(query, categories):
    """Pick the category whose keywords best match the user's question.

    Each category label is broken into lowercase alphanumeric keywords.
    Connector words and tokens shorter than three characters (such as
    "and", "of" or "&") are ignored, because they would otherwise make a
    label match almost any question. A keyword counts as matched when some
    word of the query starts with it, so "taxes" still matches "Tax" while
    "syntax" does not.

    Args:
        query: The user's free-text question.
        categories: Iterable of category labels. Entries that are not
            strings (NaN, None, ...) are skipped.

    Returns:
        The label with the most matched keywords, the earliest such label
        on a tie, or ``None`` when no label matches at all.
    """
    connectors = frozenset({"and", "the", "for", "with", "from"})

    def tokenize(text):
        # Replace punctuation with spaces so "Health & Safety" yields
        # ["health", "safety"] rather than a stray "&" token.
        cleaned = "".join(ch if ch.isalnum() else " " for ch in text.lower())
        return cleaned.split()

    query_words = tokenize(query)

    best_category = None
    best_score = 0
    for cat in categories:
        if not isinstance(cat, str):
            continue
        keywords = {
            word
            for word in tokenize(cat)
            if len(word) >= 3 and word not in connectors
        }
        score = sum(
            1
            for keyword in keywords
            if any(word.startswith(keyword) for word in query_words)
        )
        # Strict ">" keeps the first-seen category when scores tie.
        if score > best_score:
            best_category, best_score = cat, score
    return best_category
75
+
76
+ # UI
77
+ st.set_page_config(page_title="Illinois Legislative Q&A", layout="wide")
78
+ st.title("πŸ“š Illinois Legislative Trends Q&A")
79
 
 
80
  df = load_data()
81
  embed_model, summarizer = load_models()
82
 
83
+ query = st.text_input("πŸ” Ask a question about a topic, category, or time period (e.g., education in May 2024)")
 
 
 
 
84
 
85
+ if query:
86
+ month, year = extract_month_year(query)
87
+ all_categories = df['category_&_subcategory_standardized'].unique()
88
+ detected_category = extract_category_from_query(query, all_categories)
89
 
90
+ df_filtered = df.copy()
 
 
 
 
 
91
 
92
+ if detected_category:
93
+ df_filtered = df_filtered[df_filtered['category_&_subcategory_standardized'] == detected_category]
94
+ st.info(f"Filtering by category: **{detected_category}**")
 
95
 
96
+ if year:
97
+ df_filtered = df_filtered[df_filtered['status_date'].dt.year == year]
98
+ if month:
99
+ df_filtered = df_filtered[df_filtered['status_date'].dt.month == month]
100
+ st.info(f"Filtering by time: **{datetime(year, month, 1).strftime('%B %Y')}**")
101
+ else:
102
+ st.info(f"Filtering by year: **{year}**")
103
 
 
104
  if df_filtered.empty:
105
+ st.warning("No matching records found for your query.")
106
  else:
107
+ embeddings = compute_embeddings(df_filtered['summary_insight'].tolist(), _model=embed_model)
108
  results = semantic_search(query, embeddings, embed_model, threshold=0.4)
109
 
110
  if not results:
111
+ st.warning("No relevant summaries found.")
112
  else:
113
+ st.subheader("πŸ“˜ Top Matching Insights:")
114
  top_texts = []
 
115
  for idx, score in sorted(results, key=lambda x: x[1], reverse=True)[:5]:
116
  row = df_filtered.iloc[idx]
117
+ st.markdown(f"**πŸ—“οΈ Date:** {row['status_date'].date()} | **Score:** {score:.2f}")
118
+ st.markdown(f"**Title:** {row['title']}")
119
+ st.markdown(f"**Category:** {row['category_&_subcategory_standardized']} | **Goal:** {row['legislative_goal_standardized']}")
120
+ st.markdown(f"**Intent:** {row['intent_standardized']} | **Stance:** {row['stance_standardized']}")
121
  st.markdown(f"```{row['summary_insight'][:500]}```")
122
  top_texts.append(row['summary_insight'])
123