Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,7 +7,7 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
| 7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 8 |
from datetime import datetime
|
| 9 |
|
| 10 |
-
# Load
|
| 11 |
@st.cache_data
|
| 12 |
def load_data():
|
| 13 |
df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
|
|
@@ -50,57 +50,74 @@ def rag_summarize(group_texts, summarizer, top_k=5):
|
|
| 50 |
result = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
|
| 51 |
return result[0]['summary_text']
|
| 52 |
|
| 53 |
-
#
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
# Load data & models
|
| 58 |
df = load_data()
|
| 59 |
embed_model, summarizer = load_models()
|
| 60 |
|
| 61 |
-
|
| 62 |
-
st.sidebar.header("π
Filter Options")
|
| 63 |
-
years = sorted(df['status_date'].dt.year.unique(), reverse=True)
|
| 64 |
-
months = list(range(1, 13))
|
| 65 |
-
month_names = {i: datetime(2000, i, 1).strftime('%B') for i in months}
|
| 66 |
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
| 70 |
|
| 71 |
-
|
| 72 |
-
df_filtered = df[df['status_date'].dt.year == selected_year]
|
| 73 |
-
if selected_month:
|
| 74 |
-
df_filtered = df_filtered[df_filtered['status_date'].dt.month == selected_month]
|
| 75 |
-
if selected_category != "All":
|
| 76 |
-
df_filtered = df_filtered[df_filtered['category_&_subcategory_standardized'] == selected_category]
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
st.bar_chart(top_cats)
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
-
if query:
|
| 87 |
if df_filtered.empty:
|
| 88 |
-
st.warning("No
|
| 89 |
else:
|
| 90 |
-
embeddings = compute_embeddings(df_filtered[
|
| 91 |
results = semantic_search(query, embeddings, embed_model, threshold=0.4)
|
| 92 |
|
| 93 |
if not results:
|
| 94 |
-
st.warning("No relevant
|
| 95 |
else:
|
|
|
|
| 96 |
top_texts = []
|
| 97 |
-
st.subheader("π Top Matching Insights:")
|
| 98 |
for idx, score in sorted(results, key=lambda x: x[1], reverse=True)[:5]:
|
| 99 |
row = df_filtered.iloc[idx]
|
| 100 |
-
st.markdown(f"**
|
| 101 |
-
st.markdown(f"**
|
| 102 |
-
st.markdown(f"**
|
| 103 |
-
st.markdown(f"**
|
| 104 |
st.markdown(f"```{row['summary_insight'][:500]}```")
|
| 105 |
top_texts.append(row['summary_insight'])
|
| 106 |
|
|
|
|
| 7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 8 |
from datetime import datetime
|
| 9 |
|
| 10 |
+
# Load data
|
| 11 |
@st.cache_data
|
| 12 |
def load_data():
|
| 13 |
df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
|
|
|
|
| 50 |
result = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
|
| 51 |
return result[0]['summary_text']
|
| 52 |
|
| 53 |
+
# Extract month/year from query
|
| 54 |
+
def extract_month_year(query):
    """Pull an optional month and year out of a free-text query.

    Month names are matched case-insensitively as whole words, so words
    that merely contain a month name ("mayor", "dismay") are ignored.
    When several month names appear, the one occurring first in the query
    wins. Years are four-digit numbers starting with 19 or 20 that stand
    alone as a word, so digits embedded in identifiers such as "HB12020"
    are not mistaken for a year.

    Args:
        query: The user's question. May be empty or None.

    Returns:
        A ``(month, year)`` tuple. ``month`` is an int in 1..12 or None;
        ``year`` is an int or None.
    """
    # NOTE(review): the file's import block is not visible in this view, so
    # import locally to guarantee `re` is in scope for this function.
    import re

    if not query:
        return None, None

    month_numbers = {
        "january": 1, "february": 2, "march": 3, "april": 4,
        "may": 5, "june": 6, "july": 7, "august": 8,
        "september": 9, "october": 10, "november": 11, "december": 12,
    }

    # A single alternation lets re.search return the earliest month in the
    # text instead of whichever name happens to come first in the dict.
    # Limitation: "may" used as a verb still counts as the month of May.
    month_pattern = r"\b(" + "|".join(month_numbers) + r")\b"
    month_match = re.search(month_pattern, query.lower())
    month = month_numbers[month_match.group(1)] if month_match else None

    year_match = re.search(r"\b(?:19|20)\d{2}\b", query)
    year = int(year_match.group()) if year_match else None

    return month, year
|
| 65 |
+
|
| 66 |
+
# Extract category from query using simple keyword match
|
| 67 |
+
def extract_category_from_query(query, categories):
    """Pick the category whose name best overlaps the words of the query.

    Both the query and each category name are split into lowercase
    alphanumeric words. Connective words ("and", "of", ...) and punctuation
    such as "&" are ignored, and words must match in full, so a category
    word like "art" does not match the query word "start". The category
    sharing the most words with the query is returned; ties go to the
    category that appears first in ``categories``.

    Args:
        query: The user's question. May be empty or None.
        categories: Iterable of category names. Entries that are not
            strings (for example NaN from a DataFrame column) are skipped.

    Returns:
        The best-matching category exactly as given in ``categories``, or
        None when no category shares a meaningful word with the query.
    """
    # NOTE(review): the file's import block is not visible in this view, so
    # import locally to guarantee `re` is in scope for this function.
    import re

    if not query:
        return None

    # Words too generic to identify a category on their own.
    stopwords = {
        "a", "an", "and", "by", "for", "in", "of", "on", "or", "the",
        "to", "with",
    }

    def meaningful_words(text):
        return set(re.findall(r"[a-z0-9]+", text.lower())) - stopwords

    query_words = meaningful_words(query)
    if not query_words:
        return None

    best_category = None
    best_overlap = 0
    for cat in categories:
        # Missing values (NaN/None) and other non-text entries cannot match.
        if not isinstance(cat, str):
            continue
        overlap = len(query_words & meaningful_words(cat))
        # Strict '>' keeps the earliest category on ties.
        if overlap > best_overlap:
            best_category = cat
            best_overlap = overlap

    return best_category
|
| 75 |
+
|
| 76 |
+
# UI
|
| 77 |
+
st.set_page_config(page_title="Illinois Legislative Q&A", layout="wide")
|
| 78 |
+
st.title("π Illinois Legislative Trends Q&A")
|
| 79 |
|
|
|
|
| 80 |
df = load_data()
|
| 81 |
embed_model, summarizer = load_models()
|
| 82 |
|
| 83 |
+
query = st.text_input("π Ask a question about a topic, category, or time period (e.g., education in May 2024)")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
+
if query:
|
| 86 |
+
month, year = extract_month_year(query)
|
| 87 |
+
all_categories = df['category_&_subcategory_standardized'].unique()
|
| 88 |
+
detected_category = extract_category_from_query(query, all_categories)
|
| 89 |
|
| 90 |
+
df_filtered = df.copy()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
+
if detected_category:
|
| 93 |
+
df_filtered = df_filtered[df_filtered['category_&_subcategory_standardized'] == detected_category]
|
| 94 |
+
st.info(f"Filtering by category: **{detected_category}**")
|
|
|
|
| 95 |
|
| 96 |
+
if year:
|
| 97 |
+
df_filtered = df_filtered[df_filtered['status_date'].dt.year == year]
|
| 98 |
+
if month:
|
| 99 |
+
df_filtered = df_filtered[df_filtered['status_date'].dt.month == month]
|
| 100 |
+
st.info(f"Filtering by time: **{datetime(year, month, 1).strftime('%B %Y')}**")
|
| 101 |
+
else:
|
| 102 |
+
st.info(f"Filtering by year: **{year}**")
|
| 103 |
|
|
|
|
| 104 |
if df_filtered.empty:
|
| 105 |
+
st.warning("No matching records found for your query.")
|
| 106 |
else:
|
| 107 |
+
embeddings = compute_embeddings(df_filtered['summary_insight'].tolist(), _model=embed_model)
|
| 108 |
results = semantic_search(query, embeddings, embed_model, threshold=0.4)
|
| 109 |
|
| 110 |
if not results:
|
| 111 |
+
st.warning("No relevant summaries found.")
|
| 112 |
else:
|
| 113 |
+
st.subheader("π Top Matching Insights:")
|
| 114 |
top_texts = []
|
|
|
|
| 115 |
for idx, score in sorted(results, key=lambda x: x[1], reverse=True)[:5]:
|
| 116 |
row = df_filtered.iloc[idx]
|
| 117 |
+
st.markdown(f"**ποΈ Date:** {row['status_date'].date()} | **Score:** {score:.2f}")
|
| 118 |
+
st.markdown(f"**Title:** {row['title']}")
|
| 119 |
+
st.markdown(f"**Category:** {row['category_&_subcategory_standardized']} | **Goal:** {row['legislative_goal_standardized']}")
|
| 120 |
+
st.markdown(f"**Intent:** {row['intent_standardized']} | **Stance:** {row['stance_standardized']}")
|
| 121 |
st.markdown(f"```{row['summary_insight'][:500]}```")
|
| 122 |
top_texts.append(row['summary_insight'])
|
| 123 |
|