Spaces:

Legislation
/

RAG

Runtime error

App Files Files Community

tjl8 committed on Jul 8, 2025

Commit

cfce783

verified ·

1 Parent(s): 7c62b1c

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -73

app.py CHANGED Viewed

@@ -33,94 +33,99 @@ def compute_embeddings(texts, _model):
 # Semantic search
 def semantic_search(query, embeddings, model, threshold=0.4):
     query_embedding = model.encode([query])
-    similarities = cosine_similarity(query_embedding, embeddings)[0]
-    return [(i, score) for i, score in enumerate(similarities) if score > threshold]
 # RAG summarization
-def rag_summarize(group_texts, summarizer, top_k=5):
-    if not group_texts:
         return "No relevant content to summarize."
-    vectorizer = TfidfVectorizer()
-    tfidf_matrix = vectorizer.fit_transform(group_texts)
-    mean_vector = tfidf_matrix.mean(axis=0).A
-    sim_scores = cosine_similarity(mean_vector, tfidf_matrix).flatten()
-    top_indices = sim_scores.argsort()[::-1][:top_k]
-    context = "\n".join([group_texts[i] for i in top_indices])
-    prompt = "summarize: " + context[:1024]
-    result = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
-    return result[0]['summary_text']
-# Extract month/year from query
-def extract_month_year(query):
-    month_map = {
-        "january": 1, "february": 2, "march": 3, "april": 4,
-        "may": 5, "june": 6, "july": 7, "august": 8,
-        "september": 9, "october": 10, "november": 11, "december": 12
-    }
-    query_lower = query.lower()
-    month = next((month_map[m] for m in month_map if m in query_lower), None)
-    year_match = re.search(r"(19|20)\d{2}", query)
-    year = int(year_match.group()) if year_match else None
-    return month, year
-# Extract category from query using simple keyword match
-def extract_category_from_query(query, categories):
-    query = query.lower()
-    for cat in categories:
-        if pd.isna(cat):
-            continue
-        if any(word in query for word in cat.lower().split()):
             return cat
     return None
-# UI
-st.set_page_config(page_title="Illinois Legislative Q&A", layout="wide")
-st.title("📚 Illinois Legislative Trends Q&A")
 df = load_data()
 embed_model, summarizer = load_models()
-query = st.text_input("🔍 Ask a question about a topic, category, or time period (e.g., education in May 2024)")
 if query:
-    month, year = extract_month_year(query)
-    all_categories = df['category_&_subcategory_standardized'].unique()
-    detected_category = extract_category_from_query(query, all_categories)
-    df_filtered = df.copy()
-    if detected_category:
-        df_filtered = df_filtered[df_filtered['category_&_subcategory_standardized'] == detected_category]
-        st.info(f"Filtering by category: **{detected_category}**")
-    if year:
-        df_filtered = df_filtered[df_filtered['status_date'].dt.year == year]
-        if month:
-            df_filtered = df_filtered[df_filtered['status_date'].dt.month == month]
-            st.info(f"Filtering by time: **{datetime(year, month, 1).strftime('%B %Y')}**")
         else:
-            st.info(f"Filtering by year: **{year}**")
-    if df_filtered.empty:
-        st.warning("No matching records found for your query.")
     else:
-        embeddings = compute_embeddings(df_filtered['summary_insight'].tolist(), _model=embed_model)
-        results = semantic_search(query, embeddings, embed_model, threshold=0.4)
-        if not results:
-            st.warning("No relevant summaries found.")
         else:
-            st.subheader("📘 Top Matching Insights:")
-            top_texts = []
-            for idx, score in sorted(results, key=lambda x: x[1], reverse=True)[:5]:
-                row = df_filtered.iloc[idx]
-                st.markdown(f"**🗓️ Date:** {row['status_date'].date()} | **Score:** {score:.2f}")
-                st.markdown(f"**Title:** {row['title']}")
-                st.markdown(f"**Category:** {row['category_&_subcategory_standardized']} | **Goal:** {row['legislative_goal_standardized']}")
-                st.markdown(f"**Intent:** {row['intent_standardized']} | **Stance:** {row['stance_standardized']}")
-                st.markdown(f"```{row['summary_insight'][:500]}```")
-                top_texts.append(row['summary_insight'])
-            st.subheader("🧠 RAG-Generated Summary:")
-            summary = rag_summarize(top_texts, summarizer, top_k=5)
             st.success(summary)

 # Semantic search
 def semantic_search(query, embeddings, model, threshold=0.4):
     """Return ``(index, score)`` pairs for rows of *embeddings* similar to *query*.

     The query is encoded with ``model.encode`` and compared against every
     row of *embeddings* using cosine similarity. Only rows scoring strictly
     above *threshold* are returned, in their original row order.
     """
     encoded_query = model.encode([query])
     # cosine_similarity yields one row per query; there is a single query here.
     row_scores = cosine_similarity(encoded_query, embeddings)[0]
     hits = []
     for position, value in enumerate(row_scores):
         if value > threshold:
             hits.append((position, value))
     return hits
 # RAG summarization
def rag_summarize(texts, summarizer, top_k=5):
    """Summarize the most representative entries of *texts* with *summarizer*.

    The texts are ranked by TF-IDF cosine similarity to their mean vector,
    the *top_k* best are joined with newlines, and the first 1024 characters
    of that context are passed to *summarizer* behind a ``"summarize: "``
    prefix.

    Args:
        texts: Candidate passages. Entries that are not non-blank strings
            (for example NaN or None from missing cells) are ignored.
        summarizer: Callable invoked as
            ``summarizer(prompt, max_length=60, min_length=30, do_sample=False)``
            and expected to return a sequence whose first item has a
            ``'summary_text'`` key.
        top_k: Maximum number of passages placed in the context.

    Returns:
        The generated summary text, or a fixed fallback message when there
        is nothing usable to summarize.
    """
    if not texts:
        return "No relevant content to summarize."
    # Missing cells would make the vectorizer raise, so keep only real text.
    docs = [t for t in texts if isinstance(t, str) and t.strip()]
    if not docs:
        return "No relevant content to summarize."
    try:
        m = TfidfVectorizer().fit_transform(docs)
    except ValueError:
        # Raised for an empty vocabulary (e.g. only stop words or one-letter
        # tokens); there is nothing to rank on, so keep the incoming order.
        chosen = docs[:top_k]
    else:
        import numpy as np

        # np.asarray handles both the np.matrix returned by sparse-matrix
        # mean() and a plain ndarray, unlike the matrix-only ``.A``.
        mean_vec = np.asarray(m.mean(axis=0))
        scores = cosine_similarity(mean_vec, m).flatten()
        top_indices = scores.argsort()[::-1][:top_k]
        chosen = [docs[i] for i in top_indices]
    ctx = "\n".join(chosen)
    prompt = "summarize: " + ctx[:1024]
    out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
    return out[0]['summary_text']
+# Parse month/year
def extract_month_year(q):
    """Extract a month number and a four-digit year from free-text *q*.

    Month names are matched as whole words, case-insensitively, so "maybe"
    or "marching" no longer count as May or March. When several months are
    mentioned, the earliest in calendar order wins. The year must be a
    standalone run of exactly four digits starting with 19 or 20, so a
    longer number such as "12024" is not read as 2024.

    Args:
        q: The user's query string.

    Returns:
        A ``(month, year)`` tuple; month is 1-12 or None, year is an int or
        None.
    """
    months = ("january", "february", "march", "april", "may", "june",
              "july", "august", "september", "october", "november", "december")
    # Split on non-letters so "May2024" still yields the word "may".
    words = set(re.findall(r"[a-z]+", q.lower()))
    mon = next((i for i, name in enumerate(months, 1) if name in words), None)
    # Digit lookarounds reject matches embedded in longer digit runs.
    ym = re.search(r"(?<!\d)(?:19|20)\d{2}(?!\d)", q)
    yr = int(ym.group()) if ym else None
    return mon, yr
+# Auto-detect category
def extract_category(q, cats):
    """Return the first category in *cats* whose keywords appear in *q*.

    Both the query and each category are split into lowercase alphanumeric
    words. Punctuation such as "&" and common connecting words are ignored,
    so they cannot make an unrelated category match. A keyword of three or
    more characters matches any query word that starts with it (so "tax"
    matches "taxes"); shorter keywords must match a query word exactly.

    Args:
        q: The user's query string.
        cats: Iterable of category labels; non-string entries such as NaN
            or None are skipped.

    Returns:
        The matching category label exactly as given, or None when no
        category matches.
    """
    stopwords = {"a", "an", "and", "by", "for", "in", "of", "on", "or",
                 "the", "to", "with"}
    query_words = set(re.findall(r"[a-z0-9]+", q.lower()))
    for cat in cats:
        if not isinstance(cat, str):
            continue
        keywords = [tok for tok in re.findall(r"[a-z0-9]+", cat.lower())
                    if tok not in stopwords]
        for tok in keywords:
            if tok in query_words:
                return cat
            # Prefix matching covers plurals and inflections, but only for
            # keywords long enough not to match by accident.
            if len(tok) >= 3 and any(w.startswith(tok) for w in query_words):
                return cat
    return None
# Streamlit UI
# Flow: read a free-text question, narrow the data by any category and
# year/month detected in it, run semantic search over the remaining
# summaries, show the top five hits, then summarize them.
st.set_page_config(page_title="IL Trends Q&A", layout="wide")
st.title("Illinois Legislative Trends Q&A")
df = load_data()
embed_model, summarizer = load_models()
query = st.text_input("Ask a question (e.g., ‘education in May 2024’):")
if query:
    mon, yr = extract_month_year(query)
    cats = df['category_&_subcategory_standardized'].unique()
    cat = extract_category(query, cats)
    df2 = df.copy()
    if cat:
        df2 = df2[df2['category_&_subcategory_standardized'] == cat]
        st.info(f"🔎 Filtering by category: **{cat}**")
    # A month is only applied together with a year; a month on its own is
    # ignored.
    if yr:
        df2 = df2[df2['status_date'].dt.year == yr]
        if mon:
            df2 = df2[df2['status_date'].dt.month == mon]
            st.info(f"🔎 Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
        else:
            st.info(f"🔎 Filtering by year: **{yr}**")
    if df2.empty:
        st.warning("No matching records found.")
    else:
        texts = df2['summary_insight'].tolist()
        embs = compute_embeddings(texts, _model=embed_model)
        res = semantic_search(query, embs, embed_model)
        if not res:
            st.warning("No relevant insights found.")
        else:
            st.subheader("Top Matching Insights")
            collected = []
            # Indices from semantic_search are positions within df2, hence
            # iloc rather than label-based lookup.
            for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:5]:
                row = df2.iloc[idx]
                date = row['status_date'].date()
                cat_std = row['category_&_subcategory_standardized']
                goal = row['legislative_goal_standardized']
                intent = row['intent_standardized']
                stance = row['stance_standardized']
                raw_trend = row['llama_trend_summary']
                # A missing cell is a float NaN with no .strip(); guard so
                # one empty row cannot crash the whole page.
                trend_summary = raw_trend.strip() if isinstance(raw_trend, str) else "N/A"
                st.markdown(f"- **Date:** {date} | **Score:** {score:.2f}")
                st.markdown(f"  - **Category:** {cat_std}")
                st.markdown(f"  - **Goal:** {goal}")
                st.markdown(f"  - **Intent:** {intent} | **Stance:** {stance}")
                st.markdown(f"  > **Trend Summary:** {trend_summary}")
                collected.append(row['summary_insight'])
            st.subheader(" RAG-Generated Summary")
            summary = rag_summarize(collected, summarizer)
            st.success(summary)