tjl8 commited on
Commit
65e5177
·
verified ·
1 Parent(s): a64bf13

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +187 -187
app.py CHANGED
@@ -674,194 +674,194 @@
674
 
675
 
676
  #BART
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
677
 
678
- import streamlit as st
679
- import pandas as pd
680
- import re
681
- from sentence_transformers import SentenceTransformer
682
- from transformers import pipeline
683
- from sklearn.metrics.pairwise import cosine_similarity
684
- from sklearn.feature_extraction.text import TfidfVectorizer
685
- from datetime import datetime
686
-
687
- def clean_text(text):
688
- text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
689
- text = re.sub(r"(?i)let me know if you'd like.*", "", text)
690
- text = re.sub(r"(?i)trend summary[:]*", "", text)
691
- text = re.sub(r"(?i)actionable insight[:]*", "", text)
692
- return text.strip()
693
-
694
- @st.cache_data
695
- def load_data():
696
- df = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv")
697
- df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
698
- df = df.dropna(subset=['status_date'])
699
-
700
- for col in ["Legislative Goal", "Policy Impact Areas", "Key Provisions",
701
- "Intended Beneficiaries", "Potential Impact", "description"]:
702
- df[col] = df[col].fillna("")
703
-
704
- df["combined_text"] = (
705
- "Legislative Goal: " + df["Legislative Goal"] + "\n" +
706
- "Policy Impact Areas: " + df["Policy Impact Areas"] + "\n" +
707
- "Key Provisions: " + df["Key Provisions"] + "\n" +
708
- "Intended Beneficiaries: " + df["Intended Beneficiaries"] + "\n" +
709
- "Potential Impact: " + df["Potential Impact"] + "\n" +
710
- "Description: " + df["description"]
711
- )
712
-
713
- return df
714
-
715
- @st.cache_resource
716
- def load_models():
717
- embed_model = SentenceTransformer('all-MiniLM-L6-v2')
718
- # Changed summarization model to facebook/bart-large-cnn for better summary quality
719
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
720
- return embed_model, summarizer
721
-
722
- @st.cache_data
723
- def compute_embeddings(texts, _model):
724
- return _model.encode(texts, show_progress_bar=True)
725
-
726
- def semantic_search(query, embeddings, model, threshold=0.5):
727
- query_embedding = model.encode([query])
728
- sims = cosine_similarity(query_embedding, embeddings)[0]
729
- return [(i, s) for i, s in enumerate(sims) if s > threshold]
730
-
731
- def rag_summarize(texts, summarizer, top_k=5):
732
- if not texts:
733
- return "No relevant content to summarize."
734
- vect = TfidfVectorizer()
735
- m = vect.fit_transform(texts)
736
- mean_vec = m.mean(axis=0).A
737
- scores = cosine_similarity(mean_vec, m).flatten()
738
- top_indices = scores.argsort()[::-1][:top_k]
739
- ctx = "\n".join(texts[i] for i in top_indices)
740
- prompt = "summarize: " + ctx[:1024]
741
- out = summarizer(prompt, max_length=200, min_length=80, do_sample=False)
742
- return out[0]['summary_text']
743
-
744
- def extract_month_year(q):
745
- month_map = {m: i for i, m in enumerate(
746
- ["january", "february", "march", "april", "may", "june",
747
- "july", "august", "september", "october", "november", "december"], 1)}
748
- ql = q.lower()
749
- mon = next((v for k, v in month_map.items() if k in ql), None)
750
- ym = re.search(r"(19|20)\d{2}", q)
751
- yr = int(ym.group()) if ym else None
752
- return mon, yr
753
-
754
- def extract_date_range(query):
755
- month_map = {
756
- "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
757
- "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
758
- }
759
-
760
- patterns = [
761
- r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})",
762
- ]
763
-
764
- for pattern in patterns:
765
- match = re.search(pattern, query)
766
- if match:
767
- start_month_str, start_year = match.group(1).lower(), int(match.group(2))
768
- end_month_str, end_year = match.group(3).lower(), int(match.group(4))
769
-
770
- start_month = month_map.get(start_month_str)
771
- end_month = month_map.get(end_month_str)
772
-
773
- if start_month and end_month:
774
- start_date = datetime(start_year, start_month, 1)
775
- end_date = datetime(end_year, end_month, 28)
776
- return start_date, end_date
777
-
778
- return None, None
779
-
780
-
781
- def extract_topic_match(query, df):
782
- query_lower = query.lower()
783
- return df[
784
- df['Category & Subcategory'].fillna('').str.lower().str.contains(query_lower) |
785
- df['Intent'].fillna('').str.lower().str.contains(query_lower) |
786
- df['Legislative Goal'].fillna('').str.lower().str.contains(query_lower) |
787
- df['Policy Impact Areas'].fillna('').str.lower().str.contains(query_lower) |
788
- df['Key Provisions'].fillna('').str.lower().str.contains(query_lower) |
789
- df['Potential Impact'].fillna('').str.lower().str.contains(query_lower)
790
- ]
791
-
792
-
793
- st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
794
- st.title("Illinois Legislative Trends Q&A")
795
- st.markdown("Ask about trends in topics like higher education, funding, etc.")
796
-
797
- df = load_data()
798
- embed_model, summarizer = load_models()
799
-
800
- query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")
801
-
802
- if query:
803
- start_date, end_date = extract_date_range(query)
804
- df2 = extract_topic_match(query, df)
805
-
806
- if df2.empty:
807
- df2 = df
808
-
809
- if start_date and end_date:
810
- df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
811
- st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
812
- else:
813
- mon, yr = extract_month_year(query)
814
- if yr:
815
- df2 = df2[df2['status_date'].dt.year == yr]
816
- if mon:
817
- df2 = df2[df2['status_date'].dt.month == mon]
818
- st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
819
- else:
820
- st.info(f"Filtering by year: **{yr}**")
821
-
822
- if df2.empty:
823
- st.warning("No matching records found.")
824
- else:
825
- texts = df2['combined_text'].tolist()
826
- embs = compute_embeddings(texts, _model=embed_model)
827
- res = semantic_search(query, embs, embed_model, threshold=0.5)
828
-
829
- if not res:
830
- st.warning("No relevant insights found.")
831
- else:
832
- st.subheader("Top Matching Insights")
833
- collected = []
834
-
835
- for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
836
- row = df2.iloc[idx]
837
- date = row['status_date'].date()
838
- bill_number = row['bill_number']
839
- full_url = row['url']
840
- cat = row.get('Category & Subcategory', '')
841
- bene = row.get('Intended Beneficiaries', '')
842
- goal = row.get('Legislative Goal', '')
843
- impact = row.get('Policy Impact Areas', '')
844
- provision = row.get('Key Provisions', '')
845
- intent = row.get('Intent', '')
846
- stance = row.get('Stance', '')
847
- description = row.get('description', '')
848
-
849
- st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
850
- st.markdown(f"**Category:** {cat}")
851
- st.markdown(f"**Intended Beneficiaries:** {bene}")
852
- st.markdown(f"**Goal:** {goal}")
853
- st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
854
- st.markdown(f"**Policy Impact Area:** {impact}")
855
- st.markdown(f"**Key Provision:** {provision}")
856
- st.markdown(f"**Description:** {description}")
857
- st.markdown(f"[View Full Bill Text]({full_url})\n")
858
- st.divider()
859
-
860
- collected.append(row['combined_text'])
861
-
862
- st.subheader("RAG-Generated Overall Summary")
863
- summary = rag_summarize(collected, summarizer)
864
- st.success(summary)
865
 
866
 
867
 
 
674
 
675
 
676
  #BART
677
+ import streamlit as st
678
+ import pandas as pd
679
+ import re
680
+ from sentence_transformers import SentenceTransformer
681
+ from transformers import pipeline
682
+ from sklearn.metrics.pairwise import cosine_similarity
683
+ from sklearn.feature_extraction.text import TfidfVectorizer
684
+ from datetime import datetime
685
+
686
def clean_text(text):
    """Strip boilerplate phrases that the summarizer tends to prepend/append."""
    boilerplate = [
        r"(?i)(here is|here are) the requested output[s]*[:]*",
        r"(?i)let me know if you'd like.*",
        r"(?i)trend summary[:]*",
        r"(?i)actionable insight[:]*",
    ]
    # Apply the removals in order, then trim the leftover whitespace.
    for pattern in boilerplate:
        text = re.sub(pattern, "", text)
    return text.strip()
692
+
693
@st.cache_data
def load_data():
    """Load the bills CSV, parse status dates, and build a combined text blob per bill."""
    df = pd.read_csv("Illinois_Education_Bills_Summarized_With Features_2021_2025_07182025.csv")

    # Dates arrive as day-month-year; rows whose date fails to parse are discarded.
    df['status_date'] = pd.to_datetime(df['status_date'], format='%d-%m-%Y', errors='coerce')
    df = df.dropna(subset=['status_date'])

    # (display label, column name) pairs, in the order they appear in combined_text.
    sections = [
        ("Legislative Goal", "Legislative Goal"),
        ("Policy Impact Areas", "Policy Impact Areas"),
        ("Key Provisions", "Key Provisions"),
        ("Intended Beneficiaries", "Intended Beneficiaries"),
        ("Potential Impact", "Potential Impact"),
        ("Description", "description"),
    ]

    # Blank out missing values so the string concatenation below never sees NaN.
    for _, col in sections:
        df[col] = df[col].fillna("")

    # One labelled, newline-separated blob per bill (no trailing newline),
    # used later for both embeddings and TF-IDF ranking.
    combined = None
    for label, col in sections:
        piece = label + ": " + df[col]
        combined = piece if combined is None else combined + "\n" + piece
    df["combined_text"] = combined

    return df
713
+
714
@st.cache_resource
def load_models():
    """Instantiate the sentence embedder and the BART summarization pipeline.

    Cached as a resource so the (large) models are loaded once per process.
    """
    # Summarization model is facebook/bart-large-cnn, chosen for summary quality.
    bart = "facebook/bart-large-cnn"
    summarizer = pipeline("summarization", model=bart, tokenizer=bart)
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    return embed_model, summarizer
720
+
721
@st.cache_data
def compute_embeddings(texts, _model):
    """Return sentence embeddings for *texts*.

    The leading underscore on ``_model`` tells Streamlit's cache to skip
    hashing the (unhashable) model object.
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors
724
+
725
def semantic_search(query, embeddings, model, threshold=0.5):
    """Return (row index, similarity) pairs whose cosine score exceeds *threshold*."""
    scores = cosine_similarity(model.encode([query]), embeddings)[0]
    hits = []
    for idx, score in enumerate(scores):
        if score > threshold:
            hits.append((idx, score))
    return hits
729
+
730
def rag_summarize(texts, summarizer, top_k=5):
    """Summarize the texts most representative of the collection.

    Ranks every text by TF-IDF cosine similarity to the corpus centroid,
    joins the *top_k* most central ones, and feeds a 1024-character prefix
    of that context to the summarization pipeline.
    """
    if not texts:
        return "No relevant content to summarize."
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(texts)
    # Dense corpus centroid; .A converts the numpy matrix to an ndarray.
    centroid = matrix.mean(axis=0).A
    similarity = cosine_similarity(centroid, matrix).flatten()
    ranked = similarity.argsort()[::-1]
    context = "\n".join(texts[i] for i in ranked[:top_k])
    prompt = "summarize: " + context[:1024]
    result = summarizer(prompt, max_length=200, min_length=80, do_sample=False)
    return result[0]['summary_text']
742
+
743
def extract_month_year(q):
    """Pull a (month, year) pair out of free text.

    Month names must appear as whole words, so e.g. "mayor" no longer
    registers as May (the old substring test did). Returns ``(None, None)``
    components when a month and/or a 19xx/20xx year is absent.
    """
    months = ["january", "february", "march", "april", "may", "june",
              "july", "august", "september", "october", "november", "december"]
    ql = q.lower()
    mon = None
    for i, name in enumerate(months, 1):
        # \b guards prevent matching a month name embedded in a longer word.
        if re.search(r"\b" + name + r"\b", ql):
            mon = i
            break
    # Whole-word four-digit year, restricted to 19xx/20xx.
    ym = re.search(r"\b(19|20)\d{2}\b", q)
    yr = int(ym.group()) if ym else None
    return mon, yr
752
+
753
def extract_date_range(query):
    """Parse "from <month> <year> to <month> <year>" into (start, end) datetimes.

    Accepts full month names or prefixes of at least three letters
    ("Jan", "Sept", ...) — the old exact-name lookup failed on the app's own
    example query "Trends from Jan 2024 to May 2025". The end date is the
    true last day of the end month (the old hard-coded 28th silently
    excluded bills dated the 29th–31st). Returns (None, None) when no
    range is found.
    """
    import calendar

    month_names = ["january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november", "december"]

    def month_number(token):
        # Whole-name or >=3-letter-prefix match, e.g. "jan" -> 1; None if no month.
        token = token.lower()
        if len(token) < 3:
            return None
        for idx, name in enumerate(month_names, 1):
            if name.startswith(token):
                return idx
        return None

    pattern = (r"(?i)(?:from|between)?\s*([a-zA-Z]+)\s+(\d{4})"
               r"\s*(?:to|through|and|-)\s*([a-zA-Z]+)\s+(\d{4})")
    match = re.search(pattern, query)
    if match:
        start_month = month_number(match.group(1))
        start_year = int(match.group(2))
        end_month = month_number(match.group(3))
        end_year = int(match.group(4))

        if start_month and end_month:
            start_date = datetime(start_year, start_month, 1)
            # Last calendar day of the end month (handles 28/29/30/31 correctly).
            last_day = calendar.monthrange(end_year, end_month)[1]
            end_date = datetime(end_year, end_month, last_day)
            return start_date, end_date

    return None, None
778
+
779
+
780
def extract_topic_match(query, df):
    """Return rows whose descriptive columns contain the query text.

    The query is matched as a literal string (``regex=False``) so user
    input containing regex metacharacters such as "(" can no longer
    crash ``str.contains`` with ``re.error``.
    """
    needle = query.lower()
    searchable = ['Category & Subcategory', 'Intent', 'Legislative Goal',
                  'Policy Impact Areas', 'Key Provisions', 'Potential Impact']
    mask = None
    for col in searchable:
        col_mask = df[col].fillna('').str.lower().str.contains(needle, regex=False)
        mask = col_mask if mask is None else (mask | col_mask)
    return df[mask]
790
+
791
+
792
# ---------------------------------------------------------------------------
# Streamlit UI: query box -> optional date/topic filtering -> semantic search
# over the filtered bills -> per-bill cards -> one RAG summary of the hits.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
st.title("Illinois Legislative Trends Q&A")
st.markdown("Ask about trends in topics like higher education, funding, etc.")

df = load_data()
embed_model, summarizer = load_models()

query = st.text_input("Ask a question (e.g., ‘Trends from Jan 2024 to May 2025’):")

if query:
    # Prefer an explicit "from X to Y" range; otherwise fall back to a
    # single month/year mentioned in the query.
    start_date, end_date = extract_date_range(query)
    df2 = extract_topic_match(query, df)

    if df2.empty:
        # No topic keywords matched — search the whole corpus instead.
        df2 = df

    if start_date and end_date:
        df2 = df2[(df2['status_date'] >= start_date) & (df2['status_date'] <= end_date)]
        st.info(f"Filtering between: **{start_date:%B %Y}** and **{end_date:%B %Y}**")
    else:
        mon, yr = extract_month_year(query)
        if yr:
            df2 = df2[df2['status_date'].dt.year == yr]
            if mon:
                df2 = df2[df2['status_date'].dt.month == mon]
                st.info(f"Filtering by date: **{datetime(yr, mon, 1):%B %Y}**")
            else:
                st.info(f"Filtering by year: **{yr}**")

    if df2.empty:
        st.warning("No matching records found.")
    else:
        # Embed only the filtered rows, then rank them against the query.
        texts = df2['combined_text'].tolist()
        embs = compute_embeddings(texts, _model=embed_model)
        res = semantic_search(query, embs, embed_model, threshold=0.5)

        if not res:
            st.warning("No relevant insights found.")
        else:
            st.subheader("Top Matching Insights")
            collected = []

            # Render the ten highest-scoring bills, best first.
            for idx, score in sorted(res, key=lambda x: x[1], reverse=True)[:10]:
                row = df2.iloc[idx]
                date = row['status_date'].date()
                bill_number = row['bill_number']
                full_url = row['url']
                cat = row.get('Category & Subcategory', '')
                bene = row.get('Intended Beneficiaries', '')
                goal = row.get('Legislative Goal', '')
                impact = row.get('Policy Impact Areas', '')
                provision = row.get('Key Provisions', '')
                intent = row.get('Intent', '')
                stance = row.get('Stance', '')
                description = row.get('description', '')

                st.markdown(f"**Date:** {date} | **Bill Number:** {bill_number} | **Score:** {score:.2f}")
                st.markdown(f"**Category:** {cat}")
                st.markdown(f"**Intended Beneficiaries:** {bene}")
                st.markdown(f"**Goal:** {goal}")
                st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
                st.markdown(f"**Policy Impact Area:** {impact}")
                st.markdown(f"**Key Provision:** {provision}")
                st.markdown(f"**Description:** {description}")
                st.markdown(f"[View Full Bill Text]({full_url})\n")
                st.divider()

                collected.append(row['combined_text'])

            st.subheader("RAG-Generated Overall Summary")
            summary = rag_summarize(collected, summarizer)
            st.success(summary)
864
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
865
 
866
 
867