Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,7 +7,7 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
| 7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 8 |
from datetime import datetime
|
| 9 |
|
| 10 |
-
|
| 11 |
def clean_text(text):
|
| 12 |
text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
|
| 13 |
text = re.sub(r"(?i)let me know if you'd like.*", "", text)
|
|
@@ -15,7 +15,6 @@ def clean_text(text):
|
|
| 15 |
text = re.sub(r"(?i)actionable insight[:]*", "", text)
|
| 16 |
return text.strip()
|
| 17 |
|
| 18 |
-
# Load and preprocess dataset
|
| 19 |
@st.cache_data
|
| 20 |
def load_data():
|
| 21 |
df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
|
|
@@ -26,25 +25,21 @@ def load_data():
|
|
| 26 |
df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
|
| 27 |
return df
|
| 28 |
|
| 29 |
-
# Load models (embedding and summarization)
|
| 30 |
@st.cache_resource
def load_models():
    """Load and cache the embedding and summarization models.

    Returns:
        tuple: (SentenceTransformer embedder, transformers summarization
        pipeline). Cached with st.cache_resource so the heavyweight models
        are instantiated only once per server process.
    """
    sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')
    t5_summarizer = pipeline(
        "summarization", model="t5-small", tokenizer="t5-small"
    )
    return sentence_encoder, t5_summarizer
|
| 35 |
|
| 36 |
-
# Embed documents for semantic search
|
| 37 |
@st.cache_data
def compute_embeddings(texts, _model):
    """Encode *texts* into dense vectors, memoized by Streamlit.

    The leading underscore on ``_model`` tells st.cache_data to skip
    hashing the model object when building the cache key.
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors
|
| 40 |
|
| 41 |
-
# Semantic search using cosine similarity
|
| 42 |
def semantic_search(query, embeddings, model, threshold=0.5):
    """Return (index, score) pairs for documents similar to *query*.

    Args:
        query: Free-text user question.
        embeddings: Precomputed document embedding matrix.
        model: Embedding model exposing an ``encode`` method.
        threshold: Minimum cosine similarity for a document to count as a hit.

    Returns:
        list[tuple[int, float]]: positions into the corpus and their
        similarity scores, strictly above ``threshold``.
    """
    encoded_query = model.encode([query])
    scores = cosine_similarity(encoded_query, embeddings)[0]
    hits = []
    for idx, score in enumerate(scores):
        if score > threshold:
            hits.append((idx, score))
    return hits
|
| 46 |
|
| 47 |
-
# RAG-style summarization using TF-IDF weighted content
|
| 48 |
def rag_summarize(texts, summarizer, top_k=5):
|
| 49 |
if not texts:
|
| 50 |
return "No relevant content to summarize."
|
|
@@ -58,7 +53,6 @@ def rag_summarize(texts, summarizer, top_k=5):
|
|
| 58 |
out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
|
| 59 |
return out[0]['summary_text']
|
| 60 |
|
| 61 |
-
# Extract month and year from query
|
| 62 |
def extract_month_year(q):
|
| 63 |
month_map = {m: i for i, m in enumerate(
|
| 64 |
["january", "february", "march", "april", "may", "june",
|
|
@@ -69,7 +63,6 @@ def extract_month_year(q):
|
|
| 69 |
yr = int(ym.group()) if ym else None
|
| 70 |
return mon, yr
|
| 71 |
|
| 72 |
-
# Match query with goal/category/intent/impact areas
|
| 73 |
def extract_topic_match(query, df):
|
| 74 |
query_lower = query.lower()
|
| 75 |
return df[
|
|
@@ -79,16 +72,13 @@ def extract_topic_match(query, df):
|
|
| 79 |
df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
|
| 80 |
]
|
| 81 |
|
| 82 |
-
# UI Configuration
|
| 83 |
st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
|
| 84 |
st.title("Illinois Legislative Trends Q&A")
|
| 85 |
st.markdown("Ask about trends in topics like higher education, funding, etc.")
|
| 86 |
|
| 87 |
-
# Load everything
|
| 88 |
df = load_data()
|
| 89 |
embed_model, summarizer = load_models()
|
| 90 |
|
| 91 |
-
# Input from user
|
| 92 |
query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
|
| 93 |
|
| 94 |
if query:
|
|
@@ -128,7 +118,6 @@ if query:
|
|
| 128 |
intent = row['intent_standardized']
|
| 129 |
stance = row['stance_standardized']
|
| 130 |
|
| 131 |
-
# Cleaned text output
|
| 132 |
trend = clean_text(row['llama_trend_summary'])
|
| 133 |
insight = clean_text(row['llama_insight'])
|
| 134 |
|
|
@@ -136,14 +125,13 @@ if query:
|
|
| 136 |
st.markdown(f"**Category:** {cat_std}")
|
| 137 |
st.markdown(f"**Goal:** {goal}")
|
| 138 |
st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
|
| 139 |
-
st.markdown(f"
|
| 140 |
-
st.markdown(f"
|
| 141 |
st.markdown(f"[View Full Bill Text]({full_url})\n")
|
| 142 |
st.divider()
|
| 143 |
|
| 144 |
collected.append(row['summary_insight'])
|
| 145 |
|
| 146 |
-
# Show generated summary
|
| 147 |
st.subheader("RAG-Generated Overall Summary")
|
| 148 |
summary = rag_summarize(collected, summarizer)
|
| 149 |
st.success(summary)
|
|
|
|
| 7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 8 |
from datetime import datetime
|
| 9 |
|
| 10 |
+
|
| 11 |
def clean_text(text):
|
| 12 |
text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
|
| 13 |
text = re.sub(r"(?i)let me know if you'd like.*", "", text)
|
|
|
|
| 15 |
text = re.sub(r"(?i)actionable insight[:]*", "", text)
|
| 16 |
return text.strip()
|
| 17 |
|
|
|
|
| 18 |
@st.cache_data
|
| 19 |
def load_data():
|
| 20 |
df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
|
|
|
|
| 25 |
df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
|
| 26 |
return df
|
| 27 |
|
|
|
|
| 28 |
@st.cache_resource
def load_models():
    """Load and cache the embedding and summarization models.

    Returns:
        tuple: (SentenceTransformer embedder, transformers summarization
        pipeline). Cached with st.cache_resource so the heavyweight models
        are instantiated only once per server process.
    """
    sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')
    t5_summarizer = pipeline(
        "summarization", model="t5-small", tokenizer="t5-small"
    )
    return sentence_encoder, t5_summarizer
|
| 33 |
|
|
|
|
| 34 |
@st.cache_data
def compute_embeddings(texts, _model):
    """Encode *texts* into dense vectors, memoized by Streamlit.

    The leading underscore on ``_model`` tells st.cache_data to skip
    hashing the model object when building the cache key.
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors
|
| 37 |
|
|
|
|
| 38 |
def semantic_search(query, embeddings, model, threshold=0.5):
    """Return (index, score) pairs for documents similar to *query*.

    Args:
        query: Free-text user question.
        embeddings: Precomputed document embedding matrix.
        model: Embedding model exposing an ``encode`` method.
        threshold: Minimum cosine similarity for a document to count as a hit.

    Returns:
        list[tuple[int, float]]: positions into the corpus and their
        similarity scores, strictly above ``threshold``.
    """
    encoded_query = model.encode([query])
    scores = cosine_similarity(encoded_query, embeddings)[0]
    hits = []
    for idx, score in enumerate(scores):
        if score > threshold:
            hits.append((idx, score))
    return hits
|
| 42 |
|
|
|
|
| 43 |
def rag_summarize(texts, summarizer, top_k=5):
|
| 44 |
if not texts:
|
| 45 |
return "No relevant content to summarize."
|
|
|
|
| 53 |
out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
|
| 54 |
return out[0]['summary_text']
|
| 55 |
|
|
|
|
| 56 |
def extract_month_year(q):
|
| 57 |
month_map = {m: i for i, m in enumerate(
|
| 58 |
["january", "february", "march", "april", "may", "june",
|
|
|
|
| 63 |
yr = int(ym.group()) if ym else None
|
| 64 |
return mon, yr
|
| 65 |
|
|
|
|
| 66 |
def extract_topic_match(query, df):
|
| 67 |
query_lower = query.lower()
|
| 68 |
return df[
|
|
|
|
| 72 |
df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
|
| 73 |
]
|
| 74 |
|
|
|
|
| 75 |
st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
|
| 76 |
st.title("Illinois Legislative Trends Q&A")
|
| 77 |
st.markdown("Ask about trends in topics like higher education, funding, etc.")
|
| 78 |
|
|
|
|
| 79 |
df = load_data()
|
| 80 |
embed_model, summarizer = load_models()
|
| 81 |
|
|
|
|
| 82 |
query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
|
| 83 |
|
| 84 |
if query:
|
|
|
|
| 118 |
intent = row['intent_standardized']
|
| 119 |
stance = row['stance_standardized']
|
| 120 |
|
|
|
|
| 121 |
trend = clean_text(row['llama_trend_summary'])
|
| 122 |
insight = clean_text(row['llama_insight'])
|
| 123 |
|
|
|
|
| 125 |
st.markdown(f"**Category:** {cat_std}")
|
| 126 |
st.markdown(f"**Goal:** {goal}")
|
| 127 |
st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
|
| 128 |
+
st.markdown(f"Trend Summary:{trend}")
|
| 129 |
+
st.markdown(f"Actionable Insight:{insight}")
|
| 130 |
st.markdown(f"[View Full Bill Text]({full_url})\n")
|
| 131 |
st.divider()
|
| 132 |
|
| 133 |
collected.append(row['summary_insight'])
|
| 134 |
|
|
|
|
| 135 |
st.subheader("RAG-Generated Overall Summary")
|
| 136 |
summary = rag_summarize(collected, summarizer)
|
| 137 |
st.success(summary)
|