Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,7 +7,7 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
| 7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 8 |
from datetime import datetime
|
| 9 |
|
| 10 |
-
|
| 11 |
def clean_text(text):
|
| 12 |
text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
|
| 13 |
text = re.sub(r"(?i)let me know if you'd like.*", "", text)
|
|
@@ -15,7 +15,6 @@ def clean_text(text):
|
|
| 15 |
text = re.sub(r"(?i)actionable insight[:]*", "", text)
|
| 16 |
return text.strip()
|
| 17 |
|
| 18 |
-
# Load and preprocess dataset
|
| 19 |
@st.cache_data
|
| 20 |
def load_data():
|
| 21 |
df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
|
|
@@ -26,25 +25,21 @@ def load_data():
|
|
| 26 |
df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
|
| 27 |
return df
|
| 28 |
|
| 29 |
-
# Load models (embedding and summarization)
|
| 30 |
@st.cache_resource
def load_models():
    """Load and cache the embedding and summarization models.

    Returns:
        tuple: (SentenceTransformer embedder, transformers summarization
        pipeline). Cached with st.cache_resource so the heavyweight models
        are instantiated only once per server process.
    """
    sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')
    t5_summarizer = pipeline(
        "summarization", model="t5-small", tokenizer="t5-small"
    )
    return sentence_encoder, t5_summarizer
|
| 35 |
|
| 36 |
-
# Embed documents for semantic search
|
| 37 |
@st.cache_data
def compute_embeddings(texts, _model):
    """Encode *texts* into dense vectors, memoized by Streamlit.

    The leading underscore on ``_model`` tells st.cache_data to skip
    hashing the model object when building the cache key.
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors
|
| 40 |
|
| 41 |
-
# Semantic search using cosine similarity
|
| 42 |
def semantic_search(query, embeddings, model, threshold=0.5):
    """Return (index, score) pairs for documents similar to *query*.

    Args:
        query: Free-text user question.
        embeddings: Precomputed document embedding matrix.
        model: Embedding model exposing an ``encode`` method.
        threshold: Minimum cosine similarity for a document to count as a hit.

    Returns:
        list[tuple[int, float]]: positions into the corpus and their
        similarity scores, strictly above ``threshold``.
    """
    encoded_query = model.encode([query])
    scores = cosine_similarity(encoded_query, embeddings)[0]
    hits = []
    for idx, score in enumerate(scores):
        if score > threshold:
            hits.append((idx, score))
    return hits
|
| 46 |
|
| 47 |
-
# RAG-style summarization using TF-IDF weighted content
|
| 48 |
def rag_summarize(texts, summarizer, top_k=5):
|
| 49 |
if not texts:
|
| 50 |
return "No relevant content to summarize."
|
|
@@ -58,7 +53,6 @@ def rag_summarize(texts, summarizer, top_k=5):
|
|
| 58 |
out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
|
| 59 |
return out[0]['summary_text']
|
| 60 |
|
| 61 |
-
# Extract month and year from query
|
| 62 |
def extract_month_year(q):
|
| 63 |
month_map = {m: i for i, m in enumerate(
|
| 64 |
["january", "february", "march", "april", "may", "june",
|
|
@@ -69,7 +63,6 @@ def extract_month_year(q):
|
|
| 69 |
yr = int(ym.group()) if ym else None
|
| 70 |
return mon, yr
|
| 71 |
|
| 72 |
-
# Match query with goal/category/intent/impact areas
|
| 73 |
def extract_topic_match(query, df):
|
| 74 |
query_lower = query.lower()
|
| 75 |
return df[
|
|
@@ -79,16 +72,13 @@ def extract_topic_match(query, df):
|
|
| 79 |
df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
|
| 80 |
]
|
| 81 |
|
| 82 |
-
# UI Configuration
|
| 83 |
st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
|
| 84 |
st.title("Illinois Legislative Trends Q&A")
|
| 85 |
st.markdown("Ask about trends in topics like higher education, funding, etc.")
|
| 86 |
|
| 87 |
-
# Load everything
|
| 88 |
df = load_data()
|
| 89 |
embed_model, summarizer = load_models()
|
| 90 |
|
| 91 |
-
# Input from user
|
| 92 |
query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
|
| 93 |
|
| 94 |
if query:
|
|
@@ -128,7 +118,6 @@ if query:
|
|
| 128 |
intent = row['intent_standardized']
|
| 129 |
stance = row['stance_standardized']
|
| 130 |
|
| 131 |
-
# Cleaned text output
|
| 132 |
trend = clean_text(row['llama_trend_summary'])
|
| 133 |
insight = clean_text(row['llama_insight'])
|
| 134 |
|
|
@@ -136,14 +125,13 @@ if query:
|
|
| 136 |
st.markdown(f"**Category:** {cat_std}")
|
| 137 |
st.markdown(f"**Goal:** {goal}")
|
| 138 |
st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
|
| 139 |
-
st.markdown(f"
|
| 140 |
-
st.markdown(f"
|
| 141 |
st.markdown(f"[View Full Bill Text]({full_url})\n")
|
| 142 |
st.divider()
|
| 143 |
|
| 144 |
collected.append(row['summary_insight'])
|
| 145 |
|
| 146 |
-
# Show generated summary
|
| 147 |
st.subheader("RAG-Generated Overall Summary")
|
| 148 |
summary = rag_summarize(collected, summarizer)
|
| 149 |
st.success(summary)
|
|
|
|
| 7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 8 |
from datetime import datetime
|
| 9 |
|
| 10 |
+
|
| 11 |
def clean_text(text):
|
| 12 |
text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
|
| 13 |
text = re.sub(r"(?i)let me know if you'd like.*", "", text)
|
|
|
|
| 15 |
text = re.sub(r"(?i)actionable insight[:]*", "", text)
|
| 16 |
return text.strip()
|
| 17 |
|
|
|
|
| 18 |
@st.cache_data
|
| 19 |
def load_data():
|
| 20 |
df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
|
|
|
|
| 25 |
df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
|
| 26 |
return df
|
| 27 |
|
|
|
|
| 28 |
@st.cache_resource
def load_models():
    """Load and cache the embedding and summarization models.

    Returns:
        tuple: (SentenceTransformer embedder, transformers summarization
        pipeline). Cached with st.cache_resource so the heavyweight models
        are instantiated only once per server process.
    """
    sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')
    t5_summarizer = pipeline(
        "summarization", model="t5-small", tokenizer="t5-small"
    )
    return sentence_encoder, t5_summarizer
|
| 33 |
|
|
|
|
| 34 |
@st.cache_data
def compute_embeddings(texts, _model):
    """Encode *texts* into dense vectors, memoized by Streamlit.

    The leading underscore on ``_model`` tells st.cache_data to skip
    hashing the model object when building the cache key.
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors
|
| 37 |
|
|
|
|
| 38 |
def semantic_search(query, embeddings, model, threshold=0.5):
    """Return (index, score) pairs for documents similar to *query*.

    Args:
        query: Free-text user question.
        embeddings: Precomputed document embedding matrix.
        model: Embedding model exposing an ``encode`` method.
        threshold: Minimum cosine similarity for a document to count as a hit.

    Returns:
        list[tuple[int, float]]: positions into the corpus and their
        similarity scores, strictly above ``threshold``.
    """
    encoded_query = model.encode([query])
    scores = cosine_similarity(encoded_query, embeddings)[0]
    hits = []
    for idx, score in enumerate(scores):
        if score > threshold:
            hits.append((idx, score))
    return hits
|
| 42 |
|
|
|
|
| 43 |
def rag_summarize(texts, summarizer, top_k=5):
|
| 44 |
if not texts:
|
| 45 |
return "No relevant content to summarize."
|
|
|
|
| 53 |
out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
|
| 54 |
return out[0]['summary_text']
|
| 55 |
|
|
|
|
| 56 |
def extract_month_year(q):
|
| 57 |
month_map = {m: i for i, m in enumerate(
|
| 58 |
["january", "february", "march", "april", "may", "june",
|
|
|
|
| 63 |
yr = int(ym.group()) if ym else None
|
| 64 |
return mon, yr
|
| 65 |
|
|
|
|
| 66 |
def extract_topic_match(query, df):
|
| 67 |
query_lower = query.lower()
|
| 68 |
return df[
|
|
|
|
| 72 |
df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
|
| 73 |
]
|
| 74 |
|
|
|
|
| 75 |
st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
|
| 76 |
st.title("Illinois Legislative Trends Q&A")
|
| 77 |
st.markdown("Ask about trends in topics like higher education, funding, etc.")
|
| 78 |
|
|
|
|
| 79 |
df = load_data()
|
| 80 |
embed_model, summarizer = load_models()
|
| 81 |
|
|
|
|
| 82 |
query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
|
| 83 |
|
| 84 |
if query:
|
|
|
|
| 118 |
intent = row['intent_standardized']
|
| 119 |
stance = row['stance_standardized']
|
| 120 |
|
|
|
|
| 121 |
trend = clean_text(row['llama_trend_summary'])
|
| 122 |
insight = clean_text(row['llama_insight'])
|
| 123 |
|
|
|
|
| 125 |
st.markdown(f"**Category:** {cat_std}")
|
| 126 |
st.markdown(f"**Goal:** {goal}")
|
| 127 |
st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
|
| 128 |
+
st.markdown(f"Trend Summary:{trend}")
|
| 129 |
+
st.markdown(f"Actionable Insight:{insight}")
|
| 130 |
st.markdown(f"[View Full Bill Text]({full_url})\n")
|
| 131 |
st.divider()
|
| 132 |
|
| 133 |
collected.append(row['summary_insight'])
|
| 134 |
|
|
|
|
| 135 |
st.subheader("RAG-Generated Overall Summary")
|
| 136 |
summary = rag_summarize(collected, summarizer)
|
| 137 |
st.success(summary)
|