tjl8 committed on
Commit
d093846
·
verified ·
1 Parent(s): e631613

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -15
app.py CHANGED
@@ -7,7 +7,7 @@ from sklearn.metrics.pairwise import cosine_similarity
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from datetime import datetime
9
 
10
- # Clean extra boilerplate from model output
11
  def clean_text(text):
12
  text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
13
  text = re.sub(r"(?i)let me know if you'd like.*", "", text)
@@ -15,7 +15,6 @@ def clean_text(text):
15
  text = re.sub(r"(?i)actionable insight[:]*", "", text)
16
  return text.strip()
17
 
18
- # Load and preprocess dataset
19
  @st.cache_data
20
  def load_data():
21
  df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
@@ -26,25 +25,21 @@ def load_data():
26
  df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
27
  return df
28
 
29
- # Load models (embedding and summarization)
30
  @st.cache_resource
31
  def load_models():
32
  embed_model = SentenceTransformer('all-MiniLM-L6-v2')
33
  summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
34
  return embed_model, summarizer
35
 
36
- # Embed documents for semantic search
37
  @st.cache_data
38
  def compute_embeddings(texts, _model):
39
  return _model.encode(texts, show_progress_bar=True)
40
 
41
- # Semantic search using cosine similarity
42
  def semantic_search(query, embeddings, model, threshold=0.5):
43
  query_embedding = model.encode([query])
44
  sims = cosine_similarity(query_embedding, embeddings)[0]
45
  return [(i, s) for i, s in enumerate(sims) if s > threshold]
46
 
47
- # RAG-style summarization using TF-IDF weighted content
48
  def rag_summarize(texts, summarizer, top_k=5):
49
  if not texts:
50
  return "No relevant content to summarize."
@@ -58,7 +53,6 @@ def rag_summarize(texts, summarizer, top_k=5):
58
  out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
59
  return out[0]['summary_text']
60
 
61
- # Extract month and year from query
62
  def extract_month_year(q):
63
  month_map = {m: i for i, m in enumerate(
64
  ["january", "february", "march", "april", "may", "june",
@@ -69,7 +63,6 @@ def extract_month_year(q):
69
  yr = int(ym.group()) if ym else None
70
  return mon, yr
71
 
72
- # Match query with goal/category/intent/impact areas
73
  def extract_topic_match(query, df):
74
  query_lower = query.lower()
75
  return df[
@@ -79,16 +72,13 @@ def extract_topic_match(query, df):
79
  df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
80
  ]
81
 
82
- # UI Configuration
83
  st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
84
  st.title("Illinois Legislative Trends Q&A")
85
  st.markdown("Ask about trends in topics like higher education, funding, etc.")
86
 
87
- # Load everything
88
  df = load_data()
89
  embed_model, summarizer = load_models()
90
 
91
- # Input from user
92
  query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
93
 
94
  if query:
@@ -128,7 +118,6 @@ if query:
128
  intent = row['intent_standardized']
129
  stance = row['stance_standardized']
130
 
131
- # Cleaned text output
132
  trend = clean_text(row['llama_trend_summary'])
133
  insight = clean_text(row['llama_insight'])
134
 
@@ -136,14 +125,13 @@ if query:
136
  st.markdown(f"**Category:** {cat_std}")
137
  st.markdown(f"**Goal:** {goal}")
138
  st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
139
- st.markdown(f"**Trend Summary:** {trend}")
140
- st.markdown(f"**Actionable Insight:** {insight}")
141
  st.markdown(f"[View Full Bill Text]({full_url})\n")
142
  st.divider()
143
 
144
  collected.append(row['summary_insight'])
145
 
146
- # Show generated summary
147
  st.subheader("RAG-Generated Overall Summary")
148
  summary = rag_summarize(collected, summarizer)
149
  st.success(summary)
 
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from datetime import datetime
9
 
10
+
11
  def clean_text(text):
12
  text = re.sub(r"(?i)(here is|here are) the requested output[s]*[:]*", "", text)
13
  text = re.sub(r"(?i)let me know if you'd like.*", "", text)
 
15
  text = re.sub(r"(?i)actionable insight[:]*", "", text)
16
  return text.strip()
17
 
 
18
  @st.cache_data
19
  def load_data():
20
  df = pd.read_csv("Illinois_Entire_Data_Insights_Final_v2.csv")
 
25
  df["summary_insight"] = df["llama_trend_summary"] + "\n" + df["llama_insight"]
26
  return df
27
 
 
28
  @st.cache_resource
29
  def load_models():
30
  embed_model = SentenceTransformer('all-MiniLM-L6-v2')
31
  summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
32
  return embed_model, summarizer
33
 
 
34
  @st.cache_data
35
  def compute_embeddings(texts, _model):
36
  return _model.encode(texts, show_progress_bar=True)
37
 
 
38
  def semantic_search(query, embeddings, model, threshold=0.5):
39
  query_embedding = model.encode([query])
40
  sims = cosine_similarity(query_embedding, embeddings)[0]
41
  return [(i, s) for i, s in enumerate(sims) if s > threshold]
42
 
 
43
  def rag_summarize(texts, summarizer, top_k=5):
44
  if not texts:
45
  return "No relevant content to summarize."
 
53
  out = summarizer(prompt, max_length=60, min_length=30, do_sample=False)
54
  return out[0]['summary_text']
55
 
 
56
  def extract_month_year(q):
57
  month_map = {m: i for i, m in enumerate(
58
  ["january", "february", "march", "april", "may", "june",
 
63
  yr = int(ym.group()) if ym else None
64
  return mon, yr
65
 
 
66
  def extract_topic_match(query, df):
67
  query_lower = query.lower()
68
  return df[
 
72
  df['policy_impact_areas_standardized'].fillna('').str.lower().str.contains(query_lower)
73
  ]
74
 
 
75
  st.set_page_config(page_title="IL Legislative Trends Q&A", layout="wide")
76
  st.title("Illinois Legislative Trends Q&A")
77
  st.markdown("Ask about trends in topics like higher education, funding, etc.")
78
 
 
79
  df = load_data()
80
  embed_model, summarizer = load_models()
81
 
 
82
  query = st.text_input("Ask a question (e.g., ‘Trends in higher education in 2024’):")
83
 
84
  if query:
 
118
  intent = row['intent_standardized']
119
  stance = row['stance_standardized']
120
 
 
121
  trend = clean_text(row['llama_trend_summary'])
122
  insight = clean_text(row['llama_insight'])
123
 
 
125
  st.markdown(f"**Category:** {cat_std}")
126
  st.markdown(f"**Goal:** {goal}")
127
  st.markdown(f"**Intent:** {intent} | **Stance:** {stance}")
128
+ st.markdown(f"Trend Summary:{trend}")
129
+ st.markdown(f"Actionable Insight:{insight}")
130
  st.markdown(f"[View Full Bill Text]({full_url})\n")
131
  st.divider()
132
 
133
  collected.append(row['summary_insight'])
134
 
 
135
  st.subheader("RAG-Generated Overall Summary")
136
  summary = rag_summarize(collected, summarizer)
137
  st.success(summary)