Cassie Buhler committed on
Commit
9aa9417
·
unverified ·
2 Parent(s): 6d5acd072c19d9

Merge pull request #4 from boettiger-lab/patch/testing

Browse files
Files changed (6) hide show
  1. app/app.py +148 -62
  2. app/footer.md +12 -0
  3. app/system_prompt.txt +84 -0
  4. app/utils.py +298 -42
  5. app/variables.py +59 -0
  6. requirements.txt +5 -1
app/app.py CHANGED
@@ -1,15 +1,13 @@
1
  import os
2
-
3
  import altair as alt
4
- import ibis
5
  import leafmap.maplibregl as leafmap
6
  import matplotlib.pyplot as plt
7
  import pandas as pd
8
  import streamlit as st
9
- from ibis import _
10
-
11
  from utils import *
12
-
 
13
 
14
  st.set_page_config(
15
  layout="wide",
@@ -32,22 +30,19 @@ st.caption(
32
 
33
  "ℹ️ Tip: Use the slider to change the year and hover over shaded areas for measure details."
34
 
35
-
36
-
37
  min_year, max_year = st.slider("Select a range", 1988, 2024, (2020, 2024))
38
-
39
  con = ibis.duckdb.connect("duck.db", extensions=["spatial"])
40
  current_tables = con.list_tables()
41
 
42
- if "mydata" not in set(current_tables):
43
  tbl = (
44
  con.read_parquet(votes_parquet)
45
  .cast({"geom": "geometry"})
46
  )
47
  tbl = get_unique_rows(tbl) # drop multi-county measures with non-unanimous party labels
48
- con.create_table("mydata", tbl)
49
 
50
- votes = con.table("mydata")
51
 
52
  with st.sidebar:
53
  color_choice = st.radio("Color by:", ["Measure status", "Political Party"])
@@ -58,6 +53,68 @@ with st.sidebar:
58
  social_toggle = st.toggle("Social Vulnerability Index")
59
  justice_toggle = st.toggle("Climate and Economic Justic")
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  m = leafmap.Map(
63
  style="positron",
@@ -95,69 +152,98 @@ if justice_toggle:
95
  )
96
 
97
 
98
- # compute percentage passed in given year
99
- passed_year = (
100
- votes
101
- .filter((_.year>= min_year) & (_.year<= max_year))
102
- .filter(_.status.isin(["Pass", "Pass*"]))
103
- .count()
104
- .execute()
105
- )
106
- total_year = votes.filter((_.year>= min_year) & (_.year<= max_year)).count().execute()
107
- year_passed = round(passed_year / total_year * 100, 2)
108
- f"{year_passed}% Measures Passed between {min_year} and {max_year}"
109
-
110
- # compute percentage passed over entire dataset
111
- passed = votes.filter(_.status.isin(["Pass", "Pass*"])).count().execute()
112
- total = votes.count().execute()
113
- overall_passed = round(passed / total * 100, 2)
114
- f"{overall_passed}% Measures Passed from 1988 - 2024 \n"
115
-
116
-
117
- if color_choice == "Measure status":
118
- for j, o in zip(
119
- ["State", "County", "Municipal", "Special District"],
120
- [0.8, 1, 1, 1],
121
- ):
122
  m.add_pmtiles(
123
  votes_pmtiles,
124
- style=get_status_style(j,min_year,max_year),
125
  visible=True,
126
- opacity=o,
127
  tooltip=True,
 
128
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
- elif color_choice == "Political Party":
131
- for j, o in zip(
132
- ["State", "County", "Municipal", "Special District"],
133
- [0.8, 1, 1, 1],
134
- ):
135
  m.add_pmtiles(
136
  votes_pmtiles,
137
- style=get_party_landvote_style(j,min_year,max_year),
138
  visible=True,
139
- opacity=o,
140
  tooltip=True,
 
141
  )
142
 
143
- m.add_layer_control()
144
- m.to_streamlit()
145
-
146
- party_df = get_party_df(votes)
147
- st.altair_chart(party_chart(party_df), use_container_width=True)
148
-
149
- df_funding = funding_chart(votes)
150
- st.altair_chart(
151
- create_chart(
152
- df_funding,
153
- "cumulative_funding",
154
- "Billions of Dollars",
155
- "Cumulative Funding",
156
- colors["dark_green"],
157
- chart_type="bar",
158
- ),
159
- use_container_width=True,
160
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
  st.divider()
163
  st.caption(
 
1
  import os
2
+ import openai
3
  import altair as alt
 
4
  import leafmap.maplibregl as leafmap
5
  import matplotlib.pyplot as plt
6
  import pandas as pd
7
  import streamlit as st
 
 
8
  from utils import *
9
+ import ibis
10
+ from ibis import _
11
 
12
  st.set_page_config(
13
  layout="wide",
 
30
 
31
  "ℹ️ Tip: Use the slider to change the year and hover over shaded areas for measure details."
32
 
 
 
33
  min_year, max_year = st.slider("Select a range", 1988, 2024, (2020, 2024))
 
34
  con = ibis.duckdb.connect("duck.db", extensions=["spatial"])
35
  current_tables = con.list_tables()
36
 
37
+ if "landvote" not in set(current_tables):
38
  tbl = (
39
  con.read_parquet(votes_parquet)
40
  .cast({"geom": "geometry"})
41
  )
42
  tbl = get_unique_rows(tbl) # drop multi-county measures with non-unanimous party labels
43
+ con.create_table("landvote", tbl)
44
 
45
+ votes = con.table("landvote")
46
 
47
  with st.sidebar:
48
  color_choice = st.radio("Color by:", ["Measure status", "Political Party"])
 
53
  social_toggle = st.toggle("Social Vulnerability Index")
54
  justice_toggle = st.toggle("Climate and Economic Justic")
55
 
56
+ ##### Chatbot stuff
57
+ chatbot_container = st.container()
58
+ with chatbot_container:
59
+ llm_left_col, llm_right_col = st.columns([5,1], vertical_alignment = "bottom")
60
+ with llm_left_col:
61
+ with st.popover("💬 Example Queries"):
62
+ '''
63
+ Mapping queries:
64
+ - Show me Republican-voting counties where conservation measures passed
65
+ - Show measures that failed narrowly (between 45% and 50% yes)
66
+ - Show me conservation measures that approved over $500 million
67
+ '''
68
+
69
+ '''
70
+ Exploratory data queries:
71
+ - Which year had the most conservation funds approved?
72
+ - Which state approved the largest total conservation funding?
73
+ - How many measures passed by jurisdiction type?
74
+ - Which counties voted on conservation measures most frequently?
75
+ - What is the median funding amount for passed measures?
76
+ - How often do bond measures pass compared to other finance mechanisms?
77
+ '''
78
+
79
+ st.info('If the map appears blank, queried data may be too small to see at the default zoom level. Check the table below the map, as query results will also be displayed there.', icon="ℹ️")
80
+
81
+ with llm_right_col:
82
+ llm_choice = st.selectbox("Select LLM:", llm_options, key = "llm", help = "Select which model to use.")
83
+ llm = llm_options[llm_choice]
84
+
85
+ run_sql = make_run_sql(votes, llm, con)
86
+
87
+ with chatbot_container:
88
+ with llm_left_col:
89
+ example_query = "👋 Input query here"
90
+ prompt = st.chat_input(example_query, key="chain", max_chars=300)
91
+ _,log_query_col, _ = st.columns([.001, 5,1], vertical_alignment = "top")
92
+ with log_query_col:
93
+ log_queries = st.checkbox("Save query", value = True, help = "Saving your queries helps improve this tool and guide conservation efforts. Your data is stored in a private location. For more details, see 'Why save your queries?' at the bottom of this page.")
94
+
95
+ # new container for output so it doesn't mess with the alignment of llm options
96
+ with st.container():
97
+ if prompt:
98
+ result = handle_llm_query(
99
+ prompt=prompt,
100
+ llm_choice=llm_choice,
101
+ run_sql_fn=run_sql, # your cached function: run_sql(prompt, llm_choice)
102
+ log_queries=log_queries,
103
+ logger_fn=minio_logger,
104
+ log_file="landvote_query_log.csv",
105
+ log_bucket="shared-tpl",
106
+ )
107
+
108
+ llm_output = result["llm_output"]
109
+ sql_query = result["sql_query"]
110
+ llm_explanation = result["llm_explanation"]
111
+ unique_ids = result["unique_ids"]
112
+ llm_cols = result["llm_cols"]
113
+ llm_bounds = result["llm_bounds"]
114
+ not_mapping = result["not_mapping"]
115
+
116
+ ##### end of chatbot code
117
+
118
 
119
  m = leafmap.Map(
120
  style="positron",
 
152
  )
153
 
154
 
155
+ # define PMTiles style dict (if we didn't already do so using the chatbot)
156
+ if 'llm_output' in locals():
157
+ if not_mapping == False:
158
+ # filter to ids from result
159
+ style = llm_pmtiles_style(unique_ids, paint_fill, votes_pmtiles)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  m.add_pmtiles(
161
  votes_pmtiles,
162
+ style=style,
163
  visible=True,
164
+ opacity=1.0,
165
  tooltip=True,
166
+ name="LLM Query Results",
167
  )
168
+
169
+ # Zoom to result bounds if present
170
+ if "llm_bounds" in locals() and llm_bounds:
171
+ m.fit_bounds(llm_bounds)
172
+ m.to_streamlit()
173
+ with st.expander("🔍 View/download data"): # adding data table
174
+ if ('geom' in llm_output.columns) and (not llm_output.empty):
175
+ llm_output = llm_output.drop('geom',axis = 1)
176
+ st.dataframe(llm_output, use_container_width = True)
177
+
178
+ else: # if we didn't use chatbot
179
+
180
+ # compute percentage passed in given year
181
+ year_passed, overall_passed=get_pass_stats(votes, min_year, max_year)
182
+ f"{year_passed}% Measures Passed between {min_year} and {max_year}"
183
+ f"{overall_passed}% Measures Passed from 1988 - 2024 \n"
184
+
185
+ if color_choice == "Measure status":
186
+ # 4 styles / 4 layers (jurisdiction-specific)
187
+ for j, o in zip(
188
+ ["State", "County", "Municipal", "Special District"],
189
+ [0.8, 1, 1, 1],
190
+ ):
191
+ m.add_pmtiles(
192
+ votes_pmtiles,
193
+ style=get_status_style(j, min_year, max_year),
194
+ visible=True,
195
+ opacity=o,
196
+ tooltip=True,
197
+ name=j, # shows as separate toggles in layer control
198
+ )
199
+ m.to_streamlit()
200
+
201
+
202
+ elif color_choice == "Political Party":
203
+ # 1 style / 1 layer
204
+ style = get_party_landvote_style(min_year, max_year)
205
 
 
 
 
 
 
206
  m.add_pmtiles(
207
  votes_pmtiles,
208
+ style=style,
209
  visible=True,
210
+ opacity=1.0,
211
  tooltip=True,
212
+ name="Political Party",
213
  )
214
 
215
+ m.to_streamlit()
216
+
217
+ with st.expander("🔍 View/download data"): # adding data table
218
+ group_cols = ['landvote_id','year','state','county','municipal','jurisdiction']
219
+ gdf_grouped = (votes.head(100).execute().groupby(group_cols)
220
+ .agg({col: ('sum' if col in ['total_funds_at_stake','total_funds_approved',
221
+ 'conservation_funds_at_stake','conservation_funds_approved'] else 'first')
222
+ for col in votes.columns if col not in group_cols})).reset_index()
223
+ cols = ['landvote_id','year','state','county','municipal','jurisdiction',
224
+ 'status', 'percent_yes', 'percent_no', 'date',
225
+ 'total_funds_at_stake','total_funds_approved',
226
+ 'conservation_funds_at_stake','conservation_funds_approved',
227
+ 'finance_mechanism', 'other_comment','purpose',
228
+ 'description', 'notes', 'voted_acq_measure', 'party']
229
+
230
+ st.dataframe(gdf_grouped[cols], use_container_width = True)
231
+
232
+ party_df = get_party_df(votes)
233
+ st.altair_chart(party_chart(party_df), use_container_width=True)
234
+
235
+ df_funding = funding_chart(votes)
236
+ st.altair_chart(
237
+ create_chart(
238
+ df_funding,
239
+ "cumulative_funding",
240
+ "Billions of Dollars",
241
+ "Cumulative Funding",
242
+ colors["dark_green"],
243
+ chart_type="bar",
244
+ ),
245
+ use_container_width=True,
246
+ )
247
 
248
  st.divider()
249
  st.caption(
app/footer.md CHANGED
@@ -24,3 +24,15 @@ For details on methodology, please refer to the our data processing code for [La
24
  - CDC 2020 Social Vulnerability Index by US Census Tract. Data: https://source.coop/repositories/cboettig/social-vulnerability/description. License: Public Domain
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  - CDC 2020 Social Vulnerability Index by US Census Tract. Data: https://source.coop/repositories/cboettig/social-vulnerability/description. License: Public Domain
25
 
26
 
27
+ #### LLMs
28
+ This app can use a selection of open-weights language models hosted on the National Research Platform (https://nrp.ai/documentation/userdocs/ai/llm-managed/) and OpenRouter (https://openrouter.ai/models).
29
+
30
+ ---
31
+
32
+ ### Why save your queries?
33
+ Conservation researchers and practitioners are interested in **learning what matters most to our community**.
34
+
35
+ By saving your anonymous queries, we can identify which topics and areas are drawing the most attention, helping us improve future tools and data products to **better support conservation efforts**. We also save the LLM’s response to each query to monitor its accuracy and ensure the system is working as intended.
36
+
37
+ You can opt out at any time by disabling “Save query”.
38
+
app/system_prompt.txt ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an expert in SQL and an assistant for mapping and analyzing the Trust for Public Land (TPL) LandVote data. You are provided multiple tables and must identify which table(s) to use. Given an input question, create a syntactically correct {dialect} query to run, and then provide an explanation of how you answered the input question. Not every query will require SQL code; users may ask for more information about values and columns in the table, which you can answer based on the information in this prompt. For these cases, your "sql_query" field should be empty.
2
+
3
+ ONLY write SQL queries using the records and columns that exist in the relevant table. You have access to this table:
4
+
5
+ landvote:
6
+ - Definition: Tracks land conservation ballot measures since 1988.
7
+ - Schema: {landvote}
8
+
9
+ For example:
10
+ {{
11
+ "sql_query": "SELECT cols FROM landvote WHERE condition = 'value';",
12
+ "explanation": "This query retrieves columns from the landvote table where the condition column equals 'value'."
13
+ }}
14
+
15
+ Ensure the response contains only this JSON object, with no additional text, formatting, or commentary.
16
+
17
+ # Important Details
18
+ - For visualization-related queries (e.g., "show me"), ALWAYS include "landvote_id", "year","state","county","municipal","jurisdiction", and "geom" in the results.
19
+ - Wrap each column name in double quotes (") to denote them as delimited identifiers.
20
+ - Wrap values that are strings in single quotes (') to distinguish them from column names.
21
+
22
+ # Example Questions and How to Approach Them
23
+
24
+ ## Example:
25
+ example_user: "Show me measures that passed in Republican-voting counties"
26
+ example_assistant: {{"sql_query":
27
+ SELECT DISTINCT "landvote_id", "year","state","county","municipal","jurisdiction", "geom", "party", "status" FROM landvote
28
+ WHERE "party" == 'Republican'
29
+ AND "jurisdiction"=='County'
30
+ AND "status" IN ('Pass','Pass*')
31
+ ORDER BY "year";
32
+ "explanation":"I filtered the landvote table to county-level measures that passed and occurred in jurisdictions classified as Republican based on the most recent presidential election prior to the measure.
33
+ }}
34
+
35
+ ## Example:
36
+ example_user: "Show me measures that failed narrowly (between 45% and 50% yes)"
37
+ example_assistant: {{"sql_query":
38
+ SELECT DISTINCT "landvote_id", "year","state","county","municipal","jurisdiction", "geom", "percent_yes", "status" FROM landvote
39
+ WHERE "status" == 'Fail'
40
+ AND CAST(REPLACE("percent_yes", '%', '') AS DOUBLE) BETWEEN 45 AND 50
41
+ ORDER BY "percent_yes";
42
+ "explanation": "I selected failed measures where the percent of yes votes was between 45% and 50%, indicating a narrow margin of defeat."
43
+ }}
44
+
45
+
46
+ ## Example:
47
+ example_user: "Show me conservation measures that approved over $500 million"
48
+ example_assistant: {{"sql_query":
49
+ SELECT DISTINCT "landvote_id", "year","state","county","municipal","jurisdiction", "geom", "conservation_funds_approved" FROM landvote
50
+ WHERE "conservation_funds_approved"> 500000000
51
+ ORDER BY "conservation_funds_approved" DESC;
52
+ "explanation": "I filtered measures to those where the approved conservation funding exceeded $500 million."
53
+ }}
54
+
55
+
56
+ ## Example:
57
+ example_user: "Which year had the most conservation funds approved?"
58
+ example_assistant: {{"sql_query":
59
+ SELECT "year", SUM("conservation_funds_approved") AS "total_conservation_funds_approved"
60
+ FROM landvote
61
+ GROUP BY "year"
62
+ ORDER BY "total_conservation_funds_approved" DESC LIMIT 1;
63
+ "explanation":"I took the sum of conservation funds in `landvote` by year and returned the year with the most funds.
64
+ }}
65
+
66
+ ## Example:
67
+ example_user: "How many measures passed by jurisdiction type?"
68
+ example_assistant: {{"sql_query":
69
+ SELECT "jurisdiction", COUNT(*) AS "passed_measures" FROM landvote
70
+ WHERE "status" IN ('Pass','Pass*')
71
+ GROUP BY "jurisdiction"
72
+ ORDER BY "passed_measures";
73
+ "explanation":"I grouped passed measures by jurisdiction type and counted how many measures passed in each category.
74
+ }}
75
+
76
+ ## Example:
77
+ example_user: "How often do bond measures pass compared to other finance mechanisms?"
78
+ example_assistant: {{"sql_query":
79
+ SELECT "finance_mechanism",
80
+ AVG(CASE WHEN "status" IN ('Pass', 'Pass*') THEN 1 ELSE 0 END) AS "pass_rate" FROM landvote
81
+ GROUP BY "finance_mechanism"
82
+ ORDER BY "pass_rate" DESC;
83
+ "explanation":"I calculated the average pass rate for each finance mechanism by treating passed measures as 1 and failed measures as 0, allowing a comparison of how often bond measures pass relative to other mechanisms.
84
+ }}
app/utils.py CHANGED
@@ -1,8 +1,24 @@
 
 
 
 
1
  import ibis
2
  from ibis import _
3
  import altair as alt
 
 
 
 
 
 
 
4
  from variables import *
5
 
 
 
 
 
 
6
  def get_unique_rows(df):
7
  # collapse multi-county measures to one row per landvote_id
8
  unique_votes = (
@@ -11,14 +27,42 @@ def get_unique_rows(df):
11
  .agg(
12
  **{c: ibis._[c].first() for c in df.schema().names if c not in ("landvote_id", "county", "party")},
13
  # if spans multiple counties -> set different name for county
14
- county=ibis.ifelse(ibis._.county.nunique() > 1, "Multiple Counties", ibis._.county.first()),
15
- # if counties differ in parties -> assign other label to party
16
  party=ibis.ifelse(ibis._.party.nunique() > 1, "Mixed", ibis._.party.first()),
17
  )
18
  )
19
  return unique_votes
20
 
21
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def create_chart(df, y_column, ylab, title, color, chart_type="line"):
23
  # color encoding - color is a list or single value
24
  color_encoding = (
@@ -86,6 +130,47 @@ def funding_chart(votes):
86
  )
87
 
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  def party_style(year):
90
  recent_election_year = year - year % 4
91
 
@@ -116,7 +201,6 @@ def party_style(year):
116
  }
117
 
118
 
119
-
120
  # pmtiles style for status
121
  def get_status_style(jurisdiction, min_year, max_year):
122
  if jurisdiction == "State":
@@ -144,20 +228,20 @@ def get_status_style(jurisdiction, min_year, max_year):
144
  ]
145
  }
146
 
 
147
  # pmtiles style for party
148
- def get_party_landvote_style(jurisdiction, min_year, max_year):
149
  return {
150
  "layers": [
151
  {
152
- "id": jurisdiction,
153
- "source": jurisdiction,
154
  "source-layer": "landvote_party",
155
  "type": "fill",
156
  "filter": [
157
  "all",
158
  ["<=", "year", str(max_year)],
159
  [">=", "year", str(min_year)],
160
- ["==", "jurisdiction", jurisdiction],
161
  ],
162
  "paint": {
163
  "fill-color": {
@@ -174,38 +258,210 @@ def get_party_landvote_style(jurisdiction, min_year, max_year):
174
  }
175
 
176
 
177
- def party_chart(df):
178
- chart = (
179
- alt.Chart(df)
180
- .mark_line(point=True)
181
- .encode(
182
- x=alt.X("year:O", title="Year"),
183
- y=alt.Y(
184
- "pass_fraction:Q",
185
- title="% of measures passed",
186
- axis=alt.Axis(format="%"),
187
- ),
188
- color=alt.Color(
189
- "party:N",
190
- scale=alt.Scale(
191
- domain=["Democrat", "Republican"],
192
- range=[colors["dem_blue"], colors["rep_red"]],
193
- ),
194
- legend=alt.Legend(title="Party"),
195
- ),
196
- tooltip=[
197
- alt.Tooltip("year:O", title="Year"),
198
- alt.Tooltip("party:N", title="Party"),
199
- alt.Tooltip(
200
- "pass_fraction:Q",
201
- title="% passed",
202
- format=".1%",
203
- ),
204
- ],
205
- )
206
- .properties(
207
- title="Percent of Measures Passed per Year by Political Party"
208
- )
209
- )
210
 
211
- return chart
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import datetime
4
+
5
  import ibis
6
  from ibis import _
7
  import altair as alt
8
+ import minio
9
+ import pandas as pd
10
+ import streamlit as st
11
+ from pydantic import BaseModel, Field
12
+ from langchain_openai import ChatOpenAI
13
+ from langchain_core.prompts import ChatPromptTemplate
14
+
15
  from variables import *
16
 
17
+
18
+ # -----------------------------
19
+ # Data wrangling utils
20
+ # -----------------------------
21
+
22
  def get_unique_rows(df):
23
  # collapse multi-county measures to one row per landvote_id
24
  unique_votes = (
 
27
  .agg(
28
  **{c: ibis._[c].first() for c in df.schema().names if c not in ("landvote_id", "county", "party")},
29
  # if spans multiple counties -> set different name for county
30
+ county=ibis.ifelse(ibis._.county.nunique() > 1, "Multiple Counties", ibis._.county.first()),
31
+ # if counties differ in parties -> assign other label to party
32
  party=ibis.ifelse(ibis._.party.nunique() > 1, "Mixed", ibis._.party.first()),
33
  )
34
  )
35
  return unique_votes
36
 
37
+
38
+ def get_pass_stats(df, min_year, max_year):
39
+ passed_year = (
40
+ df
41
+ .filter((_.year >= min_year) & (_.year <= max_year))
42
+ .filter(_.status.isin(["Pass", "Pass*"]))
43
+ .count()
44
+ .execute()
45
+ )
46
+ total_year = df.filter((_.year >= min_year) & (_.year <= max_year)).count().execute()
47
+ year_passed = round(passed_year / total_year * 100, 2)
48
+
49
+ # compute percentage passed over entire dataset
50
+ passed = df.filter(_.status.isin(["Pass", "Pass*"])).count().execute()
51
+ total = df.count().execute()
52
+ overall_passed = round(passed / total * 100, 2)
53
+ return year_passed, overall_passed
54
+
55
+
56
+ def extract_columns(sql_query):
57
+ # Find all substrings inside double quotes
58
+ columns = list(dict.fromkeys(re.findall(r'"(.*?)"', sql_query)))
59
+ return columns
60
+
61
+
62
+ # -----------------------------
63
+ # Chart utils
64
+ # -----------------------------
65
+
66
  def create_chart(df, y_column, ylab, title, color, chart_type="line"):
67
  # color encoding - color is a list or single value
68
  color_encoding = (
 
130
  )
131
 
132
 
133
+ def party_chart(df):
134
+ chart = (
135
+ alt.Chart(df)
136
+ .mark_line(point=True)
137
+ .encode(
138
+ x=alt.X("year:O", title="Year"),
139
+ y=alt.Y(
140
+ "pass_fraction:Q",
141
+ title="% of measures passed",
142
+ axis=alt.Axis(format="%"),
143
+ ),
144
+ color=alt.Color(
145
+ "party:N",
146
+ scale=alt.Scale(
147
+ domain=["Democrat", "Republican"],
148
+ range=[colors["dem_blue"], colors["rep_red"]],
149
+ ),
150
+ legend=alt.Legend(title="Party"),
151
+ ),
152
+ tooltip=[
153
+ alt.Tooltip("year:O", title="Year"),
154
+ alt.Tooltip("party:N", title="Party"),
155
+ alt.Tooltip(
156
+ "pass_fraction:Q",
157
+ title="% passed",
158
+ format=".1%",
159
+ ),
160
+ ],
161
+ )
162
+ .properties(
163
+ title="Percent of Measures Passed per Year by Political Party"
164
+ )
165
+ )
166
+
167
+ return chart
168
+
169
+
170
+ # -----------------------------
171
+ # Mapping / style utils
172
+ # -----------------------------
173
+
174
  def party_style(year):
175
  recent_election_year = year - year % 4
176
 
 
201
  }
202
 
203
 
 
204
  # pmtiles style for status
205
  def get_status_style(jurisdiction, min_year, max_year):
206
  if jurisdiction == "State":
 
228
  ]
229
  }
230
 
231
+
232
  # pmtiles style for party
233
+ def get_party_landvote_style(min_year, max_year):
234
  return {
235
  "layers": [
236
  {
237
+ "id": "party",
238
+ "source": "landvote",
239
  "source-layer": "landvote_party",
240
  "type": "fill",
241
  "filter": [
242
  "all",
243
  ["<=", "year", str(max_year)],
244
  [">=", "year", str(min_year)],
 
245
  ],
246
  "paint": {
247
  "fill-color": {
 
258
  }
259
 
260
 
261
+ def llm_pmtiles_style(ids, paint, pmtiles):
262
+ source_layer_name = re.sub(r"\W+", "", os.path.splitext(os.path.basename(pmtiles))[0]) #stripping hyphens to get layer name
263
+ ids = [str(x) for x in ids]
264
+ style = {
265
+ "version": 8,
266
+ "sources": {
267
+ "tpl": {
268
+ "type": "vector",
269
+ "url": "pmtiles://" + pmtiles,
270
+ "attribution": "TPL",
271
+ },
272
+ },
273
+ "layers": [
274
+ {
275
+ "id": "tpl",
276
+ "source": "tpl",
277
+ "source-layer": source_layer_name,
278
+ "type": "fill",
279
+ "filter": ["in", ["get", "landvote_id"], ["literal", ids]],
280
+ "paint": paint,
281
+ }
282
+ ],
283
+ }
284
+ return style
 
 
 
 
 
 
 
 
 
285
 
286
+
287
+ @st.cache_resource(show_spinner=False)
288
+ def get_con(db_path: str = "duck.db"):
289
+ return ibis.duckdb.connect(db_path, extensions=["spatial"])
290
+
291
+
292
+ # -----------------------------
293
+ # Chatbot utils
294
+ # -----------------------------
295
+
296
+ class SQLResponse(BaseModel):
297
+ """Defines the structure for SQL response."""
298
+ sql_query: str = Field(description="The SQL query generated by the assistant.")
299
+ explanation: str = Field(description="A detailed explanation of how the SQL query answers the input question.")
300
+
301
+
302
+ @st.cache_data(show_spinner=False)
303
+ def _load_template(path: str = "app/system_prompt.txt") -> str:
304
+ with open(path, "r") as f:
305
+ return f.read()
306
+
307
+
308
+ def make_run_sql(votes, llm, con, template_path: str = "app/system_prompt.txt"):
309
+ """
310
+ Returns a run_sql(query, llm_choice) function that:
311
+ - closes over `con` and the chain
312
+ - uses @st.cache_data exactly like your app.py version
313
+ """
314
+
315
+ template = _load_template(template_path)
316
+
317
+ prompt_tmpl = ChatPromptTemplate.from_messages([
318
+ ("system", template),
319
+ ("human", "{input}")
320
+ ]).partial(dialect="duckdb", landvote=votes.schema())
321
+
322
+ # Ensure tools/structured output is not streaming
323
+ llm = llm.bind(streaming=False)
324
+
325
+ structured_llm = llm.with_structured_output(SQLResponse)
326
+ few_shot_structured_llm = prompt_tmpl | structured_llm
327
+
328
+ @st.cache_data(show_spinner=False)
329
+ def run_sql(query: str, llm_choice: str):
330
+ output = few_shot_structured_llm.invoke({"input": query})
331
+ sql_query = output.sql_query
332
+ explanation = output.explanation
333
+
334
+ if not sql_query:
335
+ return pd.DataFrame({"landvote_id": []}), "", explanation
336
+
337
+ result = con.sql(sql_query).distinct().execute()
338
+
339
+ if result.empty:
340
+ explanation = "This query did not return any results. Please try again with a different query."
341
+ if "geom" in result.columns:
342
+ return result.drop("geom", axis=1), sql_query, explanation
343
+ return result, sql_query, explanation
344
+
345
+ return result, sql_query, explanation
346
+
347
+ return run_sql
348
+
349
+
350
+ def handle_llm_query(
351
+ prompt: str,
352
+ llm_choice: str,
353
+ run_sql_fn,
354
+ log_queries: bool,
355
+ logger_fn,
356
+ log_file: str = "landvote_query_log.csv",
357
+ log_bucket: str = "shared-tpl",
358
+ ):
359
+ """
360
+ Runs the LLM->SQL pipeline, renders Streamlit output, logs the query,
361
+ and returns mapping-relevant outputs.
362
+ """
363
+
364
+ not_mapping = False
365
+ unique_ids, llm_cols, llm_bounds = [], [], None
366
+
367
+ if not prompt:
368
+ return {
369
+ "llm_output": None,
370
+ "sql_query": "",
371
+ "llm_explanation": "",
372
+ "unique_ids": [],
373
+ "llm_cols": [],
374
+ "llm_bounds": None,
375
+ "not_mapping": True,
376
+ }
377
+
378
+ st.chat_message("user").write(prompt)
379
+
380
+ with st.chat_message("assistant"):
381
+ with st.spinner("Invoking query..."):
382
+ llm_output, sql_query, llm_explanation = run_sql_fn(prompt, llm_choice)
383
+
384
+ # Log (keep your exact signature)
385
+ logger_fn(
386
+ log_queries,
387
+ prompt,
388
+ sql_query,
389
+ llm_explanation,
390
+ llm_choice,
391
+ log_file,
392
+ log_bucket,
393
+ )
394
+
395
+ # No SQL generated
396
+ if sql_query == "":
397
+ st.success(llm_explanation)
398
+ not_mapping = True
399
+
400
+ else:
401
+ # SQL generated but no results
402
+ if llm_output is not None and llm_output.empty:
403
+ st.warning(llm_explanation, icon="⚠️")
404
+ st.caption("SQL Query:")
405
+ st.code(sql_query, language="sql")
406
+ st.stop()
407
+
408
+ # Output without mapping columns
409
+ elif llm_output is not None and ("landvote_id" not in llm_output.columns and "geom" not in llm_output.columns):
410
+ st.write(llm_output)
411
+ not_mapping = True
412
+
413
+ # Always show explanation + SQL in a popover
414
+ with st.popover("Explanation"):
415
+ st.write(llm_explanation)
416
+ st.caption("SQL Query:")
417
+ st.code(sql_query, language="sql")
418
+
419
+ # Extract ids, columns, bounds if present
420
+ if llm_output is not None and ("landvote_id" in llm_output.columns) and (not llm_output.empty):
421
+ unique_ids = list(set(llm_output["landvote_id"].tolist()))
422
+ llm_cols = extract_columns(sql_query)
423
+ llm_bounds = llm_output.total_bounds.tolist()
424
+ else:
425
+ unique_ids, llm_cols, llm_bounds = [], [], None
426
+ not_mapping = True
427
+
428
+ return {
429
+ "llm_output": llm_output,
430
+ "sql_query": sql_query,
431
+ "llm_explanation": llm_explanation,
432
+ "unique_ids": unique_ids,
433
+ "llm_cols": llm_cols,
434
+ "llm_bounds": llm_bounds,
435
+ "not_mapping": not_mapping,
436
+ }
437
+
438
+
439
+ # -----------------------------
440
+ # Logging utils
441
+ # -----------------------------
442
+
443
+ minio_key = os.getenv("MINIO_KEY")
444
+ if minio_key is None:
445
+ minio_key = st.secrets["MINIO_KEY"]
446
+
447
+ minio_secret = os.getenv("MINIO_SECRET")
448
+ if minio_secret is None:
449
+ minio_secret = st.secrets["MINIO_SECRET"]
450
+
451
+
452
+ def minio_logger(consent, query, sql_query, llm_explanation, llm_choice, filename="landvote_query_log.csv", bucket="shared-tpl",
453
+ key=minio_key, secret=minio_secret,
454
+ endpoint="minio.carlboettiger.info"):
455
+ mc = minio.Minio(endpoint, key, secret)
456
+ mc.fget_object(bucket, filename, filename)
457
+ log = pd.read_csv(filename)
458
+ timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
459
+ if consent:
460
+ df = pd.DataFrame({"timestamp": [timestamp], "user_query": [query], "llm_sql": [sql_query], "llm_explanation": [llm_explanation], "llm_choice":[llm_choice]})
461
+
462
+ # if user opted out, do not store query
463
+ else:
464
+ df = pd.DataFrame({"timestamp": [timestamp], "user_query": ['USER OPTED OUT'], "llm_sql": [''], "llm_explanation": [''], "llm_choice":['']})
465
+
466
+ pd.concat([log,df]).to_csv(filename, index=False, header=True)
467
+ mc.fput_object(bucket, filename, filename, content_type="text/csv")
app/variables.py CHANGED
@@ -124,3 +124,62 @@ party_pmtiles = (
124
  "https://minio.carlboettiger.info/public-election/"
125
  "county/county_political_parties_1988-2024.pmtiles"
126
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  "https://minio.carlboettiger.info/public-election/"
125
  "county/county_political_parties_1988-2024.pmtiles"
126
  )
127
+
128
+
129
+ from langchain_openai import ChatOpenAI
130
+ import streamlit as st
131
+ from langchain_openai.chat_models.base import BaseChatOpenAI
132
+
133
+ ## dockerized streamlit app wants to read from os.getenv(), otherwise use st.secrets
134
+ import os
135
+ api_key = os.getenv("NRP_API_KEY")
136
+ if api_key is None:
137
+ api_key = st.secrets["NRP_API_KEY"]
138
+
139
+ openrouter_api = os.getenv("OPENROUTER_API_KEY")
140
+ if openrouter_api is None:
141
+ openrouter_api = st.secrets["OPENROUTER_API_KEY"]
142
+
143
+ openrouter_endpoint="https://openrouter.ai/api/v1"
144
+ nrp_endpoint="https://ellm.nrp-nautilus.io/v1"
145
+
146
+ # don't use a provider that collects data
147
+ data_policy = {
148
+ "provider": {
149
+ "data_collection": "deny"
150
+ }
151
+ }
152
+
153
+ llm_options = {
154
+ "devstral-2512": ChatOpenAI(
155
+ model="mistralai/devstral-2512:free",
156
+ api_key=openrouter_api,
157
+ base_url=openrouter_endpoint,
158
+ temperature=0,
159
+ extra_body=data_policy
160
+ ),
161
+
162
+ "trinity-mini": ChatOpenAI(
163
+ model="arcee-ai/trinity-mini:free",
164
+ api_key=openrouter_api,
165
+ base_url=openrouter_endpoint,
166
+ temperature=0,
167
+ extra_body=data_policy
168
+ ),
169
+
170
+ "nemotron-nano-9b-v2": ChatOpenAI(
171
+ model="nvidia/nemotron-nano-9b-v2:free",
172
+ api_key=openrouter_api,
173
+ base_url=openrouter_endpoint,
174
+ temperature=0,
175
+ extra_body=data_policy
176
+ ),
177
+
178
+ "gemma-3-27b-it": ChatOpenAI(
179
+ model="gemma3",
180
+ api_key=api_key,
181
+ base_url=nrp_endpoint,
182
+ temperature=0
183
+ ),
184
+
185
+ }
requirements.txt CHANGED
@@ -4,7 +4,11 @@ duckdb==1.2.2
4
  duckdb_engine== 0.15.0
5
  geoarrow-types==0.2.0
6
  geoarrow-pandas==0.1.1
7
- ibis-framework[duckdb]==10.3.1
 
 
 
 
8
  leafmap==0.53.3
9
  minio==7.2.15
10
  maplibre==0.3.3
 
4
  duckdb_engine== 0.15.0
5
  geoarrow-types==0.2.0
6
  geoarrow-pandas==0.1.1
7
+ ibis-framework==10.3.1
8
+ langchain==0.2.17
9
+ langchain-community==0.2.19
10
+ langchain-core==0.2.43
11
+ langchain-openai==0.1.25
12
  leafmap==0.53.3
13
  minio==7.2.15
14
  maplibre==0.3.3