Spaces:

boettiger-lab
/

landvote

Running

App Files Files Community

Cassie Buhler committed on Jan 15

Commit

9aa9417

unverified ·

2 Parent(s): 6d5acd0 72c19d9

Merge pull request #4 from boettiger-lab/patch/testing

Browse files

Files changed (6) hide show

app/app.py +148 -62
app/footer.md +12 -0
app/system_prompt.txt +84 -0
app/utils.py +298 -42
app/variables.py +59 -0
requirements.txt +5 -1

app/app.py CHANGED Viewed

@@ -1,15 +1,13 @@
 import os
 import altair as alt
-import ibis
 import leafmap.maplibregl as leafmap
 import matplotlib.pyplot as plt
 import pandas as pd
 import streamlit as st
-from ibis import _
 from utils import *
 st.set_page_config(
     layout="wide",
@@ -32,22 +30,19 @@ st.caption(
 "ℹ️ Tip: Use the slider to change the year and hover over shaded areas for measure details."
 min_year, max_year = st.slider("Select a range", 1988, 2024, (2020, 2024))
 con = ibis.duckdb.connect("duck.db", extensions=["spatial"])
 current_tables = con.list_tables()
-if "mydata" not in set(current_tables):
     tbl = (
         con.read_parquet(votes_parquet)
         .cast({"geom": "geometry"})
     )
     tbl = get_unique_rows(tbl)  # drop multi-county measures with non-unanimous party labels
-    con.create_table("mydata", tbl)
-votes = con.table("mydata")
 with st.sidebar:
     color_choice = st.radio("Color by:", ["Measure status", "Political Party"])
@@ -58,6 +53,68 @@ with st.sidebar:
     social_toggle = st.toggle("Social Vulnerability Index")
     justice_toggle = st.toggle("Climate and Economic Justic")
 m = leafmap.Map(
     style="positron",
@@ -95,69 +152,98 @@ if justice_toggle:
     )
-# compute percentage passed in given year
-passed_year = (
-    votes
-    .filter((_.year>= min_year) & (_.year<= max_year))
-    .filter(_.status.isin(["Pass", "Pass*"]))
-    .count()
-    .execute()
-)
-total_year = votes.filter((_.year>= min_year) & (_.year<= max_year)).count().execute()
-year_passed = round(passed_year / total_year * 100, 2)
-f"{year_passed}% Measures Passed between {min_year} and {max_year}"
-# compute percentage passed over entire dataset
-passed = votes.filter(_.status.isin(["Pass", "Pass*"])).count().execute()
-total = votes.count().execute()
-overall_passed = round(passed / total * 100, 2)
-f"{overall_passed}% Measures Passed from 1988 - 2024 \n"
-if color_choice == "Measure status":
-    for j, o in zip(
-        ["State", "County", "Municipal", "Special District"],
-        [0.8, 1, 1, 1],
-    ):
         m.add_pmtiles(
             votes_pmtiles,
-            style=get_status_style(j,min_year,max_year),
             visible=True,
-            opacity=o,
             tooltip=True,
         )
-elif color_choice == "Political Party":
-    for j, o in zip(
-        ["State", "County", "Municipal", "Special District"],
-        [0.8, 1, 1, 1],
-    ):
         m.add_pmtiles(
             votes_pmtiles,
-            style=get_party_landvote_style(j,min_year,max_year),
             visible=True,
-            opacity=o,
             tooltip=True,
         )
-m.add_layer_control()
-m.to_streamlit()
-party_df = get_party_df(votes)
-st.altair_chart(party_chart(party_df), use_container_width=True)
-df_funding = funding_chart(votes)
-st.altair_chart(
-    create_chart(
-        df_funding,
-        "cumulative_funding",
-        "Billions of Dollars",
-        "Cumulative Funding",
-        colors["dark_green"],
-        chart_type="bar",
-    ),
-    use_container_width=True,
-)
 st.divider()
 st.caption(

 import os
+import openai
 import altair as alt
 import leafmap.maplibregl as leafmap
 import matplotlib.pyplot as plt
 import pandas as pd
 import streamlit as st
 from utils import *
+import ibis
+from ibis import _
 st.set_page_config(
     layout="wide",
 "ℹ️ Tip: Use the slider to change the year and hover over shaded areas for measure details."
 min_year, max_year = st.slider("Select a range", 1988, 2024, (2020, 2024))
 con = ibis.duckdb.connect("duck.db", extensions=["spatial"])
 current_tables = con.list_tables()
+if "landvote" not in set(current_tables):
     tbl = (
         con.read_parquet(votes_parquet)
         .cast({"geom": "geometry"})
     )
     tbl = get_unique_rows(tbl)  # drop multi-county measures with non-unanimous party labels
+    con.create_table("landvote", tbl)
+votes = con.table("landvote")
 with st.sidebar:
     color_choice = st.radio("Color by:", ["Measure status", "Political Party"])
     social_toggle = st.toggle("Social Vulnerability Index")
     justice_toggle = st.toggle("Climate and Economic Justic")
+##### Chatbot stuff
+chatbot_container = st.container()
+with chatbot_container:
+    llm_left_col, llm_right_col = st.columns([5,1], vertical_alignment = "bottom")
+    with llm_left_col:
+        with st.popover("💬 Example Queries"):
+            '''
+            Mapping queries:
+            - Show me Republican-voting counties where conservation measures passed
+            - Show measures that failed narrowly (between 45% and 50% yes)
+            - Show me conservation measures that approved over $500 million
+            '''
+            '''
+            Exploratory data queries:
+            - Which year had the most conservation funds approved?
+            - Which state approved the largest total conservation funding?
+            - How many measures passed by jurisdiction type?
+            - Which counties voted on conservation measures most frequently?
+            - What is the median funding amount for passed measures?
+            - How often do bond measures pass compared to other finance mechanisms?
+            '''
+            st.info('If the map appears blank, queried data may be too small to see at the default zoom level. Check the table below the map, as query results will also be displayed there.', icon="ℹ️")
+    with llm_right_col:
+        llm_choice = st.selectbox("Select LLM:", llm_options, key = "llm", help = "Select which model to use.")
+        llm = llm_options[llm_choice]
+run_sql = make_run_sql(votes, llm, con)
+with chatbot_container:
+    with llm_left_col:
+        example_query = "👋 Input query here"
+        prompt = st.chat_input(example_query, key="chain", max_chars=300)
+    _,log_query_col, _ = st.columns([.001, 5,1], vertical_alignment = "top")
+    with log_query_col:
+        log_queries = st.checkbox("Save query", value = True, help = "Saving your queries helps improve this tool and guide conservation efforts. Your data is stored in a private location. For more details, see 'Why save your queries?' at the bottom of this page.")
+# new container for output so it doesn't mess with the alignment of llm options
+with st.container():
+    if prompt:
+        result = handle_llm_query(
+            prompt=prompt,
+            llm_choice=llm_choice,
+            run_sql_fn=run_sql,            # your cached function: run_sql(prompt, llm_choice)
+            log_queries=log_queries,
+            logger_fn=minio_logger,
+            log_file="landvote_query_log.csv",
+            log_bucket="shared-tpl",
+        )
+        llm_output = result["llm_output"]
+        sql_query = result["sql_query"]
+        llm_explanation = result["llm_explanation"]
+        unique_ids = result["unique_ids"]
+        llm_cols = result["llm_cols"]
+        llm_bounds = result["llm_bounds"]
+        not_mapping = result["not_mapping"]
+##### end of chatbot code
 m = leafmap.Map(
     style="positron",
     )
+# define PMTiles style dict (if we didn't already do so using the chatbot)
+if 'llm_output' in locals():
+    if not_mapping == False:
+        # filter to ids from result
+        style = llm_pmtiles_style(unique_ids, paint_fill, votes_pmtiles)
         m.add_pmtiles(
             votes_pmtiles,
+            style=style,
             visible=True,
+            opacity=1.0,
             tooltip=True,
+            name="LLM Query Results",
         )
+        # Zoom to result bounds if present
+        if "llm_bounds" in locals() and llm_bounds:
+            m.fit_bounds(llm_bounds)
+        m.to_streamlit()
+        with st.expander("🔍 View/download data"): # adding data table
+            if ('geom' in llm_output.columns) and (not llm_output.empty):
+                llm_output = llm_output.drop('geom',axis = 1)
+            st.dataframe(llm_output, use_container_width = True)
+else: # if we didn't use chatbot
+    # compute percentage passed in given year
+    year_passed, overall_passed=get_pass_stats(votes, min_year, max_year)
+    f"{year_passed}% Measures Passed between {min_year} and {max_year}"
+    f"{overall_passed}% Measures Passed from 1988 - 2024 \n"
+    if color_choice == "Measure status":
+        # 4 styles / 4 layers (jurisdiction-specific)
+        for j, o in zip(
+            ["State", "County", "Municipal", "Special District"],
+            [0.8, 1, 1, 1],
+        ):
+            m.add_pmtiles(
+                votes_pmtiles,
+                style=get_status_style(j, min_year, max_year),
+                visible=True,
+                opacity=o,
+                tooltip=True,
+                name=j,  # shows as separate toggles in layer control
+            )
+        m.to_streamlit()
+    elif color_choice == "Political Party":
+        # 1 style / 1 layer
+        style = get_party_landvote_style(min_year, max_year)
         m.add_pmtiles(
             votes_pmtiles,
+            style=style,
             visible=True,
+            opacity=1.0,
             tooltip=True,
+            name="Political Party",
         )
+        m.to_streamlit()
+    with st.expander("🔍 View/download data"): # adding data table
+        group_cols = ['landvote_id','year','state','county','municipal','jurisdiction']
+        gdf_grouped = (votes.head(100).execute().groupby(group_cols)
+            .agg({col: ('sum' if col in ['total_funds_at_stake','total_funds_approved',
+                'conservation_funds_at_stake','conservation_funds_approved'] else 'first')
+                  for col in votes.columns if col not in group_cols})).reset_index()
+        cols = ['landvote_id','year','state','county','municipal','jurisdiction',
+                'status', 'percent_yes', 'percent_no', 'date',
+                'total_funds_at_stake','total_funds_approved',
+                'conservation_funds_at_stake','conservation_funds_approved',
+                'finance_mechanism', 'other_comment','purpose',
+                'description', 'notes', 'voted_acq_measure', 'party']
+        st.dataframe(gdf_grouped[cols], use_container_width = True)
+    party_df = get_party_df(votes)
+    st.altair_chart(party_chart(party_df), use_container_width=True)
+    df_funding = funding_chart(votes)
+    st.altair_chart(
+        create_chart(
+            df_funding,
+            "cumulative_funding",
+            "Billions of Dollars",
+            "Cumulative Funding",
+            colors["dark_green"],
+            chart_type="bar",
+        ),
+        use_container_width=True,
+    )
 st.divider()
 st.caption(

app/footer.md CHANGED Viewed

	@@ -24,3 +24,15 @@ For details on methodology, please refer to the our data processing code for [La
24	- CDC 2020 Social Vulnerability Index by US Census Tract. Data: https://source.coop/repositories/cboettig/social-vulnerability/description. License: Public Domain
25
26

 - CDC 2020 Social Vulnerability Index by US Census Tract. Data: https://source.coop/repositories/cboettig/social-vulnerability/description. License: Public Domain
+#### LLMs
+This app can use a selection of open-weights language models hosted on the National Research Platform (https://nrp.ai/documentation/userdocs/ai/llm-managed/) and OpenRouter (https://openrouter.ai/models).
+---
+### Why save your queries?
+Conservation researchers and practitioners are interested in **learning what matters most to our community**.
+By saving your anonymous queries, we can identify which topics and areas are drawing the most attention, helping us improve future tools and data products to **better support conservation efforts**. We also save the LLM’s response to each query to monitor its accuracy and ensure the system is working as intended.
+You can opt out at any time by disabling “Save query”.

app/system_prompt.txt ADDED Viewed

	@@ -0,0 +1,84 @@

+You are an expert in SQL and an assistant for mapping and analyzing the Trust for Public Land (TPL) LandVote data.  You are provided multiple tables and must identify which table(s) to use. Given an input question, create a syntactically correct {dialect} query to run, and then provide an explanation of how you answered the input question. Not every query will require SQL code; users may ask for more information about values and columns in the table, which you can answer based on the information in this prompt. For these cases, your "sql_query" field should be empty.
+ONLY write SQL queries using the records and columns that exist in the relevant table. You have access to this table:
+landvote:
+- Definition: Tracks land conservation ballot measures since 1988.
+- Schema: {landvote}
+For example:
+{{
+  "sql_query": "SELECT cols FROM landvote WHERE condition = 'value';",
+  "explanation": "This query retrieves columns from landvote where the condition column equals 'value'."
+}}
+Ensure the response contains only this JSON object, with no additional text, formatting, or commentary.
+# Important Details
+    - For visualization-related queries (e.g., "show me"), ALWAYS include "landvote_id", "year","state","county","municipal","jurisdiction", and "geom" in the results.
+    - Wrap each column name in double quotes (") to denote them as delimited identifiers.
+    - Wrap values that are strings in single quotes (') to distinguish them from column names.
+# Example Questions and How to Approach Them
+## Example:
+example_user: "Show me measures that passed in Republican-voting counties"
+example_assistant: {{"sql_query":
+    SELECT DISTINCT "landvote_id", "year","state","county","municipal","jurisdiction", "geom", "party", "status" FROM landvote
+    WHERE "party" == 'Republican'
+    AND "jurisdiction"=='County'
+    AND "status" IN ('Pass','Pass*')
+    ORDER BY "year";
+"explanation":"I filtered the landvote table to county-level measures that passed and occurred in jurisdictions classified as Republican based on the most recent presidential election prior to the measure."
+}}
+## Example:
+example_user: "Show me measures that failed narrowly (between 45% and 50% yes)"
+example_assistant: {{"sql_query":
+    SELECT DISTINCT "landvote_id", "year","state","county","municipal","jurisdiction", "geom", "percent_yes", "status" FROM landvote
+    WHERE "status" == 'Fail'
+    AND CAST(REPLACE("percent_yes", '%', '') AS DOUBLE) BETWEEN 45 AND 50
+    ORDER BY "percent_yes";
+"explanation":"I selected failed measures where the percent of yes votes was between 45% and 50%, indicating a narrow margin of defeat."
+}}
+## Example:
+example_user: "Show me conservation measures that approved over $500 million"
+example_assistant: {{"sql_query":
+    SELECT DISTINCT "landvote_id", "year","state","county","municipal","jurisdiction", "geom", "conservation_funds_approved" FROM landvote
+    WHERE "conservation_funds_approved"> 500000000
+    ORDER BY "conservation_funds_approved" DESC;
+"explanation":"I filtered measures to those where the approved conservation funding exceeded $500 million."
+}}
+## Example:
+example_user: "Which year had the most conservation funds approved?"
+example_assistant: {{"sql_query":
+    SELECT "year", SUM("conservation_funds_approved") AS "total_conservation_funds_approved"
+    FROM landvote
+    GROUP BY "year"
+    ORDER BY "total_conservation_funds_approved" DESC LIMIT 1;
+"explanation":"I took the sum of conservation funds in `landvote` by year and returned the year with the most funds."
+}}
+## Example:
+example_user: "How many measures passed by jurisdiction type?"
+example_assistant: {{"sql_query":
+    SELECT "jurisdiction", COUNT(*) AS "passed_measures" FROM landvote
+    WHERE "status" IN ('Pass','Pass*')
+    GROUP BY "jurisdiction"
+    ORDER BY "passed_measures";
+"explanation":"I grouped passed measures by jurisdiction type and counted how many measures passed in each category."
+}}
+## Example:
+example_user: "How often do bond measures pass compared to other finance mechanisms?"
+example_assistant: {{"sql_query":
+    SELECT "finance_mechanism",
+    AVG(CASE WHEN "status" IN ('Pass', 'Pass*') THEN 1 ELSE 0 END) AS "pass_rate" FROM landvote
+    GROUP BY "finance_mechanism"
+    ORDER BY "pass_rate" DESC;
+"explanation":"I calculated the average pass rate for each finance mechanism by treating passed measures as 1 and failed measures as 0, allowing a comparison of how often bond measures pass relative to other mechanisms."
+}}

app/utils.py CHANGED Viewed

@@ -1,8 +1,24 @@
 import ibis
 from ibis import _
 import altair as alt
 from variables import *
 def get_unique_rows(df):
     # collapse multi-county measures to one row per landvote_id
     unique_votes = (
@@ -11,14 +27,42 @@ def get_unique_rows(df):
         .agg(
             **{c: ibis._[c].first() for c in df.schema().names if c not in ("landvote_id", "county", "party")},
             # if spans multiple counties -> set different name for county
-            county=ibis.ifelse(ibis._.county.nunique() > 1, "Multiple Counties", ibis._.county.first()),
-             # if counties differ in parties -> assign other label to party
             party=ibis.ifelse(ibis._.party.nunique() > 1, "Mixed", ibis._.party.first()),
         )
     )
     return unique_votes
 def create_chart(df, y_column, ylab, title, color, chart_type="line"):
     # color encoding - color is a list or single value
     color_encoding = (
@@ -86,6 +130,47 @@ def funding_chart(votes):
     )
 def party_style(year):
     recent_election_year = year - year % 4
@@ -116,7 +201,6 @@ def party_style(year):
     }
 # pmtiles style for status
 def get_status_style(jurisdiction, min_year, max_year):
     if jurisdiction == "State":
@@ -144,20 +228,20 @@ def get_status_style(jurisdiction, min_year, max_year):
         ]
     }
 # pmtiles style for party
-def get_party_landvote_style(jurisdiction, min_year, max_year):
     return {
         "layers": [
             {
-                "id": jurisdiction,
-                "source": jurisdiction,
                 "source-layer": "landvote_party",
                 "type": "fill",
                 "filter": [
                     "all",
                     ["<=", "year", str(max_year)],
                     [">=", "year", str(min_year)],
-                    ["==", "jurisdiction", jurisdiction],
                 ],
                 "paint": {
                     "fill-color": {
@@ -174,38 +258,210 @@ def get_party_landvote_style(jurisdiction, min_year, max_year):
     }
-def party_chart(df):
-    chart = (
-        alt.Chart(df)
-        .mark_line(point=True)
-        .encode(
-            x=alt.X("year:O", title="Year"),
-            y=alt.Y(
-                "pass_fraction:Q",
-                title="% of measures passed",
-                axis=alt.Axis(format="%"),
-            ),
-            color=alt.Color(
-                "party:N",
-                scale=alt.Scale(
-                    domain=["Democrat", "Republican"],
-                    range=[colors["dem_blue"], colors["rep_red"]],
-                ),
-                legend=alt.Legend(title="Party"),
-            ),
-            tooltip=[
-                alt.Tooltip("year:O", title="Year"),
-                alt.Tooltip("party:N", title="Party"),
-                alt.Tooltip(
-                    "pass_fraction:Q",
-                    title="% passed",
-                    format=".1%",
-                ),
-            ],
-        )
-        .properties(
-            title="Percent of Measures Passed per Year by Political Party"
-        )
-    )
-    return chart

+import os
+import re
+import datetime
 import ibis
 from ibis import _
 import altair as alt
+import minio
+import pandas as pd
+import streamlit as st
+from pydantic import BaseModel, Field
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
 from variables import *
+# -----------------------------
+# Data wrangling utils
+# -----------------------------
 def get_unique_rows(df):
     # collapse multi-county measures to one row per landvote_id
     unique_votes = (
         .agg(
             **{c: ibis._[c].first() for c in df.schema().names if c not in ("landvote_id", "county", "party")},
             # if spans multiple counties -> set different name for county
+            county=ibis.ifelse(ibis._.county.nunique() > 1, "Multiple Counties", ibis._.county.first()),
+             # if counties differ in parties -> assign other label to party
             party=ibis.ifelse(ibis._.party.nunique() > 1, "Mixed", ibis._.party.first()),
         )
     )
     return unique_votes
+def get_pass_stats(df, min_year, max_year):
+    """Return pass percentages for a year range and for the whole table.
+
+    Args:
+        df: Table expression with ``year`` and ``status`` columns that supports
+            ``.filter(...).count().execute()``.
+        min_year: First year of the range (inclusive).
+        max_year: Last year of the range (inclusive).
+
+    Returns:
+        Tuple ``(year_passed, overall_passed)`` of percentages rounded to two
+        decimals. A measure counts as passed when ``status`` is "Pass" or
+        "Pass*". A percentage is 0.0 when there are no rows to divide by.
+    """
+    def _percent(passed_count, total_count):
+        # Guard the empty case so an empty year range (or table) does not divide by zero.
+        if not total_count:
+            return 0.0
+        return round(passed_count / total_count * 100, 2)
+    # Build the year-range filter once and reuse it for both counts.
+    in_range = df.filter((_.year >= min_year) & (_.year <= max_year))
+    passed_year = in_range.filter(_.status.isin(["Pass", "Pass*"])).count().execute()
+    total_year = in_range.count().execute()
+    # compute percentage passed over entire dataset
+    passed = df.filter(_.status.isin(["Pass", "Pass*"])).count().execute()
+    total = df.count().execute()
+    return _percent(passed_year, total_year), _percent(passed, total)
+def extract_columns(sql_query):
+    """Return the distinct double-quoted names found in ``sql_query``, in first-seen order."""
+    quoted_names = []
+    for name in re.findall(r'"(.*?)"', sql_query):
+        # Keep only the first occurrence so the original ordering is preserved.
+        if name not in quoted_names:
+            quoted_names.append(name)
+    return quoted_names
+# -----------------------------
+# Chart utils
+# -----------------------------
 def create_chart(df, y_column, ylab, title, color, chart_type="line"):
     # color encoding - color is a list or single value
     color_encoding = (
     )
+def party_chart(df):
+    """Build a line chart of the yearly pass fraction, one colored line per party.
+
+    Args:
+        df: Data passed to ``alt.Chart``; the encodings read its ``year``,
+            ``pass_fraction`` and ``party`` fields.
+
+    Returns:
+        The Altair chart object.
+    """
+    # Fixed color mapping for the two parties.
+    party_scale = alt.Scale(
+        domain=["Democrat", "Republican"],
+        range=[colors["dem_blue"], colors["rep_red"]],
+    )
+    x_axis = alt.X("year:O", title="Year")
+    y_axis = alt.Y(
+        "pass_fraction:Q",
+        title="% of measures passed",
+        axis=alt.Axis(format="%"),
+    )
+    line_color = alt.Color(
+        "party:N",
+        scale=party_scale,
+        legend=alt.Legend(title="Party"),
+    )
+    hover = [
+        alt.Tooltip("year:O", title="Year"),
+        alt.Tooltip("party:N", title="Party"),
+        alt.Tooltip("pass_fraction:Q", title="% passed", format=".1%"),
+    ]
+    lines = alt.Chart(df).mark_line(point=True).encode(
+        x=x_axis,
+        y=y_axis,
+        color=line_color,
+        tooltip=hover,
+    )
+    return lines.properties(
+        title="Percent of Measures Passed per Year by Political Party"
+    )
+# -----------------------------
+# Mapping / style utils
+# -----------------------------
 def party_style(year):
     recent_election_year = year - year % 4
     }
 # pmtiles style for status
 def get_status_style(jurisdiction, min_year, max_year):
     if jurisdiction == "State":
         ]
     }
 # pmtiles style for party
+def get_party_landvote_style(min_year, max_year):
     return {
         "layers": [
             {
+                "id": "party",
+                "source": "landvote",
                 "source-layer": "landvote_party",
                 "type": "fill",
                 "filter": [
                     "all",
                     ["<=", "year", str(max_year)],
                     [">=", "year", str(min_year)],
                 ],
                 "paint": {
                     "fill-color": {
     }
+def llm_pmtiles_style(ids, paint, pmtiles):
+    """Build a style dict showing only features whose ``landvote_id`` is in ``ids``.
+
+    Args:
+        ids: Iterable of ids; each is converted to ``str`` for the filter.
+        paint: Paint dict placed on the fill layer unchanged.
+        pmtiles: Path or URL of the PMTiles archive.
+
+    Returns:
+        Style dict with one vector source ("tpl") and one fill layer.
+    """
+    # Layer name = file name without its extension, with every run of
+    # non-word characters (e.g. hyphens) removed.
+    file_stem, _ext = os.path.splitext(os.path.basename(pmtiles))
+    layer_name = re.sub(r"\W+", "", file_stem)
+    id_strings = list(map(str, ids))
+    tpl_source = {
+        "type": "vector",
+        "url": "pmtiles://" + pmtiles,
+        "attribution": "TPL",
+    }
+    fill_layer = {
+        "id": "tpl",
+        "source": "tpl",
+        "source-layer": layer_name,
+        "type": "fill",
+        "filter": ["in", ["get", "landvote_id"], ["literal", id_strings]],
+        "paint": paint,
+    }
+    return {
+        "version": 8,
+        "sources": {"tpl": tpl_source},
+        "layers": [fill_layer],
+    }
+@st.cache_resource(show_spinner=False)
+def get_con(db_path: str = "duck.db"):
+    """Open a DuckDB connection at ``db_path`` with the "spatial" extension requested.
+
+    Decorated with ``st.cache_resource``.
+    """
+    connection = ibis.duckdb.connect(db_path, extensions=["spatial"])
+    return connection
+# -----------------------------
+# Chatbot utils
+# -----------------------------
+class SQLResponse(BaseModel):
+    """Defines the structure for SQL response."""
+    # NOTE: this class is handed to llm.with_structured_output(...) in make_run_sql;
+    # the docstring and Field descriptions are presumably part of the schema the
+    # model sees, so treat them as runtime text rather than plain comments.
+    # SQL text produced by the model; run_sql treats a falsy value as "no SQL generated".
+    sql_query: str = Field(description="The SQL query generated by the assistant.")
+    # Free-text explanation returned alongside the query.
+    explanation: str = Field(description="A detailed explanation of how the SQL query answers the input question.")
+@st.cache_data(show_spinner=False)
+def _load_template(path: str = "app/system_prompt.txt") -> str:
+    """Read and return the system prompt template stored at ``path``.
+
+    The file is decoded as UTF-8 explicitly so the prompt text does not depend
+    on the platform's default encoding. Decorated with ``st.cache_data``.
+
+    Raises:
+        OSError: If the file cannot be opened.
+    """
+    with open(path, "r", encoding="utf-8") as f:
+        return f.read()
+def make_run_sql(votes, llm, con, template_path: str = "app/system_prompt.txt"):
+    """
+    Build and return a ``run_sql(query, llm_choice)`` callable wrapped in ``st.cache_data``.
+
+    The returned function closes over ``con`` and a chain assembled here: the
+    system prompt template (with ``dialect`` and the ``votes`` schema filled in)
+    piped into ``llm`` configured for ``SQLResponse`` structured output.
+
+    ``run_sql`` returns ``(result, sql_query, explanation)``:
+    - no SQL produced: an empty frame with a ``landvote_id`` column, ``""``, the explanation;
+    - SQL produced but no rows: the empty result (without ``geom``), the SQL,
+      and a fixed "no results" message;
+    - otherwise: the de-duplicated query result, the SQL, the explanation.
+
+    ``llm_choice`` is not read inside ``run_sql``; presumably it is there so
+    cached results are kept separate per model.
+
+    NOTE(review): the SQL text comes from the model and is passed to ``con.sql`` as-is.
+    """
+    system_text = _load_template(template_path)
+    messages = [("system", system_text), ("human", "{input}")]
+    chat_prompt = ChatPromptTemplate.from_messages(messages).partial(
+        dialect="duckdb", landvote=votes.schema()
+    )
+    # Ensure tools/structured output is not streaming
+    chain = chat_prompt | llm.bind(streaming=False).with_structured_output(SQLResponse)
+    @st.cache_data(show_spinner=False)
+    def run_sql(query: str, llm_choice: str):
+        response = chain.invoke({"input": query})
+        generated_sql = response.sql_query
+        explanation = response.explanation
+        if not generated_sql:
+            return pd.DataFrame({"landvote_id": []}), "", explanation
+        frame = con.sql(generated_sql).distinct().execute()
+        if not frame.empty:
+            return frame, generated_sql, explanation
+        # Empty result: swap in the fixed message and drop the geometry column if present.
+        explanation = "This query did not return any results. Please try again with a different query."
+        if "geom" in frame.columns:
+            frame = frame.drop("geom", axis=1)
+        return frame, generated_sql, explanation
+    return run_sql
+def handle_llm_query(
+    prompt: str,
+    llm_choice: str,
+    run_sql_fn,
+    log_queries: bool,
+    logger_fn,
+    log_file: str = "landvote_query_log.csv",
+    log_bucket: str = "shared-tpl",
+):
+    """
+    Run the LLM->SQL pipeline, render the chat output in Streamlit, log the
+    query, and return the pieces the map layer needs.
+
+    Args:
+        prompt: The user's question. A falsy prompt short-circuits with an
+            empty, non-mapping result; nothing is rendered or logged.
+        llm_choice: Label of the selected model; forwarded to run_sql_fn and logger_fn.
+        run_sql_fn: Callable ``(prompt, llm_choice) -> (result, sql_query, explanation)``.
+        log_queries: Consent flag forwarded as logger_fn's first argument.
+        logger_fn: Callable invoked once per query with
+            ``(log_queries, prompt, sql_query, explanation, llm_choice, log_file, log_bucket)``.
+        log_file: Log file name forwarded to logger_fn.
+        log_bucket: Bucket name forwarded to logger_fn.
+
+    Returns:
+        dict with keys "llm_output", "sql_query", "llm_explanation",
+        "unique_ids", "llm_cols", "llm_bounds" and "not_mapping".
+        "llm_bounds" is None when the result exposes no ``total_bounds``.
+
+    Note:
+        Calls ``st.stop()`` when SQL was generated but returned no rows, so
+        the function does not return in that case.
+    """
+    not_mapping = False
+    unique_ids, llm_cols, llm_bounds = [], [], None
+    if not prompt:
+        return {
+            "llm_output": None,
+            "sql_query": "",
+            "llm_explanation": "",
+            "unique_ids": [],
+            "llm_cols": [],
+            "llm_bounds": None,
+            "not_mapping": True,
+        }
+    st.chat_message("user").write(prompt)
+    with st.chat_message("assistant"):
+        with st.spinner("Invoking query..."):
+            llm_output, sql_query, llm_explanation = run_sql_fn(prompt, llm_choice)
+            # Log (keep your exact signature)
+            logger_fn(
+                log_queries,
+                prompt,
+                sql_query,
+                llm_explanation,
+                llm_choice,
+                log_file,
+                log_bucket,
+            )
+            # No SQL generated
+            if sql_query == "":
+                st.success(llm_explanation)
+                not_mapping = True
+            else:
+                # SQL generated but no results
+                if llm_output is not None and llm_output.empty:
+                    st.warning(llm_explanation, icon="⚠️")
+                    st.caption("SQL Query:")
+                    st.code(sql_query, language="sql")
+                    st.stop()
+                # Output without mapping columns
+                elif llm_output is not None and ("landvote_id" not in llm_output.columns and "geom" not in llm_output.columns):
+                    st.write(llm_output)
+                    not_mapping = True
+                # Always show explanation + SQL in a popover
+                with st.popover("Explanation"):
+                    st.write(llm_explanation)
+                    st.caption("SQL Query:")
+                    st.code(sql_query, language="sql")
+            # Extract ids, columns, bounds if present
+            if llm_output is not None and ("landvote_id" in llm_output.columns) and (not llm_output.empty):
+                unique_ids = list(set(llm_output["landvote_id"].tolist()))
+                llm_cols = extract_columns(sql_query)
+                # A plain pandas DataFrame has no `total_bounds`, so a result with
+                # "landvote_id" but no geometry used to raise AttributeError here.
+                bounds = getattr(llm_output, "total_bounds", None)
+                llm_bounds = bounds.tolist() if bounds is not None else None
+            else:
+                unique_ids, llm_cols, llm_bounds = [], [], None
+                not_mapping = True
+    return {
+        "llm_output": llm_output,
+        "sql_query": sql_query,
+        "llm_explanation": llm_explanation,
+        "unique_ids": unique_ids,
+        "llm_cols": llm_cols,
+        "llm_bounds": llm_bounds,
+        "not_mapping": not_mapping,
+    }
+# -----------------------------
+# Logging utils
+# -----------------------------
+# Credentials for the MinIO log bucket: read the environment variable first and
+# fall back to Streamlit secrets only when the variable is not set at all.
+minio_key = os.getenv("MINIO_KEY")
+minio_key = st.secrets["MINIO_KEY"] if minio_key is None else minio_key
+minio_secret = os.getenv("MINIO_SECRET")
+minio_secret = st.secrets["MINIO_SECRET"] if minio_secret is None else minio_secret
+def minio_logger(consent, query, sql_query, llm_explanation, llm_choice, filename="landvote_query_log.csv", bucket="shared-tpl",
+                 key=minio_key, secret=minio_secret,
+                 endpoint="minio.carlboettiger.info"):
+    """Append one row to the CSV query log kept in a MinIO bucket.
+
+    The log object is downloaded to a local file named ``filename``, one row is
+    appended, and the file is uploaded back to the same bucket/object.
+
+    Args:
+        consent: When truthy, the query, SQL, explanation and model label are
+            stored; otherwise a timestamped 'USER OPTED OUT' row with empty
+            fields is stored instead.
+        query: The user's question.
+        sql_query: SQL text generated for the question.
+        llm_explanation: Explanation returned with the SQL.
+        llm_choice: Label of the model used.
+        filename: Object name in the bucket, also used as the local file name.
+        bucket: Bucket holding the log.
+        key, secret: MinIO credentials (defaults are bound at import time).
+        endpoint: MinIO host.
+    """
+    client = minio.Minio(endpoint, key, secret)
+    # Fetch the current log and load it before building the new row.
+    client.fget_object(bucket, filename, filename)
+    existing = pd.read_csv(filename)
+    stamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
+    if consent:
+        fields = (query, sql_query, llm_explanation, llm_choice)
+    else:
+        # if user opted out, do not store query
+        fields = ('USER OPTED OUT', '', '', '')
+    user_query, llm_sql, explanation, choice = fields
+    entry = pd.DataFrame({
+        "timestamp": [stamp],
+        "user_query": [user_query],
+        "llm_sql": [llm_sql],
+        "llm_explanation": [explanation],
+        "llm_choice": [choice],
+    })
+    pd.concat([existing, entry]).to_csv(filename, index=False, header=True)
+    client.fput_object(bucket, filename, filename, content_type="text/csv")

app/variables.py CHANGED Viewed

@@ -124,3 +124,62 @@ party_pmtiles = (
     "https://minio.carlboettiger.info/public-election/"
     "county/county_political_parties_1988-2024.pmtiles"
 )

     "https://minio.carlboettiger.info/public-election/"
     "county/county_political_parties_1988-2024.pmtiles"
 )
+from langchain_openai import ChatOpenAI
+import streamlit as st
+from langchain_openai.chat_models.base import BaseChatOpenAI
+## dockerized streamlit app wants to read from os.getenv(), otherwise use st.secrets
+import os
+def _secret(name):
+    """Return the environment variable ``name``, or ``st.secrets[name]`` when it is unset."""
+    value = os.getenv(name)
+    return st.secrets[name] if value is None else value
+api_key = _secret("NRP_API_KEY")
+openrouter_api = _secret("OPENROUTER_API_KEY")
+openrouter_endpoint="https://openrouter.ai/api/v1"
+nrp_endpoint="https://ellm.nrp-nautilus.io/v1"
+# don't use a provider that collects data
+data_policy = {"provider": {"data_collection": "deny"}}
+def _openrouter_llm(model):
+    """Build a ChatOpenAI client for an OpenRouter-hosted model with the data policy attached."""
+    return ChatOpenAI(
+        model=model,
+        api_key=openrouter_api,
+        base_url=openrouter_endpoint,
+        temperature=0,
+        extra_body=data_policy
+    )
+# Display label -> configured chat model; the first three use OpenRouter, the last uses NRP.
+llm_options = {
+    "devstral-2512": _openrouter_llm("mistralai/devstral-2512:free"),
+    "trinity-mini": _openrouter_llm("arcee-ai/trinity-mini:free"),
+    "nemotron-nano-9b-v2": _openrouter_llm("nvidia/nemotron-nano-9b-v2:free"),
+    "gemma-3-27b-it": ChatOpenAI(
+        model="gemma3",
+        api_key=api_key,
+        base_url=nrp_endpoint,
+        temperature=0
+    ),
+}

requirements.txt CHANGED Viewed

@@ -4,7 +4,11 @@ duckdb==1.2.2
 duckdb_engine== 0.15.0
 geoarrow-types==0.2.0
 geoarrow-pandas==0.1.1
-ibis-framework[duckdb]==10.3.1
 leafmap==0.53.3
 minio==7.2.15
 maplibre==0.3.3

 duckdb_engine== 0.15.0
 geoarrow-types==0.2.0
 geoarrow-pandas==0.1.1
+ibis-framework==10.3.1
+langchain==0.2.17
+langchain-community==0.2.19
+langchain-core==0.2.43
+langchain-openai==0.1.25
 leafmap==0.53.3
 minio==7.2.15
 maplibre==0.3.3