analyst-buddy / server /app_ui.py
hjerpe's picture
Deploy analyst-buddy (Gradio app + serving)
2d58789 verified
Raw
History Blame Contribute Delete
108 kB
"""analyst-buddy — ask your data in plain English, locally (F004).
The single-column Gradio demo (the judged deliverable). It streams the data
exploration as collapsible chat steps, shows the result table, and keeps **the
query it wrote one click away from every answer — on success AND on error** (the
P0 element of the experience brief). It runs with no model by default via an
injected deterministic demo-script policy; the real model swaps in at F006 through the
``build_demo(policy_factory=...)`` seam.
This is the ONLY gradio-aware module. The streaming runner (``agent_loop``) and
the SQL machinery stay gradio-free. Keep model/RL/exploration jargon OUT of the
primary surface (Content Contract): the chat titles speak plain English.
This is the single canonical implementation module (``sql_env.server.app_ui``).
The repo-root ``app.py`` is a thin shim re-exporting ``build_demo`` so there is
exactly ONE ``Step``/class identity in the process (no dual-import footgun).
Run locally: ``uv run python app.py``.
"""
from __future__ import annotations
from collections.abc import Callable, Iterable, Iterator
import dataclasses
import html
import json
from pathlib import Path
import re
import altair as alt
import gradio as gr
import pandas as pd
try:
from ..evaluation.policies import Policy
from ..models import SQLAction, SQLObservation
from .agent_loop import Step, run_agent_turn
from .chart_intent import ChartSpec, parse_chart_intent, validate_intent
from .data_card import DataCard, apply_card_edits, propose_data_card
from .feedback import feedback_summary, record_feedback
from .ingestion import ingest_csv
from .sql_environment import SQLEnvironment
from .sql_ident import quote_ident
from . import personas
except ImportError: # pragma: no cover - flat-layout / direct-run fallback
from evaluation.policies import Policy # type: ignore[no-redef]
from models import SQLAction, SQLObservation # type: ignore[no-redef]
from server.agent_loop import Step, run_agent_turn # type: ignore[no-redef]
from server.chart_intent import ( # type: ignore[no-redef]
ChartSpec,
parse_chart_intent,
validate_intent,
)
from server.data_card import ( # type: ignore[no-redef]
DataCard,
apply_card_edits,
propose_data_card,
)
from server.feedback import ( # type: ignore[no-redef]
feedback_summary,
record_feedback,
)
from server.ingestion import ingest_csv # type: ignore[no-redef]
from server.sql_environment import SQLEnvironment # type: ignore[no-redef]
from server.sql_ident import quote_ident # type: ignore[no-redef]
from server import personas # type: ignore[no-redef]
# ---------------------------------------------------------------------------
# Demo dataset + paths (the curated retail-SMB set from F009 — Maria's pet shop).
# ---------------------------------------------------------------------------
_REPO_ROOT = Path(__file__).resolve().parents[1]
_DB_ROOT = _REPO_ROOT / "data" / "databases"
_UPLOAD_ROOT = _REPO_ROOT / "data" / "uploads"
# A small COMMITTED demo questions file (mirrors the env's questions-file schema)
# so the env constructor's gold-path validation passes WITHOUT writing into the
# user-upload dir (Q2). The demo only ever uses begin_episode/step (non-gold).
_QUESTIONS_PATH = _REPO_ROOT / "data" / "questions.json"
_DEMO_QUESTIONS_PATH = _REPO_ROOT / "data" / "demo_questions.json"
_DEMO_DB_ID = "retail_smb"
# Cap the agent's tool-call turns. Kept tight so a weak model (e.g. vanilla 0.6B)
# that can't form a valid query can't spin for long on parse-failure retries — a
# capable model answers single/2-table questions in ~3-6 steps (training used 10).
_STEP_BUDGET = 8
# Example decision-questions (framed as owner decisions, no jargon), each scripted
# by _DemoScriptPolicy below against the retail_smb dataset (Maria's pet shop).
# One click each. Question 2 intentionally shows an error → self-correction beat
# (the P0 "SQL shown on error" demo) — see _DEMO_SCRIPTS.
_EXAMPLE_QUESTIONS = [
"Which store has the most orders?",
"What was my revenue each month?",
"Which product makes the most revenue?",
]
_HEADLINE = "# Ask your data — in plain English, on your own machine"
_SUBHEAD = (
"Type a question about your data and get an answer, the result table, and "
"**the exact query it wrote** — in seconds. No SQL or spreadsheets needed."
)
_PRIVACY_CUE = (
"🔒 **Runs locally — your data stays here.** Nothing is uploaded to the cloud."
)
# Hosted-Space copy (F006/F008): on a ZeroGPU Space the model runs in the cloud, so
# the "runs locally / nothing uploaded" claim above is NOT true. Keep the honest
# differentiator — a small open model you COULD run yourself — without claiming this
# hosted demo is local. Used only when enable_model_selector is on.
_HEADLINE_HOSTED = "# Ask your data — in plain English"
_PRIVACY_CUE_HOSTED = (
"🤗 **A small open model (Qwen3) — small enough to run on your own machine.** "
"This is the hosted demo, running on Hugging Face so you can try it."
)
_EMPTY_PROVENANCE = "_Ask a question to see the answer and the query it wrote._"
# Hero (v3 design handover "Copy locked"): a segmented pill carrying the
# differentiation (fine-tuned for SQL) + the two-line headline + Maria's tagline.
# Rendered as ONE gr.HTML (static, no GPU); styling in _APP_CSS (.ab-hero/.ab-pill).
# The pill claims are CAPABILITY-framed, not deployment claims: a "100% local /
# data never leaves your laptop" badge is FALSE on the hosted ZeroGPU Space (the
# question + data are sent to the cloud to run there). "Open & tiny / small enough
# for your laptop" is true everywhere — it describes the MODEL, not where this demo
# happens to run — so both variants share it.
_TAGLINE = "A tiny AI analyst reads your tables, writes the SQL, and shows its work."
_HEADLINE_HTML = "<h1>Ask your data,<br>get answers in plain English.</h1>"
_HERO_PILL = (
'<div class="ab-pill">'
"<span>Open &amp; tiny</span>"
'<span class="mid">Qwen3-1.7B, fine-tuned for SQL</span>'
"<span>small enough for your laptop</span>"
"</div>"
)
_HERO_HTML_LOCAL = (
'<div class="ab-hero">' + _HERO_PILL + _HEADLINE_HTML + f"<p>{_TAGLINE}</p></div>"
)
_HERO_HTML_HOSTED = _HERO_HTML_LOCAL
# Judge-facing GRPO story — one click away (v3: "Why a tiny model is enough"),
# rendered as a self-contained DARK card (theme-independent) per the designer's mock:
# an amber eyebrow + light body with white emphasis.
# "What we're building" card (v4, judge-facing): names the product, states the
# HONEST work-in-progress status (incl. the v3 overfit regression), and carries the
# measured 4%→49% proof. Copy from a judged design panel; hosted-honest (the laptop
# line is a CAPABILITY — the cloud demo does NOT claim to run locally).
_BUILD_HEADER = "A pocket-sized analyst, fine-tuned"
_BUILD_WHAT = (
"We're building a tiny SQL analyst for shop owners: ask a question about your "
"business data and it reads your tables, writes the SQL, checks its own answer, "
"and shows its work. We took a small open model (Qwen3-1.7B) and fine-tuned it "
"(SFT + GRPO) for this one job. It's small enough to run on your own laptop."
)
_BUILD_PROOF = (
"On the app's sample business questions, an off-the-shelf Qwen3-1.7B gets "
"<b>~4%</b> correct. After fine-tuning the same model gets <b>~49%</b>, an "
"improvement of about 11×, and it improves on every sample dataset "
"(pet-shop retail: 0% → 42%)."
)
_BUILD_WIP = (
"Current status: one- and two-table questions are reliable, three-table joins "
"are still a weak spot, and a longer training run showed signs of overfitting "
"that we're analyzing."
)
_HEAD2HEAD_INTRO = (
"A side-by-side on the same question: the same tiny model before and after "
"fine-tuning. Off-the-shelf, it gives up or guesses wrong. Fine-tuned, it "
"explores the data and answers."
)
# The "what we're building" body (rendered inside the always-DARK card by
# ``_why_card_html`` below, optionally with the head-to-head showcase nested in).
_WHY_CARD_BODY = (
'<div class="eyebrow">What we\'re building</div>'
f"<h3>{_BUILD_HEADER}</h3>"
f"<p>{_BUILD_WHAT}</p>"
f'<p class="ab-proof">{_BUILD_PROOF}</p>'
f'<p class="ab-wip">{_BUILD_WIP}</p>'
)
# Honest roadmap strip (v3 locked — deepen the agent, never train on user data).
_ROADMAP_MD = (
"**What's next:** harder multi-table questions · more SQL dialects · the feedback "
"loop sharpening the agent's SQL skill."
)
# Single semantic accent (Visual Direction: "a single accent"). The shipped
# orange-500 (#F97316) gives white-on-accent only 2.80:1 — fails WCAG AA
# (needs >= 4.5:1). #C2410C (Tailwind orange-700) gives white-on-accent
# 5.18:1 (computed via the WCAG relative-luminance formula), comfortably AA.
# The hover shade (#9A3412, orange-800) is darker still (7.31:1), so the CTA
# stays AA-compliant in every interaction state.
_ACCENT = "#C2410C"
_ACCENT_HOVER = "#9A3412"
# Custom CSS: wrap the inline trace SQL on narrow viewports (FR5 — the SQL was
# clipping on mobile inside the chat accordion; the dedicated panel already shows
# it in full, this is polish so the inline copy wraps instead of overflowing).
# Head-to-head showcase: pre-baked base-vs-fine-tuned traces (the judge proof),
# generated offline (scripts/replay.py compare → data/showcase/…json) and rendered
# STATIC (no GPU). One hero example shown in full + a compact "same pattern" list.
_SHOWCASE_PATH = _REPO_ROOT / "data" / "showcase" / "retail_smb_base_vs_v2.json"
def _load_showcase() -> list[dict]:
"""The curated base-vs-fine-tuned examples (empty list if the file is missing)."""
try:
data = json.loads(_SHOWCASE_PATH.read_text(encoding="utf-8"))
return data if isinstance(data, list) else []
except (OSError, json.JSONDecodeError):
return []
def _trace_html(trace: list[dict]) -> str:
"""One model's tool-call trace as compact rows (action · argument · result)."""
rows = []
for s in trace:
act = html.escape(str(s.get("action", "")))
arg = html.escape(str(s.get("argument", ""))[:64])
res = html.escape(str(s.get("result", ""))[:42])
rows.append(
f'<div class="ab-step"><span class="act">{act}</span>'
f'<code>{arg}</code><span class="res">{res}</span></div>'
)
return "".join(rows)
def _showcase_html(examples: list[dict]) -> str:
"""The head-to-head contrast: one hero example in full + a compact win list."""
if not examples:
return ""
hero = examples[0]
def _col(side: dict, label: str, cls: str) -> str:
ans = html.escape(str(side.get("final_answer")))
return (
f'<div class="ab-col {cls}"><div class="ab-col-h">{label}</div>'
f'{_trace_html(side.get("trace", []))}'
f'<div class="ab-final">answered <b>{ans}</b></div></div>'
)
out = [f'<p class="ab-h2h-intro">{_HEAD2HEAD_INTRO}</p>']
out.append(
'<div class="ab-h2h-hero">'
f'<div class="ab-h2h-q">“{html.escape(hero["question"])}” '
'<span class="ab-gold">correct answer: '
f'{html.escape(str(hero["gold_answer"]))}</span></div>'
'<div class="ab-h2h-cols">'
+ _col(hero["base"], "Off-the-shelf ✗", "ab-base")
+ _col(hero["v2"], "Fine-tuned ✓", "ab-v2")
+ "</div></div>"
)
if len(examples) > 1:
items = []
for ex in examples[1:]:
base_ans = ex["base"].get("final_answer")
base_ans = "gave up" if base_ans in (None, "—", "") else str(base_ans)[:22]
v2_ans = html.escape(str(ex["v2"].get("final_answer"))[:22])
items.append(
f'<li>“{html.escape(ex["question"])}” — '
f'off-the-shelf <span class="ab-x">{html.escape(base_ans)}</span> · '
f'ours <span class="ab-ok">{v2_ans} ✓</span></li>'
)
out.append(
'<div class="ab-h2h-more"><div class="ab-col-h">'
"More questions, same pattern</div><ul>" + "".join(items) + "</ul></div>"
)
return f'<div class="ab-h2h">{"".join(out)}</div>'
def _why_card_html(examples: list[dict]) -> str:
"""The always-DARK "what we're building" card — the build copy PLUS (when present)
the base-vs-fine-tuned head-to-head NESTED inside it.
The showcase's contrast colours (white question, light intro, the win list) are
tuned for a dark surface; nesting them in the dark card is what lets them read in
BOTH light and dark themes (the card is intentionally dark in both — the same
decision as the original "why a tiny model" card). Rendered outside the card, the
light-on-light text vanished in light mode.
"""
body = _WHY_CARD_BODY
if examples:
body += _showcase_html(examples)
return f'<div class="ab-why-card">{body}</div>'
_APP_CSS = """
/* Theme-tuned accent: structural surfaces use Gradio's own theme variables (so they
flip light/dark automatically); only the rust accent is hard-coded — and bumped
BRIGHTER under .dark so it reads on navy. --ab-soft/--ab-soft-line are the warm
translucent surface + border shared by the pill, accordions, data card + feedback
bar. (Design handover: light+dark.) */
:root {
--ab-rust: #bf3e11; --ab-rust-ink: #8f3b1c;
--ab-soft: #fdf3ee; --ab-soft-line: #e7d2c7;
}
.dark {
--ab-rust: #f0863a; --ab-rust-ink: #f0a868;
--ab-soft: rgba(240, 134, 58, 0.10); --ab-soft-line: rgba(240, 134, 58, 0.28);
}
.gradio-container {
width: 100% !important;
max-width: 860px !important;
margin: 0 auto !important;
}
footer { display: none !important; }
.message-row pre, .message-row code {
white-space: pre-wrap !important;
word-break: break-word !important;
overflow-wrap: anywhere !important;
}
@media (max-width: 768px) {
.gradio-container { padding-left: 8px !important; padding-right: 8px !important; }
}
/* Subtle, theme-ADAPTIVE page background: a faint warm glow at the top, layered
(translucent) over whatever the native light/dark background is — so the content
"floats" without re-hardcoding light. Reads neat on both themes. */
body, gradio-app {
background-image: radial-gradient(
130% 55% at 50% -8%, rgba(191, 62, 17, 0.06), transparent 60%
) !important;
background-attachment: fixed !important;
}
/* 1. No double-box: neutralize the top "card" so the hero/ask/chips flow on the
page — only the answer card + accordions stay framed. gr.Group wraps its content
in an inner `.styler` that carries the slate group surface, so we transparent
BOTH (the group div + its styler) or the panel lingers behind the hero. */
.ab-top-card,
.ab-top-card > .styler {
border: none !important; background: transparent !important;
box-shadow: none !important;
}
.ab-top-card { padding: 8px 0 0 !important; }
.ab-top-card > .styler { padding: 0 !important; }
/* Animated "working" indicator — a real CSS spinner, shown while an ask runs. */
.ab-status {
display: flex; align-items: center; gap: 10px;
color: var(--ab-rust); font-weight: 500; padding: 6px 2px;
}
.ab-spinner {
width: 18px; height: 18px; flex: none;
border: 2.5px solid var(--ab-soft-line);
border-top-color: var(--ab-rust); border-radius: 50%;
animation: ab-spin 0.7s linear infinite;
}
@keyframes ab-spin { to { transform: rotate(360deg); } }
/* Theme: we do NOT force light — the app follows the user's light/dark setting.
Only RUST accents + a few translucent-warm surfaces (readable on either bg);
structural text/surfaces use Gradio's own theme variables so nothing inverts. */
/* Hero: text adapts to the theme; the pill is a warm accent that reads on both. */
.ab-hero { text-align: center; padding: 6px 0 2px; }
.ab-hero h1 {
font-size: 32px; font-weight: 700; letter-spacing: -0.025em;
color: var(--body-text-color); line-height: 1.12; margin: 2px 0 12px;
}
.ab-hero p {
font-size: 17px; color: var(--body-text-color-subdued); line-height: 1.5;
max-width: 540px; margin: 0 auto;
}
/* 2. Segmented pill — theme-tuned rust on the warm translucent surface. */
.ab-pill {
display: flex; border: 1px solid var(--ab-soft-line); background: var(--ab-soft);
border-radius: 999px; overflow: hidden; margin-bottom: 18px;
}
.ab-pill > span {
flex: 1; display: flex; align-items: center; justify-content: center;
text-align: center; padding: 10px 16px; font-size: 13px; font-weight: 600;
color: var(--ab-rust-ink); border-right: 1px solid var(--ab-soft-line);
}
.ab-pill > span:last-child { border-right: 0; }
.ab-pill > span.mid { color: var(--ab-rust); font-weight: 700; }
/* Buttons: primary = AA rust; secondary chips get a rust hover/selected accent. */
.gradio-container button.primary {
background: #C2410C !important; color: #ffffff !important;
border: none !important; border-radius: 12px !important;
}
.gradio-container button.primary:hover { background: #9A3412 !important; }
.gradio-container button.secondary:hover,
.gradio-container button.secondary:focus,
.gradio-container button.secondary:active {
border-color: var(--ab-rust) !important; color: var(--ab-rust) !important;
}
/* Component labels ("Result", the SQL block label): quiet, not a loud chip.
The SQL code label was a bright-orange filled pill — calm it to subdued text. */
.gradio-container .block-label,
.gradio-container .ab-sql .block-label,
.gradio-container .ab-sql label,
.gradio-container .ab-sql [class*="label"] {
background: var(--block-background-fill) !important;
color: var(--body-text-color-subdued) !important; font-weight: 600;
}
.gradio-container .ab-sql .block-label svg { color: var(--body-text-color-subdued); }
/* The SQL panel is one short query: "copy" (paste it into your own DB tool) is
useful, but the built-in "download" just saves a stray file.txt — hide it, keep
copy. */
.ab-sql button[title="Download"], .ab-sql a.download-link { display: none !important; }
/* 4. Accordion headers: boxed, ONE rust chevron that rotates on open, connected
panel. The warm translucent tint reads on light AND dark (theme-tuned vars). */
.gradio-container .label-wrap {
border: 1px solid var(--ab-soft-line) !important;
background: var(--ab-soft) !important; border-radius: 12px !important;
padding: 13px 18px !important; font-weight: 600 !important;
}
.gradio-container .label-wrap .icon {
color: var(--ab-rust) !important; transition: transform 0.2s ease;
transform: rotate(0deg) !important; /* closed: clean ▼ (not a sideways ◄) */
}
.gradio-container .label-wrap.open .icon { transform: rotate(180deg) !important; }
.gradio-container .label-wrap.open { border-radius: 12px 12px 0 0 !important; }
/* Kill the harsh default focus ring that rendered as a stark black/white box around
the focused accordion (the whole block, via :focus-within) — keep a warm border. */
.gradio-container .label-wrap:focus,
.gradio-container .label-wrap:focus-visible { outline: none !important; }
.gradio-container .block:has(.label-wrap) {
border-color: var(--ab-soft-line) !important;
}
.gradio-container .block:has(.label-wrap):focus-within {
box-shadow: none !important; outline: none !important;
}
.gradio-container .label-wrap + div {
border: 1px solid var(--border-color-primary); border-top: none;
border-radius: 0 0 12px 12px; padding: 14px 18px;
}
/* Ask row: a roomy input + an inline rust button. A subtle shaded fill + clearer
border + inner shadow make it obviously the place to type (it was blending into
the page). */
.ab-ask-row {
gap: 10px !important; align-items: stretch !important; margin: 4px 0 6px;
}
.ab-ask-row textarea {
border-radius: 14px !important; padding: 16px 18px !important;
font-size: 16px !important; min-height: 76px !important;
line-height: 1.4 !important;
background: var(--background-fill-secondary) !important;
border: 1px solid var(--border-color-primary) !important;
box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.05) !important;
}
.ab-ask-row textarea:focus {
border-color: var(--ab-rust) !important;
box-shadow: 0 0 0 3px rgba(240, 134, 58, 0.15) !important;
}
.ab-ask-row button.primary {
padding: 0 30px !important; font-size: 16px !important; font-weight: 700;
white-space: nowrap; min-height: 76px;
}
/* 3. Chips: real gaps + pill borders (stop the fused bar). */
.ab-starter-row {
justify-content: center !important; flex-wrap: wrap; gap: 8px !important;
margin: 6px 0 14px !important;
}
.ab-starter-row button.secondary {
border: 1px solid var(--border-color-primary) !important;
border-radius: 20px !important; background: var(--block-background-fill) !important;
}
.ab-center, .ab-center p { text-align: center !important; }
/* Persona chips: centered, roomy, individually bordered; CSV "soon" dashed. */
.ab-persona-row {
justify-content: center !important; gap: 10px !important; flex-wrap: wrap;
margin: 8px 0 !important;
}
.ab-persona-row button {
flex: 0 0 auto !important; min-width: 0 !important; width: auto !important;
padding: 11px 18px !important; font-size: 15px !important;
border-radius: 11px !important;
}
.gradio-container .ab-csv-soon button,
.gradio-container .ab-csv-soon button:hover,
.gradio-container .ab-csv-soon button:focus {
border: 1px dashed var(--ab-soft-line) !important;
background: var(--ab-soft) !important;
color: var(--body-text-color-subdued) !important;
cursor: default !important; border-radius: 11px;
}
/* Data overview ("what's in this data"): warm panel; tiles adapt to the theme. */
.ab-data-card {
background: var(--ab-soft); border: 1px solid var(--ab-soft-line);
border-radius: 12px; padding: 16px 18px;
}
.ab-data-card .ttl { font-size: 13px; font-weight: 700; color: var(--ab-rust); }
.ab-data-card .sub {
font-size: 13px; color: var(--body-text-color-subdued); font-style: italic;
margin: 2px 0 12px;
}
.ab-tiles { display: grid; grid-template-columns: repeat(3, 1fr); gap: 8px; }
.ab-tile {
background: var(--block-background-fill);
border: 1px solid var(--ab-soft-line); border-radius: 9px; padding: 10px 12px;
}
.ab-tile .nm {
font-family: "IBM Plex Mono", monospace; font-size: 12.5px;
font-weight: 700; color: var(--body-text-color);
}
.ab-tile .ct { font-size: 17px; font-weight: 700; color: var(--ab-rust); }
.ab-tile .ct small {
font-size: 10px; color: var(--body-text-color-subdued); font-weight: 500;
}
@media (max-width: 640px) { .ab-tiles { grid-template-columns: repeat(2, 1fr); } }
/* Answer card: ONE clean framed panel with breathing room. Gradio nests gray
block-fills (the card group, the plot container) that read as muddy boxes — flatten
them to a single surface, then give the chart + table a real gap (they were 1px
apart) and pad the card so nothing crowds its edges. */
.ab-answer-card {
border: 1px solid var(--ab-soft-line) !important;
border-radius: 14px !important;
background: var(--background-fill-primary) !important;
padding: 20px 22px !important; margin-top: 14px !important;
}
.ab-answer-card .gr-group,
.ab-answer-card > .styler {
background: transparent !important; border: none !important;
}
/* Chart | table: a real column gap so the result is not glued to the graph; TOP
aligned so a short chart beside a tall table doesn't leave a gray half-column. */
.ab-answer-card .row { gap: 28px !important; align-items: flex-start !important; }
.ab-answer-card .row > .block {
background: transparent !important; border: none !important;
box-shadow: none !important;
}
.ab-answer-card table { font-size: 13px !important; }
.ab-answer-card td, .ab-answer-card th { padding: 7px 12px !important; }
.ab-answer-card .ab-prov { margin-top: 14px !important; }
.ab-prov {
font-size: 12.5px; color: var(--body-text-color-subdued);
background: var(--background-fill-secondary); border-radius: 8px; padding: 8px 12px;
}
/* 5. Feedback bar: a tidy footer; buttons hug the question, the reassurance note
sits on the right (theme-tuned warm surface + border). */
.ab-feedback-bar {
background: var(--ab-soft) !important;
border: 1px solid var(--ab-soft-line) !important;
border-radius: 12px !important; padding: 14px 16px !important;
margin-top: 10px !important;
}
/* gr.Group wraps content in a `.styler` (and a nested group) that paints the slate
block surface OVER our warm fill — make those transparent so --ab-soft shows. */
.ab-feedback-bar .styler { background: transparent !important; }
.ab-feedback-bar .gr-group {
border: none !important; background: transparent !important;
}
.ab-feedback-row {
align-items: center !important; gap: 10px !important; flex-wrap: wrap;
}
/* Gradio wraps each Markdown/Button in a width:100% block, which would make the
label fill the row and wrap the buttons below (and collapse the note to 0). Force
the label + note to size to their own content so the row reads: label · 👍 · 👎 …
note(right). */
.ab-feedback-row > .ab-fb-label,
.ab-feedback-row > .ab-fb-note {
flex: 0 0 auto !important; width: auto !important; min-width: 0 !important;
}
.ab-feedback-row > .ab-fb-label p, .ab-feedback-row > .ab-fb-note p {
margin: 0 !important; white-space: nowrap;
}
.ab-feedback-row > button { flex: 0 0 auto !important; }
.ab-feedback-row > .ab-fb-note {
margin-left: auto !important; font-size: 12px !important;
color: var(--body-text-color-subdued) !important;
}
.ab-feedback-bar button.secondary {
background: var(--block-background-fill) !important;
border: 1px solid var(--ab-soft-line) !important; border-radius: 9px !important;
}
/* Hide the chatbot's per-message utility icons (copy/share/trash) in the trace. */
.ab-trace .icon-button-wrapper, .ab-trace .copy-button { display: none !important; }
/* "Why a tiny model is enough": full-width boxed accordion, aligned with
"Advanced — choose the model" (user request), opening an always-DARK card. */
#why-tiny { margin: 12px 0 0 !important; }
#why-tiny .label-wrap + div { border: none !important; padding: 10px 0 0 !important; }
.ab-why-card { background: #161b24; border-radius: 12px; padding: 18px 22px; }
.ab-why-card .eyebrow {
color: #ee9a6c; text-transform: uppercase; font-size: 12px; font-weight: 700;
letter-spacing: 0.05em; margin-bottom: 8px;
}
.ab-why-card p { color: #d6dae0; font-size: 14.5px; line-height: 1.6; margin: 0; }
.ab-why-card b { color: #ffffff; }
.ab-why-card h3 { color: #fff; font-size: 18px; margin: 2px 0 8px; font-weight: 650; }
.ab-why-card .ab-proof { color: #eef1f4; font-size: 15px; margin-top: 12px;
border-left: 3px solid #f6c453; padding-left: 12px; }
.ab-why-card .ab-wip { color: #9aa3ad; font-size: 13px; margin-top: 12px;
font-style: italic; }
/* Head-to-head: off-the-shelf vs fine-tuned (the judge proof) */
.ab-h2h { margin-top: 18px; border-top: 1px solid #2c3038; padding-top: 14px; }
.ab-h2h-intro { color: #c8ccd2; font-size: 13.5px; margin: 0 0 10px !important; }
.ab-h2h-q { color: #fff; font-weight: 600; font-size: 14.5px; margin-bottom: 8px; }
.ab-h2h-q .ab-gold { color: #7ee0a2; font-weight: 500; font-size: 13px; }
.ab-h2h-cols { display: flex; gap: 10px; flex-wrap: wrap; }
.ab-col { flex: 1 1 240px; background: #13161c; border-radius: 8px;
padding: 10px 12px; border: 1px solid #2c3038; }
.ab-col.ab-base { border-color: #5b2b30; }
.ab-col.ab-v2 { border-color: #27513a; }
.ab-col-h { font-weight: 650; font-size: 12.5px; margin-bottom: 6px; color: #e6e9ee;
text-transform: uppercase; letter-spacing: .03em; }
.ab-step { font-size: 11.5px; color: #aeb4bd; margin: 2px 0; display: flex;
gap: 6px; align-items: baseline; }
.ab-step .act { color: #8ea0b6; font-weight: 600; min-width: 62px; }
.ab-step code { color: #d6dae0; background: #0e1014; padding: 0 4px;
border-radius: 4px; font-size: 11px; }
.ab-step .res { color: #6f7884; font-style: italic; }
.ab-final { margin-top: 8px; font-size: 13px; color: #e6e9ee; }
.ab-col.ab-base .ab-final b { color: #ff8f8f; }
.ab-col.ab-v2 .ab-final b { color: #7ee0a2; }
.ab-h2h-more { margin-top: 14px; }
.ab-h2h-more ul { margin: 6px 0 0; padding-left: 18px; }
.ab-h2h-more li { color: #c8ccd2; font-size: 13px; margin: 4px 0; }
.ab-x { color: #ff8f8f; }
.ab-ok { color: #7ee0a2; font-weight: 600; }
/* comfortable touch targets for the "run live" chips (WCAG 2.5.5) */
#why-tiny .ab-starter-row button { min-height: 38px; }
.ab-footer-note { margin-top: 26px !important; }
/* SQL block: keep gradio's native code theme (readable in light + dark). */
.gradio-container .cm-editor { font-size: 13px; border-radius: 9px; }
"""
# The animated working indicator (shown in a gr.HTML while an ask streams). The CSS
# spinner above does the spinning — no static glyph.
_SPINNER_HTML = (
'<div class="ab-status" role="status" aria-live="polite">'
'<span class="ab-spinner"></span> Reading your data…</div>'
)
# ---------------------------------------------------------------------------
# Default deterministic demo-script policy (NO model). Recognizes the 3 example
# decision-questions and walks a fixed, dataset-appropriate plan against the
# committed retail_smb DB — DESCRIBE -> QUERY(s) -> ANSWER — with ONE question
# (the revenue-each-month one) scripted to hit a wrong-column error and then
# self-correct, so the no-model demo still shows the P0 "SQL on error + recovery"
# beat (Primary Journey step 4 / demo script beat 3). For an arbitrary TYPED
# (non-example) question it falls back to an HONEST, clearly-labeled placeholder
# turn (DESCRIBE a table + a simple count) — it does NOT pretend to answer
# questions it wasn't scripted for. Pure Policy (select_action only); the real
# ModelPolicy still swaps in at F006 via build_demo(policy_factory=...).
# ---------------------------------------------------------------------------
# Each scripted step is an (action_type, argument) pair. A QUERY whose SQL
# references a non-existent column produces an env error step (status="error"),
# which is exactly the self-correction beat we want to show for question 2.
_DemoStep = tuple[str, str]
# Scripts keyed by the normalized example question. Each script ends in ANSWER.
# Every ANSWER's stated number/name equals the real gold result on retail_smb
# (computed against the committed DB, not guessed). Chart blocks use bar/line only;
# x/y are column names the QUERY actually returns.
_DEMO_SCRIPTS: dict[str, list[_DemoStep]] = {
"which store has the most orders?": [
("DESCRIBE", "orders"),
(
"QUERY",
"SELECT s.name AS store, COUNT(*) AS orders "
"FROM orders o JOIN stores s ON o.store_id = s.store_id "
"GROUP BY s.name ORDER BY orders DESC",
),
(
"ANSWER",
"Maple Street Pets is your busiest store with 60 orders — well ahead "
"of Old Town Pet & Feed (49) and Harbourside Grooming (41). That is "
"where an extra weekend hire would help most.\n"
'```chart\n{"chart_type": "bar", "x": "store", "y": "orders"}\n```',
),
],
"what was my revenue each month?": [
("DESCRIBE", "orders"),
# Intentional mistake: `total` is not a column (the real one is `amount`).
# This yields a visible error step + the failed SQL (P0), then the next
# step recovers with `amount` AND `WHERE status = 1` (refunds excluded) —
# the demo's self-correction beat plus the refund-aware revenue beat.
(
"QUERY",
"SELECT strftime('%Y-%m', order_date) AS month, SUM(total) AS revenue "
"FROM orders WHERE status = 1 GROUP BY month ORDER BY month",
),
(
"QUERY",
"SELECT strftime('%Y-%m', order_date) AS month, "
"ROUND(SUM(amount), 2) AS revenue "
"FROM orders WHERE status = 1 GROUP BY month ORDER BY month",
),
(
"ANSWER",
"Paid-order revenue ran from about £166 in your quietest month "
"(September) up to roughly £828 in your best (June), starting the year "
"at £558.91 in January and finishing at £583.47 in December — about "
"£6,552 across the year (refunds excluded).\n"
'```chart\n{"chart_type": "line", "x": "month", "y": "revenue"}\n```',
),
],
"which product makes the most revenue?": [
("DESCRIBE", "order_items"),
(
"QUERY",
"SELECT p.name AS product, ROUND(SUM(oi.line_amount), 2) AS revenue "
"FROM order_items oi JOIN products p ON oi.product_id = p.product_id "
"GROUP BY p.name ORDER BY revenue DESC",
),
(
"ANSWER",
"Grain-Free Dog Food 5kg is your top earner at £1,168.50, ahead of the "
"Reflective Lead (£910.00) and the Fleece Pet Blanket (£819.59). Keep "
"it well stocked.\n"
'```chart\n{"chart_type": "bar", "x": "product", "y": "revenue"}\n```',
),
],
}
_GENERIC_FALLBACK_ANSWER = "See the result table above."
def _any_query_succeeded(observation: SQLObservation) -> bool:
"""True if any prior QUERY in this episode ran without error.
Reads ``observation.action_history`` (the env logs each step as e.g.
``"QUERY -> ERROR: ..."`` on failure or ``"QUERY -> <result preview>"`` on
success). A scripted plan run against the wrong dataset produces only
``QUERY -> ERROR`` lines; the retail happy path always lands at least one
successful QUERY (the recovery query on the revenue script). This lets the
policy refuse to emit a scripted ANSWER unless the plan actually executed on
the active DB.
"""
for entry in observation.action_history:
text = entry.strip()
if text.startswith("QUERY ->") and "QUERY -> ERROR" not in text:
return True
return False
class _DemoScriptPolicy:
"""Deterministic, model-free policy driven by a per-question script.
Reads ``observation.question`` (the env surfaces the user's question there)
and, when it matches one of the 3 example decision-questions AND the ACTIVE
dataset IS the retail demo DB, walks that question's scripted
``DESCRIBE -> QUERY(s) -> ANSWER`` plan against the demo dataset. The
revenue script includes a wrong-column QUERY so the env returns an error
step (the failed SQL is then shown), followed by a recovery QUERY — the
visible self-correction beat. For any other (typed) question — OR an example
question asked against a DIFFERENT (e.g. uploaded) dataset — it falls back to
an honest, labeled placeholder turn rather than fabricating a retail answer.
Deterministic and instant; no torch/transformers. The real ``ModelPolicy``
swaps in at F006.
The scripted DESCRIBE/QUERY/ANSWER plans hard-code retail table and column
names, so they only make sense on the retail demo DB. ``active_db_id`` carries
which dataset the current ask runs against (``_handle_ask`` sets it from
``db_state["db_id"]``); when it does not equal ``demo_db_id`` the scripted
plan is never used. As belt-and-suspenders (and to cover any future db_id
that happens to match by name but not by schema), a scripted ANSWER is only
emitted once at least one QUERY has actually SUCCEEDED on the active DB; if
every scripted QUERY errored, the honest fallback answer is returned instead.
"""
def __init__(
self,
*,
demo_db_id: str = _DEMO_DB_ID,
active_db_id: str = _DEMO_DB_ID,
) -> None:
self._plan_index = 0
self._demo_db_id = demo_db_id
# Which dataset the current ask runs against. Defaults to the demo DB so
# the policy scripts the retail examples out of the box (the shipped
# default + every direct-construction test). ``_handle_ask`` overrides
# this per-ask from ``db_state["db_id"]`` so an example chip clicked over
# an uploaded dataset falls through to the honest path.
self.active_db_id = active_db_id
def select_action(self, observation: SQLObservation) -> SQLAction:
plan = self._plan_for(observation)
# Answer once budget is nearly spent regardless of plan position, so a
# script never runs the episode out of steps without a final answer.
if observation.budget_remaining <= 1:
return SQLAction(
action_type="ANSWER", argument=self._fallback_answer(observation, plan)
)
step = self._plan_index
self._plan_index += 1
if step < len(plan):
action_type, argument = plan[step]
if action_type == "ANSWER":
# Only emit a scripted ANSWER if a QUERY actually succeeded on the
# active DB — otherwise the plan ran against the wrong dataset and
# the scripted literal would contradict the empty table.
return SQLAction(
action_type="ANSWER",
argument=self._answer_for(observation, argument),
)
return SQLAction(action_type=action_type, argument=argument)
return SQLAction(
action_type="ANSWER", argument=self._fallback_answer(observation, plan)
)
def _plan_for(self, observation: SQLObservation) -> list[_DemoStep]:
"""The scripted plan for this question, or an honest fallback plan.
Scripts are retail-schema-specific, so they are ONLY used when the active
dataset is the retail demo DB; on any other dataset (an uploaded CSV) the
same example question falls through to the honest placeholder plan.
"""
if self.active_db_id != self._demo_db_id:
return self._fallback_plan(observation)
key = observation.question.strip().lower()
scripted = _DEMO_SCRIPTS.get(key)
if scripted is not None:
return scripted
return self._fallback_plan(observation)
def _answer_for(self, observation: SQLObservation, scripted_answer: str) -> str:
"""The scripted ANSWER, but only if a QUERY actually succeeded.
Belt-and-suspenders against a scripted plan whose QUERYs all errored on
the active DB (the script's tables/columns don't exist there): emitting
the scripted literal then would contradict the empty result table. When no
QUERY succeeded, return the honest generic note instead.
"""
if _any_query_succeeded(observation):
return scripted_answer
return _GENERIC_FALLBACK_ANSWER
def _fallback_plan(self, observation: SQLObservation) -> list[_DemoStep]:
"""Honest placeholder turn for a non-example typed question.
Looks at the first available table and counts its rows, then answers with
a LABELED disclaimer — this is a no-model stub, so it never pretends to
have answered an arbitrary question.
"""
table = self._first_table(observation.schema_info)
return [
("DESCRIBE", table),
("QUERY", f"SELECT COUNT(*) AS n FROM {quote_ident(table)}"),
(
"ANSWER",
"Demo mode (no model loaded): I can't answer free-form questions "
f"here yet — but `{table}` has the row count shown above. Try one "
"of the example questions to see a full answer.",
),
]
@staticmethod
def _first_table(schema_info: str) -> str:
for line in schema_info.splitlines():
stripped = line.strip()
if not stripped.startswith("- "):
continue
candidate = stripped[2:]
if ":" in candidate:
candidate = candidate.split(":", maxsplit=1)[0]
candidate = candidate.strip()
if candidate:
return candidate
return "data"
@staticmethod
def _fallback_answer(observation: SQLObservation, plan: list[_DemoStep]) -> str:
"""The plan's final ANSWER argument, or a generic 'see the table' note.
Budget/overrun path. A scripted ANSWER literal is only safe to surface
here once a QUERY actually SUCCEEDED on the active DB (FIX 1b: otherwise
the retail literal could leak via budget exhaustion when the script ran
against the wrong dataset). When no QUERY succeeded, fall back to the
honest generic note rather than the scripted prose.
"""
if _any_query_succeeded(observation):
for action_type, argument in reversed(plan):
if action_type == "ANSWER":
return argument
return observation.result.strip() or _GENERIC_FALLBACK_ANSWER
def _default_policy_factory() -> Policy:
"""The default policy factory: a fresh deterministic demo-script policy."""
return _DemoScriptPolicy()
# ---------------------------------------------------------------------------
# Chat-history normalization — gradio 6.x preprocesses prior turns to list[dict]
# (the messages Chatbot), but this handler appends gr.ChatMessage objects. To
# keep turn 2+ internally consistent (C3) we coerce the WHOLE incoming history to
# a uniform list[gr.ChatMessage] before appending, so the list never mixes types.
# ---------------------------------------------------------------------------
def _coerce_history(chat_history: list) -> list[gr.ChatMessage]:
"""Normalize a Chatbot history to a uniform ``list[gr.ChatMessage]``.
Gradio 6.x preprocesses prior turns into ``list[dict]`` (``{"role","content",
"metadata"}``) while this handler appends ``gr.ChatMessage`` objects; without
normalization, turn 2 would carry a mixed ``[dict, ..., ChatMessage]`` list
that only renders by serializer tolerance. Coercing every entry to
``gr.ChatMessage`` keeps the list one type end to end (C3).
"""
normalized: list[gr.ChatMessage] = []
for entry in chat_history:
if isinstance(entry, gr.ChatMessage):
normalized.append(entry)
continue
if isinstance(entry, dict):
normalized.append(
gr.ChatMessage(
role=str(entry.get("role", "assistant")),
content=entry.get("content", ""),
metadata=dict(entry.get("metadata") or {}),
)
)
continue
# Unknown shape — render it as plain assistant text rather than drop it.
normalized.append(gr.ChatMessage(role="assistant", content=str(entry)))
return normalized
# Terminal filler messages the agent loop appends after the real answer — skipped
# when extracting "the model's answer" for the feedback record.
_FILLER_ANSWERS = {"task complete.", "done.", "ended session.", ""}
def _content_text(content: object) -> str:
"""Flatten a gradio message ``content`` (str, or 6.x list-of-parts) to text.
Gradio 6.x can deserialize a message's ``content`` as a LIST of content parts
(e.g. ``[{"type": "text", "text": "..."}]``) rather than a plain string, so
calling string ops on it directly raised ``'list' object has no attribute
'strip'``. This collapses any shape to a plain string.
"""
if isinstance(content, str):
return content
if isinstance(content, list):
parts: list[str] = []
for part in content:
if isinstance(part, str):
parts.append(part)
elif isinstance(part, dict):
parts.append(str(part.get("text") or part.get("value") or ""))
else:
parts.append(str(getattr(part, "text", "") or ""))
return "".join(parts)
return "" if content is None else str(content)
def _last_answer_text(history: list) -> str:
"""The model's final answer text from a chat history (for the feedback record).
Returns the last substantive assistant message (skipping terminal filler like
'Task complete.'), with surrounding markdown bold stripped. Best-effort context
for the training candidate — never raises.
"""
for msg in reversed(_coerce_history(history)):
if getattr(msg, "role", None) != "assistant":
continue
content = _content_text(getattr(msg, "content", "")).strip()
if content.lower() not in _FILLER_ANSWERS:
return content.strip("*").strip()
return ""
# ---------------------------------------------------------------------------
# Step -> chat message mapper (the only gradio-coupling of the loop; pure).
# ---------------------------------------------------------------------------
# Plain-language titles per action type — NO model/RL/agent jargon (Content
# Contract). DESCRIBE/SAMPLE speak about looking at the data, not "exploring".
_DONE_TITLES = {
"DESCRIBE": "Looked at the data",
"SAMPLE": "Looked at a few rows",
"QUERY": "Ran a query",
"ANSWER": "Answer",
}
def _step_to_chat_message(
step: Step,
*,
step_number: int | None = None,
pending: bool = False,
will_retry: bool = True,
) -> gr.ChatMessage:
"""Map ONE ``Step`` to ONE ``gr.ChatMessage`` accordion (pure on its inputs).
Produces ``role="assistant"`` and a plain-language ``metadata["title"]`` (no
model/RL jargon), prefixed with the 1-based ``step_number`` ("1 · …") so a long
trace is easy to follow. The title names the table for DESCRIBE/SAMPLE so repeated
look-ups are distinguishable, and the content shows WHAT the step returned (so the
user can inspect it). ``metadata["status"]`` is ``"pending"`` when ``pending``,
else ``"done"`` (with ``metadata["duration"]``). The error title only says
"retrying" when a retry actually follows (``will_retry``); else neutral "Query
failed" (N3).
"""
if step.status == "error":
base = "Query failed — retrying" if will_retry else "Query failed"
content = _error_content(step)
else:
base = _done_title(step)
content = _done_content(step)
title = f"{step_number} · {base}" if step_number else base
metadata: dict[str, object] = {"title": title}
if pending:
metadata["status"] = "pending"
else:
metadata["status"] = "done"
metadata["duration"] = round(step.duration_s, 2)
return gr.ChatMessage(role="assistant", content=content, metadata=metadata)
_RESULT_PREVIEW_CHARS = 1200
def _truncate(text: str, limit: int = _RESULT_PREVIEW_CHARS) -> str:
"""Cap long result text for the inline step preview."""
text = (text or "").strip()
if len(text) <= limit:
return text
return text[:limit].rstrip() + "\n… (truncated)"
def _done_title(step: Step) -> str:
"""Informative collapsed-trace title — names the table so repeated DESCRIBE
steps are distinguishable ("Looked at `orders`", not just "Looked at the data")."""
if step.action_type == "DESCRIBE":
return f"Looked at `{step.argument}`"
if step.action_type == "SAMPLE":
return f"Sampled `{step.argument}`"
return _DONE_TITLES.get(step.action_type, "Step")
def _done_content(step: Step) -> str:
"""Human-readable detail for a successful step — including WHAT it returned."""
if step.action_type == "QUERY":
body = f"The query it ran:\n```sql\n{step.argument}\n```"
if step.result_text:
body += f"\n\nWhat came back:\n```\n{_truncate(step.result_text)}\n```"
return body
if step.action_type == "ANSWER":
return f"**{step.argument}**"
if step.action_type in ("DESCRIBE", "SAMPLE"):
body = f"Looked at `{step.argument}`."
if step.result_text:
body += f"\n\n```\n{_truncate(step.result_text)}\n```"
return body
return step.result_text or ""
def _error_content(step: Step) -> str:
"""Human-readable detail for a failed step — always shows the failed SQL."""
detail = step.error or "That query didn't work."
return f"{detail}\n\nThe query it tried:\n```sql\n{step.argument}\n```"
# ---------------------------------------------------------------------------
# Provenance ("show your work") — one quiet PLAIN-LANGUAGE line under the answer.
# It paraphrases the table/filter of the final SELECT rather than repeating the
# raw SQL (which already lives in the Code panel) — Q1.
# ---------------------------------------------------------------------------
_FROM_PATTERN = re.compile(
r"\bFROM\s+([`\"\[]?[A-Za-z_][A-Za-z0-9_]*[`\"\]]?)", re.IGNORECASE
)
_WHERE_PATTERN = re.compile(
r"\bWHERE\s+(.+?)(?:\bGROUP\s+BY\b|\bORDER\s+BY\b|\bLIMIT\b|;|$)",
re.IGNORECASE | re.DOTALL,
)
def _strip_identifier_quotes(identifier: str) -> str:
"""Strip surrounding SQL identifier quotes (`` ` ``, ``"``, ``[]``)."""
cleaned = identifier.strip()
if len(cleaned) >= 2 and cleaned[0] in '`"[' and cleaned[-1] in '`"]':
return cleaned[1:-1]
return cleaned
def _provenance_line(sql: str, *, row_count: int | None = None) -> str:
"""A quiet one-line PLAIN-LANGUAGE 'show your work' note (Q1).
Paraphrases the primary table and WHERE filter of the final SELECT (e.g.
"Looked at `orders`, filtered where `status = 1`, returned 4 rows") instead of
repeating the raw SQL — that already lives in the Code panel. Falls back to a
generic line when the SQL can't be parsed.
"""
stripped = sql.strip()
if not stripped:
return _EMPTY_PROVENANCE
from_match = _FROM_PATTERN.search(stripped)
if from_match is None:
return "_Show your work: ran a read-only query on your data._"
table = _strip_identifier_quotes(from_match.group(1))
parts = [f"looked at `{table}`"]
where_match = _WHERE_PATTERN.search(stripped)
if where_match is not None:
condition = " ".join(where_match.group(1).split())
if condition:
parts.append(f"filtered where `{condition}`")
if row_count is not None:
plural = "row" if row_count == 1 else "rows"
parts.append(f"returned {row_count} {plural}")
body = ", ".join(parts)
return f"_Show your work: {body} (read-only)._"
# ---------------------------------------------------------------------------
# _handle_ask — the streaming generator behind 'Ask your data' / example chips.
# ---------------------------------------------------------------------------
def _empty_table() -> dict:
"""An empty gr.Dataframe payload (no rows, no headers)."""
return {"headers": [], "data": []}
def _table_payload(rows: list[tuple], columns: list[str]) -> dict:
"""A gr.Dataframe payload from re-executed rows + real headers."""
return {"headers": list(columns), "data": [list(row) for row in rows]}
# Chart styling — the app UI is dark, so a default white vega-lite chart looks out
# of place. We render on a TRANSPARENT background with light axis/label/title
# colours and a single indigo accent for the bars/line/markers (Visual Direction:
# "a single accent"). Many-category x axes stay readable via vega-lite's own
# label handling: a -45° label angle plus a labelLimit that truncates long labels
# with a client-side ellipsis — no manual thinning/truncation needed.
# Warm-light chart palette (v3 design): rust marks on a white card with quiet axes.
# Best practice for "which is biggest?" charts: highlight the focal (max) bar in a
# saturated colour and mute the rest, so the eye lands on the answer (the designer's
# two-tone rust). The exact answer values live in the table beside the chart.
_CHART_ACCENT = "#bf3e11" # rust — the leader bar / line / point
_CHART_ACCENT_MUTED = "#eaa97f" # light rust — the runner-up bars
_CHART_FG = "#8a93a0" # mid gray — legible on both light and dark surfaces
_CHART_GRID = "#eceef1" # light grid lines
_CHART_DOMAIN = "#d1d5db" # gray-300 (axis domain line)
_CHART_LABEL_LIMIT = 90 # px budget before vega-lite ellipsis-truncates a label
# Many-category bar charts (e.g. 20 products) render as a forest of unreadable
# slivers. Cap the BAR chart to the top-N by value so the eye lands on the leaders;
# the table beside it still carries every row. 6 dovetails with the value-label
# threshold below, so the capped bars also get their numbers.
_CHART_MAX_BARS = 6
# Bar x-axis labels (category names like "Grain-Free Dog Food 5kg") don't fit under a
# narrow bar, so abbreviate the DISPLAYED tick to its first N chars (digits too) with a
# trailing ellipsis, angled for density — the FULL label stays on hover (tooltip) and in
# the table. Abbreviating only the tick (via vega labelExpr), NOT the x field, keeps two
# names that share a prefix as distinct bars.
_CHART_LABEL_ABBR = 4
def _render_chart(
spec: ChartSpec | None,
rows: list[tuple],
columns: list[str],
) -> "alt.Chart | None":
"""Render a chart ``ChartSpec`` to an interactive Altair (vega-lite) chart.
Returns ``None`` — meaning "show the table only, no chart" — when ``spec`` is
None, ``rows`` is empty, or the resolved ``spec.y`` column can't be coerced to
numeric (a chart of a non-numeric y axis is degenerate; the column-presence
validation already happened upstream in ``validate_intent`` — dtype is enforced
only here, at render time, where the DataFrame exists).
Otherwise it builds an Altair chart for ``spec.chart_type`` (bar/line/scatter)
with a per-point ``tooltip`` encoding — that hover interactivity is the whole
point of using vega-lite over a static image. The x axis is treated as
nominal/ordinal so string x values (product names, "2024-01" months) plot
correctly; a ``-45°`` label angle plus a ``labelLimit`` keep many-category axes
readable (vega-lite truncates long labels with a client-side ellipsis — no
manual thinning needed). Styled for the dark app UI: a transparent background
with light axis colours and a single indigo accent (or ``spec.color`` when set).
Rendering through ONE ``gr.Plot`` (fed this single chart) — rather than the
3 toggled vega ``gr.*Plot`` of the original seam — is what fixes the multi-turn
crash: hiding a previously-visible plot on a type switch threw a
``ResizeObserver`` ``offsetWidth`` error and aborted the new chart. A single
component is never hidden-then-reshown mid-stream, so hover is restored WITHOUT
reintroducing the disappear bug.
"""
if spec is None or not rows:
return None
df = pd.DataFrame([list(row) for row in rows], columns=list(columns))
# Hide rather than draw a degenerate chart when y can't be plotted numerically.
if pd.to_numeric(df[spec.y], errors="coerce").isna().all():
return None
use_color = spec.color is not None and spec.color in columns
tooltip = [spec.x, spec.y] + ([spec.color] if use_color else [])
# Minimal axes: horizontal category labels (angled ones read as clutter), long
# names ellipsis-truncate and keep full text in the tooltip; no ticks/domain.
x_axis = alt.Axis(
labelAngle=0, labelLimit=_CHART_LABEL_LIMIT, labelPadding=8,
ticks=False, domain=False,
)
if spec.chart_type == "line":
x = alt.X(f"{spec.x}:N", title=None, axis=x_axis)
# Lines need a y scale to read; keep it quiet (no domain/ticks, faint grid).
y = alt.Y(
f"{spec.y}:Q",
title=None,
axis=alt.Axis(grid=True, ticks=False, domain=False, labelPadding=6),
)
enc: dict = {"x": x, "y": y, "tooltip": tooltip}
if use_color:
enc["color"] = alt.Color(spec.color)
chart = alt.Chart(df).mark_line(
color=_CHART_ACCENT,
strokeWidth=2.5,
point=alt.OverlayMarkDef(color=_CHART_ACCENT, size=55),
).encode(**enc)
elif spec.chart_type == "scatter":
x = alt.X(f"{spec.x}:N", sort="-y", title=None, axis=x_axis)
y = alt.Y(f"{spec.y}:Q", title=None, axis=alt.Axis(ticks=False, domain=False))
enc = {"x": x, "y": y, "tooltip": tooltip}
if use_color:
enc["color"] = alt.Color(spec.color)
chart = alt.Chart(df).mark_point(
filled=True, color=_CHART_ACCENT, size=90
).encode(**enc)
else: # bar — minimal: no axes/grid, value labels on bars, the leader in rust
# Cap to the top-N categories by value (the table still shows every row), so a
# 20-category result is a clean ranking of leaders, not a forest of slivers.
if len(df) > _CHART_MAX_BARS:
order = pd.to_numeric(df[spec.y], errors="coerce")
df = (
df.assign(_ab_order=order)
.sort_values("_ab_order", ascending=False)
.drop(columns="_ab_order")
.head(_CHART_MAX_BARS)
)
# Angled, abbreviated tick labels (first N chars + "…"); full name on hover.
n = _CHART_LABEL_ABBR
bar_x_axis = alt.Axis(
labelAngle=-40, labelPadding=6, ticks=False, domain=False,
labelExpr=(
f"length(datum.label) > {n} ? slice(datum.label, 0, {n}) + '…' "
": datum.label"
),
)
x = alt.X(f"{spec.x}:N", sort="-y", title=None, axis=bar_x_axis)
y = alt.Y(f"{spec.y}:Q", title=None, axis=None) # values shown on the bars
if use_color:
color = alt.Color(spec.color)
else:
# Two-tone highlight: the max bar saturated, the rest muted (best practice).
max_val = float(pd.to_numeric(df[spec.y], errors="coerce").max())
color = alt.condition(
alt.datum[spec.y] >= max_val,
alt.value(_CHART_ACCENT),
alt.value(_CHART_ACCENT_MUTED),
)
base = alt.Chart(df).encode(x=x, tooltip=tooltip)
bars = base.mark_bar(
cornerRadiusTopLeft=5, cornerRadiusTopRight=5
).encode(y=y, color=color)
# Value labels above each bar make the insight readable without the table —
# but ONLY for a few bars; with many categories they overlap into mush (the
# table carries the exact numbers then).
if len(df) <= _CHART_MAX_BARS:
labels = base.mark_text(
dy=-7, fontSize=12, fontWeight="bold", color="#8a93a0"
).encode(y=y, text=alt.Text(f"{spec.y}:Q", format="~s"))
chart = bars + labels
else:
chart = bars
chart = (
# Compact + fixed: gradio's gr.Plot container does not propagate a resolved
# width to vega (width="container" collapses to a sliver), so size it to fit
# the answer card's chart column with room to spare.
chart.properties(height=170, width=300)
.configure(background="transparent", font="IBM Plex Sans, system-ui")
.configure_axis(
labelColor=_CHART_FG,
titleColor=_CHART_FG,
gridColor=_CHART_GRID,
domainColor=_CHART_DOMAIN,
labelFontSize=12,
)
.configure_view(stroke=None)
)
return chart
def _chart_update(
spec: ChartSpec | None,
rows: list[tuple],
columns: list[str],
) -> dict:
"""The single ``gr.update()`` for the one ``chart_plot`` ``gr.Plot``.
Renders the spec to an interactive Altair chart via ``_render_chart``: a chart
→ ``gr.update(visible=True, value=chart)``; ``None`` (no spec / empty rows /
non-numeric y) → ``gr.update(visible=False, value=None)`` (table only — no
degenerate empty chart).
"""
chart = _render_chart(spec, rows, columns)
if chart is None:
return _hidden_chart()
return gr.update(visible=True, value=chart)
def _hidden_chart() -> dict:
"""The single hidden ``chart_plot`` update (pre-ANSWER / no-valid-intent)."""
return gr.update(visible=False, value=None)
def _apply_step(
step: Step,
history: list[gr.ChatMessage],
table_value: dict,
sql_value: str,
last_sql: str,
last_row_count: int | None,
answered: bool,
*,
will_retry: bool,
step_number: int | None = None,
) -> tuple[list[gr.ChatMessage], dict, str, str, int | None, bool]:
"""Fold ONE ``Step`` into the streaming UI state (pure on its inputs).
Appends the step's chat message and, for a QUERY, pipes the SQL to the panel
and (on success) refreshes the result table from the LAST successful QUERY —
regardless of row count (C1/C2). ``will_retry`` chooses the error title (N3);
``step_number`` numbers the trace entry. Returns the updated ``(history, table,
sql, last_sql, last_row_count, answered)`` tuple; never mutates its inputs.
"""
history = [
*history,
_step_to_chat_message(step, step_number=step_number, will_retry=will_retry),
]
if step.action_type == "QUERY":
# P0: the query it wrote reaches the dedicated panel on success AND error.
sql_value = step.argument
last_sql = step.argument
if step.status == "done":
# C1/C2: update on EVERY successful QUERY, even a 0-row result. An
# empty result renders real headers + zero data rows (honest).
table_value = _table_payload(step.rows, step.columns)
last_row_count = len(step.rows)
if step.action_type == "ANSWER" and step.status == "done":
answered = True
return history, table_value, sql_value, last_sql, last_row_count, answered
def _handle_ask(
question: str,
db_state: dict,
chat_history: list,
policy_factory: Callable[[], Policy],
*,
rollout: Callable[[], Iterable[Step]] | None = None,
) -> Iterator[tuple]:
"""Generator handler for 'Ask your data' / an example chip.
Seeds a NON-GOLD episode (``env.begin_episode``), iterates
``run_agent_turn``, and after EVERY step yields the growing UI tuple
``(chat_history, table_value, sql_code_value, provenance_text)`` so the chat
accordions stream and the table/SQL appear as soon as available.
P0: pipes the QUERY argument to ``sql_code_value`` on success AND whenever a
step has ``status="error"`` (the failed query is shown before the retry).
Append-only (ADR 0006): ``chat_history`` only grows. The incoming history is
normalized to a uniform ``list[gr.ChatMessage]`` (C3) so multi-turn "Ask
another" never mixes dicts and ChatMessage objects.
The result table tracks the LAST successful QUERY regardless of row count
(C1/C2): a zero-row final QUERY renders real headers + an empty body, so the
shown SQL and the table never disagree.
F005: on the ANSWER step, ``parse_chart_intent`` strips an optional
``​```chart {…}```​`` block from the answer text (so the user sees clean
prose) and ``validate_intent`` checks it against the LAST successful QUERY's
columns; a valid intent renders an interactive Altair (vega-lite) chart into
the single ``chart_plot`` ``gr.Plot`` fed that QUERY's DataFrame
(``_render_chart``). Absent/invalid/non-numeric-y intent → the single plot
hidden (table only — the F004 behaviour preserved). Rendering through ONE
``gr.Plot`` (not the 3 toggled vega ``gr.*Plot``) fixes the multi-turn crash
where hiding a previously-visible vega plot on a type switch threw a
``ResizeObserver`` ``offsetWidth`` error and made the new chart disappear — a
single component is never hidden-then-reshown mid-stream. Using Altair keeps
per-point hover tooltips (the bug was the multiple toggled components, not vega
itself), and a ``labelAngle`` + ``labelLimit`` keep many-category axes readable.
Every yield is a 5-tuple
``(chatbot, result_table, sql_code, provenance, chart_plot)``.
"""
history = _coerce_history(chat_history)
table_value = _empty_table()
sql_value = ""
last_sql = ""
last_row_count: int | None = None
# F005: the LAST successful QUERY's rows/columns feed the chart + validate the
# intent's x/y against the real result columns.
last_rows: list[tuple] = []
last_columns: list[str] = []
answered = False
chart = _hidden_chart()
step_no = 0 # 1-based step counter for the numbered trace titles ("1 ·", "2 ·").
if not question.strip():
yield (history, table_value, sql_value, _EMPTY_PROVENANCE, chart)
return
if rollout is not None:
# Real-model path (F006): the whole agentic loop already ran inside ONE
# window — a ``@spaces.GPU`` call on a ZeroGPU Space (one GPU allocation
# per question; ZeroGPU has no cross-call KV cache). The env + ModelPolicy
# are built INSIDE that window from primitives (db_dir/db_id/question) so no
# live SQLite connection crosses the fork, and the resolved ``Step`` list is
# returned. We stream those steps through the SAME UI mapping below — format
# parity is guaranteed because the rollout drives ``ModelPolicy`` (which goes
# through ``server/tooling.py``), never a second parser.
steps_iter: Iterable[Step] = rollout()
else:
env = _build_environment(db_state["db_dir"])
env.begin_episode(db_state["db_id"], question)
policy = policy_factory()
# Thread the ACTIVE dataset to the demo-script policy so it only walks the
# retail scripts when the retail demo DB is loaded — an example chip clicked
# over an UPLOADED dataset must NOT fabricate the retail answer (it falls
# through to the honest "Demo mode" placeholder). Other (real-model) policies
# ignore this; the attribute set is a no-op for them.
if isinstance(policy, _DemoScriptPolicy):
policy.active_db_id = db_state["db_id"]
steps_iter = run_agent_turn(question, env, policy, budget=_STEP_BUDGET)
def _process(step: Step, *, will_retry: bool) -> dict:
"""Fold one step into the closed-over UI state; return the chart update.
Keeps the ANSWER block-strip + chart render in ONE place (the only chart
parse site) and mutates the enclosing ``_handle_ask`` locals so both the
mid-loop and final-step call sites stay thin.
"""
nonlocal history, table_value, sql_value, last_sql, last_row_count
nonlocal last_rows, last_columns, answered, step_no
step_no += 1
# Track the last successful QUERY's rows/columns for the chart + validator.
if step.action_type == "QUERY" and step.status == "done":
last_rows = list(step.rows)
last_columns = list(step.columns)
# ANSWER: strip the chart block so the chat shows clean prose; validate the
# intent against the last successful QUERY columns.
step_chart = _hidden_chart()
if step.action_type == "ANSWER" and step.status == "done":
clean_text, intent = parse_chart_intent(step.argument)
step = dataclasses.replace(step, argument=clean_text)
spec = validate_intent(intent, last_columns) if intent is not None else None
step_chart = _chart_update(spec, last_rows, last_columns)
history, table_value, sql_value, last_sql, last_row_count, answered = (
_apply_step(
step,
history,
table_value,
sql_value,
last_sql,
last_row_count,
answered,
will_retry=will_retry,
step_number=step_no,
)
)
return step_chart
# One-step lookahead: an error step only says "retrying" when another step
# actually follows it in this turn (N3). We hold the current step until we
# know whether the turn continues, then render it with the correct title.
pending_step: Step | None = None
for next_step in steps_iter:
if pending_step is not None:
chart = _process(pending_step, will_retry=True)
yield (
history,
table_value,
sql_value,
_provenance_line(sql_value, row_count=last_row_count),
chart,
)
pending_step = next_step
if pending_step is not None:
# The final step of the turn: no step follows, so no "retrying" suffix.
chart = _process(pending_step, will_retry=False)
yield (
history,
table_value,
sql_value,
_provenance_line(sql_value, row_count=last_row_count),
chart,
)
if not answered:
# Budget exhausted / no ANSWER — be honest, keep the last SQL it tried.
# No ANSWER → no intent → the chart stays hidden.
history = [
*history,
gr.ChatMessage(
role="assistant",
content=(
"I couldn't finish that one — here's the last query I tried "
"(have a look, or rephrase the question)."
),
metadata={"title": "Couldn't finish", "status": "done"},
),
]
yield (
history,
table_value,
last_sql,
_provenance_line(last_sql, row_count=last_row_count),
_hidden_chart(),
)
# ---------------------------------------------------------------------------
# _handle_upload / _handle_accept_card — CSV upload -> editable assumptions card.
# The card is DISPLAY-ONLY (ADR 0007): it is shown + confirmed, never injected
# into get_system_prompt at F004. render_data_context is the deferred F006 seam.
# ---------------------------------------------------------------------------
def _render_card_markdown(card: DataCard, *, editable: bool = True) -> str:
"""Render a confirmed/proposed ``DataCard`` as a plain-language summary.
DISPLAY-ONLY: this is the 'here's what I think your data is' orientation beat,
NOT the prompt-injection path (``render_data_context`` stays deferred to F006).
``editable=False`` drops the "edit, then Accept" line for the read-only demo
accordion (which has no Accept control); the upload flow keeps it (default).
"""
lines = [
f"**Here's what's in your data** ({card.row_count} rows in `{card.table}`):",
]
if card.table_description:
lines.append(f"- {card.table_description}")
for column in card.columns:
detail = column.description or column.profile.duckdb_type
lines.append(f"- `{column.profile.safe_name}` — {detail}")
lines.append("")
if editable:
lines.append("_These are best-guess assumptions you can edit, then Accept._")
else:
lines.append("_Best-guess column descriptions (read-only)._")
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Preloaded-dataset orientation card (FR3 — Primary Journey step 2). On app load
# the demo dataset is named + oriented on screen BEFORE the user asks: its name,
# a compact rows/columns profile, and the "here's what I think your data is"
# assumptions. Reuses the F003 profiler (`propose_data_card`) and is
# DISPLAY-ONLY — it is NOT injected into `get_system_prompt` (ADR 0007; the
# trained policy never saw a data-context block — that wiring is the F006 seam).
# ---------------------------------------------------------------------------
# A friendly display name for the shipped demo dataset (the raw db_id is jargon).
_DEMO_DATASET_NAME = "Maria's Pet Shop — orders & inventory (demo dataset)"
def _build_demo_card() -> DataCard | None:
"""Profile the preloaded demo dataset into a display-only ``DataCard``.
Runs the deterministic F003 profiler over the committed ``retail_smb`` DB's
``orders`` table so the app can orient the user ("here's your data") on load.
Returns ``None`` (never raises) if the demo DB is missing, so a build can't
fail on a packaging gap — the card region just stays empty in that case.
"""
db_state = _default_db_state()
db_path = Path(db_state["db_path"])
if not db_path.exists():
return None
try:
return propose_data_card(
db_path,
db_id=db_state["db_id"],
table=db_state["table"],
)
except (ValueError, FileNotFoundError):
return None
def _render_demo_card_markdown(card: DataCard | None) -> str:
"""Render the loaded-dataset orientation card (name + profile + assumptions).
DISPLAY-ONLY (FR3): names the demo dataset, shows its row count, and lists the
per-column best-guess assumptions via ``_render_card_markdown``. Falls back to
a quiet line if the demo card couldn't be built (missing DB).
"""
if card is None:
return f"**Loaded dataset:** {_DEMO_DATASET_NAME}."
header = (
f"**Loaded dataset:** {_DEMO_DATASET_NAME} — "
f"{card.row_count} rows in `{card.table}`, "
f"{len(card.columns)} columns."
)
return header + "\n\n" + _render_card_markdown(card, editable=False)
def _handle_upload(file_obj, db_state: dict) -> tuple:
"""Upload handler: ``ingest_csv(...)`` -> ``propose_data_card(...)``.
Returns ``(new_db_state, proposed_card_or_None, status_text)``. The new
db_state points the next ask at the uploaded DB. Catches
``ValueError``/``FileNotFoundError`` from ``ingest_csv`` and returns a plain
'couldn't read that file' message with the db_state UNCHANGED (never crashes).
"""
source = _file_source(file_obj)
if source is None:
return db_state, None, "Couldn't read that file — is it a CSV?"
try:
result = ingest_csv(source, root=str(_UPLOAD_ROOT), if_exists="replace")
card = propose_data_card(
result.db_path,
db_id=result.db_id,
table=result.table,
column_mapping=result.column_mapping,
)
except (ValueError, FileNotFoundError):
return db_state, None, "Couldn't read that file — is it a CSV?"
new_db_state = {
"db_id": result.db_id,
"db_path": str(result.db_path),
"db_dir": str(_UPLOAD_ROOT),
"table": result.table,
}
status = f"Loaded **{result.row_count} rows** from your file. Ask away below."
return new_db_state, card, status
def _file_source(file_obj) -> str | bytes | None:
"""Normalize a gr.File value (path str, a temp-file object, or None)."""
if file_obj is None:
return None
if isinstance(file_obj, str):
return file_obj
name = getattr(file_obj, "name", None)
return name if isinstance(name, str) else None
def _handle_accept_card(card_state: DataCard | None, table_description: str) -> tuple:
"""Accept/edit handler: ``apply_card_edits`` -> confirmed ``DataCard``.
Merges the user's one-line description over the proposed card (user wins; the
deterministic profile is never overwritten) and returns
``(confirmed_card, status_text)``. DISPLAY-ONLY: the confirmed card is stored
for the UI, NOT injected into ``get_system_prompt`` (ADR 0007 — the trained
policy never saw a data-context block; that wiring is deferred to F006 via
``render_data_context``).
"""
if card_state is None:
return None, "Upload a CSV first to see and confirm its summary."
edits: dict = {}
if table_description.strip():
edits["table_description"] = table_description.strip()
confirmed = apply_card_edits(card_state, edits)
return confirmed, "Saved — your summary is confirmed (it's advisory only)."
# ---------------------------------------------------------------------------
# build_demo — construct (do NOT launch) the single-column Blocks demo.
# ---------------------------------------------------------------------------
# ZeroGPU: the whole agentic loop runs in ONE @spaces.GPU window per question
# (ZeroGPU spawns a fresh GPU process per call with NO cross-call KV cache, so we
# never split the 4-6 tool-call turns across calls). 120s is generous headroom for
# the multi-turn loop incl. first-call model materialization; the GPU time actually
# billed is real usage, not the reservation.
_GPU_DURATION_S = 120
def _run_rollout_gpu(
question: str, db_dir: str, db_id: str, model_key: str
) -> Iterator[Step]:
"""Stream one full real-model agentic rollout, yielding a ``Step`` per tool call.
A GENERATOR so the UI shows steps LIVE as the model produces them. When wrapped
by ``@spaces.GPU`` the ZeroGPU allocation is held for the whole generator — one
GPU window for the 4-6 turns (ZeroGPU has no cross-call KV cache), but each step
streams back as it resolves instead of all at once at the end.
Built from PRIMITIVES (no live objects) so it is safe inside the fork: it builds
the env + a cache-backed ``ModelPolicy`` here and drives ``run_agent_turn`` (the
SAME loop the demo + eval use). ``serving.get_policy`` reuses ``ModelPolicy`` ->
``server/tooling.py`` so the tool-call format matches training exactly (the
"never write a second parser" invariant).
"""
try:
from .serving import get_policy
except ImportError: # pragma: no cover - flat-layout fallback
from server.serving import get_policy # type: ignore[no-redef]
env = _build_environment(db_dir)
env.begin_episode(db_id, question)
policy = get_policy(model_key)
yield from run_agent_turn(question, env, policy, budget=_STEP_BUDGET)
def build_demo(
policy_factory: Callable[[], Policy] | None = None,
*,
enable_model_selector: bool = False,
) -> gr.Blocks:
"""Construct (do NOT launch) the single-column Blocks demo.
Injects the policy via ``policy_factory`` (called once per ask). Defaults to a
deterministic demo-script policy (NO torch import) so the app builds and tests
run headless with no model. Returns a ``gr.Blocks`` instance; the caller
launches it via ``_launch_demo`` so the stashed theme + CSS reach ``.launch()``
(gradio 6.x).
``enable_model_selector`` (F006): when True, render a model dropdown and route a
real-model ask through ``serving.get_policy`` inside one ``@spaces.GPU`` window
(on a ZeroGPU Space; a no-op decorator off-Space). The default (False) is
byte-identical to the no-model demo so the existing headless tests are unchanged.
"""
factory = policy_factory or _default_policy_factory
# Real-model serving is opt-in (the Space passes enable_model_selector=True).
# Import the serving registry + wrap the rollout with ZeroGPU LAZILY here so the
# default headless build stays torch/spaces-free (the dep-light invariant).
serving = None
gpu_rollout = _run_rollout_gpu
if enable_model_selector:
try:
from . import serving as serving # type: ignore[no-redef]
except ImportError: # pragma: no cover - flat-layout fallback
from server import serving as serving # type: ignore[no-redef]
try:
import spaces # noqa: PLC0415
gpu_rollout = spaces.GPU(duration=_GPU_DURATION_S)(_run_rollout_gpu)
except ImportError:
# Not on a ZeroGPU Space (e.g. local dev): run the rollout undecorated.
gpu_rollout = _run_rollout_gpu
# FR1 (WCAG AA): override the primary-button accent to a darker orange so the
# white CTA label clears 4.5:1 (the default orange-500 was 2.80:1). #C2410C =
# 5.18:1, hover #9A3412 = 7.31:1 (white-on-accent). Single semantic accent.
# NOTE: gradio 6.x deprecated `theme`/`css` on the Blocks constructor in favour
# of `.launch()` — BUT the constructor still ACCEPTS them via a back-compat shim
# that stows them in `self._deprecated_{css,theme}`, and `.launch()` falls back
# to exactly those when its own `css`/`theme` args are None. So passing them to
# the constructor (below) makes the theme + CSS apply no matter HOW the app is
# launched — `_launch_demo`, `gradio app.py` hot-reload, a deploy target, or HF
# Spaces' own auto-`.launch()` — instead of only when our launch site threads
# them. (A launch-path that bypassed `_launch_demo` was shipping an unstyled,
# un-themed page.) `_launch_demo` still passes them too; identical value, no-op.
theme = gr.themes.Soft(
primary_hue="orange",
neutral_hue="slate",
spacing_size="sm",
radius_size="sm",
# IBM Plex Sans/Mono — the design handover's typeface (warmer + more
# characterful than system-ui), loaded via GoogleFont with a system-ui
# fallback so a blocked CDN degrades gracefully rather than breaking paint.
font=[gr.themes.GoogleFont("IBM Plex Sans"), "system-ui", "sans-serif"],
font_mono=[gr.themes.GoogleFont("IBM Plex Mono"), "monospace"],
).set(
# Tighten the bare-Soft "prototype" look: flat blocks, lighter titles.
block_shadow="none",
block_title_text_weight="500",
# Single semantic accent; white label clears AA on #C2410C (5.18:1).
button_primary_background_fill=_ACCENT,
button_primary_background_fill_hover=_ACCENT_HOVER,
button_primary_text_color="white",
button_primary_text_color_hover="white",
)
with gr.Blocks(
title="analyst-buddy", fill_width=False, theme=theme, css=_APP_CSS
) as demo:
# Per-session DB pointer: which data the next ask runs against.
db_state = gr.State(_default_db_state())
# Confirmed advisory data card (display-only; never injected into the prompt).
card_state: gr.State = gr.State(None)
# Snapshot of the answer currently on screen — {question, dataset, answer}
# captured at ask time so feedback is bound to the answer being voted on,
# NOT the live (possibly chip-edited) question textbox (staff review M2/M3).
asked_state: gr.State = gr.State({})
# F006: which model the next ask runs against. Hosted default = the
# FINE-TUNED Qwen3-1.7B (published to the Hub 2026-06-14), so judges see the
# actual product, not the 4%-correct off-the-shelf model. Demo + vanilla
# stay selectable. Only created when the selector is enabled.
model_state: gr.State | None = (
gr.State("sqlenv-1.7b-grpo-v2") if enable_model_selector else None
)
_default_persona = personas.default_persona_key()
_db_root = str(_DB_ROOT)
# --- Top "board": hero + ask + examples + personas + data overview wrapped
# in ONE bordered card, so the whole "value prop → ask → samples" flow reads
# as a single unit (user request). ----------------------------------------
with gr.Group(elem_classes="ab-top-card"):
gr.HTML(_HERO_HTML_HOSTED if enable_model_selector else _HERO_HTML_LOCAL)
# Ask box (chat-composer style: a roomy input + a simple send button).
# No autofocus: it scrolled the hero off-screen on load (01-ux-polish S1).
with gr.Row(elem_classes="ab-ask-row"):
question = gr.Textbox(
show_label=False,
container=False,
placeholder="e.g. Which store has the most orders?",
lines=2,
max_lines=6,
scale=6,
)
ask_btn = gr.Button("Ask →", variant="primary", scale=1)
# Animated working indicator (CSS spinner) — hidden until an ask runs.
status = gr.HTML("", visible=False)
# Per-persona example questions — relabelled on persona change; each fills
# the question box. No heading (it's obvious these are examples).
with gr.Row(elem_classes="ab-starter-row"):
starter_btns = [
gr.Button(text, variant="secondary", size="sm")
for text in personas.starters_for(_default_persona)
]
# Persona picker (samples to try; CSV "soon" as a dashed chip).
gr.Markdown(
"**Try it on a sample business — it works on multiple databases:**",
elem_classes="ab-center",
)
with gr.Row(elem_classes="ab-persona-row"):
persona_btns = {
persona.key: gr.Button(
persona.label, variant="secondary", size="sm"
)
for persona in personas.PERSONAS.values()
}
gr.Button(
"Your data (CSV) · soon",
variant="secondary",
size="sm",
interactive=False,
elem_classes="ab-csv-soon",
)
persona_desc = gr.Markdown(
f"_{personas.get_persona(_default_persona).blurb}_",
elem_classes="ab-center",
)
with gr.Accordion("What's in this data", open=False):
demo_dataset_view = gr.HTML(
personas.persona_card_html(_default_persona, _db_root)
)
# Upload is "coming soon" (cued by the dashed CSV chip above). The real
# upload + data-card components live here, hidden, until it's ready — kept in
# the tree so the wiring + upload-flow tests still resolve.
with gr.Group(visible=False):
upload = gr.File(
label="Upload your CSV (coming soon)",
file_types=[".csv"],
file_count="single",
interactive=False,
)
upload_status = gr.Markdown("")
data_card_view = gr.Markdown("")
card_edit = gr.Textbox(
label="What's in this data? (optional one-line description)",
placeholder="e.g. weekly sales rows, one per store",
visible=False,
)
accept_card_btn = gr.Button(
"Accept this summary", variant="secondary", visible=False
)
# The model picker lives in the quiet "settings" footer at the very bottom
# (built after the answer zone, next to "Why a tiny model is enough").
model_dropdown = None
# Answer zone: ONE block, revealed at ask START (see _set_busy). The live
# step-trace streams FIRST (above), then the Result appears below it — the
# layout the user preferred. _set_busy also clears the previous trace/result
# the moment Ask is clicked, so stale content never lingers.
# - trace_acc (the numbered steps + the exact SQL): streams above; stays
# open so the steps are visible (no auto-collapse).
# - result_group (the Result table + chart + provenance): hidden until the
# answer completes, so no empty table flickers during streaming.
# - feedback_group: revealed only once an answer exists.
with gr.Group(visible=False) as answer_group:
# "Show its work": the live numbered trace (layout="panel" — a tool-call
# trace, not a chat) + the exact SQL it wrote. Streams above the answer.
with gr.Accordion(
"🔎 How it figured this out — its steps + the exact SQL", open=True
) as trace_acc:
chatbot = gr.Chatbot(
label="How it figured this out",
show_label=False,
height=300,
layout="panel",
elem_classes="ab-trace",
)
sql_code = gr.Code(
label="The exact query it wrote",
language="sql",
elem_classes="ab-sql",
)
with gr.Group(
visible=False, elem_classes="ab-answer-card"
) as result_group:
# Chart + table side by side (v3 answer card). ONE gr.Plot fed an
# interactive Altair (vega-lite) chart, hidden until a valid chart-
# intent renders (FR4) — a single component avoids the multi-turn
# ResizeObserver crash while keeping hover tooltips.
with gr.Row():
chart_plot = gr.Plot(show_label=False, visible=False)
result_table = gr.Dataframe(
label="Result", interactive=False, wrap=True
)
provenance = gr.Markdown(_EMPTY_PROVENANCE, elem_classes="ab-prov")
# --- Feedback flywheel: the verdict + correction sharpens the AGENT'S SQL
# skill (a labelled example) — we never train on the user's data (v3 lock).
with gr.Group(
visible=False, elem_classes="ab-feedback-bar"
) as feedback_group:
with gr.Row(elem_classes="ab-feedback-row"):
gr.Markdown("**Was this right?**", elem_classes="ab-fb-label")
fb_up = gr.Button("👍 Yes", variant="secondary", scale=0)
fb_down = gr.Button("👎 No", variant="secondary", scale=0)
# The reassurance, said ONCE — pushed to the right of the row.
gr.Markdown(
"_👍 / 👎 sharpens the agent locally · never trains on "
"your data._",
elem_classes="ab-fb-note",
)
# The prompt lives in the placeholder (a long `label=` rendered as a
# loud orange block); the reassurance is the note on the right above.
fb_correction = gr.Textbox(
show_label=False,
container=False,
lines=1,
placeholder=(
"What should it be? e.g. it should join orders to stores "
"on store_id"
),
visible=False,
)
fb_submit = gr.Button(
"Submit correction", variant="secondary", visible=False
)
fb_status = gr.Markdown("")
# --- Footer "settings": roadmap + model picker + the GRPO explainer ----
# Quiet, secondary controls grouped at the very bottom (the Tone-Bridge
# bottom-"Settings" pattern): the rarely-used model picker and the judge-
# facing "Why a tiny model is enough" live here, out of the main flow.
gr.Markdown(_ROADMAP_MD, elem_classes="ab-center ab-footer-note")
if enable_model_selector:
with gr.Accordion("Advanced — choose the model", open=False):
model_dropdown = gr.Dropdown(
choices=serving.dropdown_choices(),
value="sqlenv-1.7b-grpo-v2",
label="Model",
info=(
"Defaults to our fine-tuned Qwen3-1.7B (the real product). "
"Switch to the vanilla model to see the difference, or 'Demo' "
"for an instant offline walkthrough."
),
)
model_dropdown.change(
lambda k: k, inputs=[model_dropdown], outputs=[model_state]
)
_showcase_examples = _load_showcase()
run_live_btns: list[tuple] = []
with gr.Accordion(
"✨ What we're building: off-the-shelf 4% vs our fine-tune 49%",
open=False,
elem_id="why-tiny",
):
# ONE dark card: build copy + (when present) the head-to-head nested
# inside it, so the showcase's dark-tuned colours read in both themes.
gr.HTML(_why_card_html(_showcase_examples))
if _showcase_examples:
if enable_model_selector:
gr.Markdown(
"**Try it yourself — runs on the model picked in "
"'Advanced' above. Switch it to compare:**",
elem_classes="ab-center",
)
with gr.Row(elem_classes="ab-starter-row"):
for _ex in _showcase_examples[:3]:
_rb = gr.Button(
f"{_ex['question'][:38]}…",
size="sm",
variant="secondary",
)
run_live_btns.append((_rb, _ex["question"]))
# --- Ask wiring (Step 2.2) -----------------------------------------
def _ask(
question_text: str,
current_db: dict,
history: list,
model_key: str = "demo",
):
# Demo key (or selector off) -> the existing deterministic demo path.
# A real, available model -> drive ModelPolicy inside one GPU window
# (serving.get_policy reuses tooling.py -> training-format parity).
if (
enable_model_selector
and model_key
and model_key != serving.default_model_key()
):
spec = serving.get_spec(model_key)
if spec is not None and spec.available:
current = current_db or {}
db_dir = current.get("db_dir", "")
db_id = current.get("db_id", "")
def _rollout():
return gpu_rollout(question_text, db_dir, db_id, model_key)
yield from _handle_ask(
question_text, current_db, history, factory, rollout=_rollout
)
return
# Selected a not-yet-published model -> honest demo fallback.
yield from _handle_ask(question_text, current_db, history, factory)
ask_outputs = [
chatbot,
result_table,
sql_code,
provenance,
chart_plot,
]
def _capture_question(question_text: str, current_db: dict) -> dict:
"""Snapshot the asked question + dataset BEFORE streaming, so feedback
binds to this answer even if the textbox is later chip-edited (M2)."""
return {
"question": question_text or "",
"dataset": (current_db or {}).get("db_id", "?"),
"answer": "",
}
def _finish_ask(history: list, asked: dict):
"""After streaming: reveal the result + feedback and collapse the live
trace into its expander (m3 clears any stale feedback status). The answer
text is still captured into the snapshot for the feedback record, but it
is NOT shown as a prose line — the Result table is the answer surface."""
answer = _last_answer_text(history)
snapshot = {**(asked or {}), "answer": answer}
return (
snapshot,
gr.update(visible=True), # result_group (table + chart) appears below
gr.update(visible=True), # feedback_group
"", # fb_status
)
# Make "working" unmissable: disable + relabel the button AND show an animated
# CSS spinner. Reveal the answer block + open the trace so the live steps stream
# ABOVE; keep result + feedback hidden until _finish_ask. CRUCIAL: clear the
# PREVIOUS trace/result NOW so the old answer vanishes the moment Ask is clicked
# (rather than lingering until the next answer completes).
def _set_busy():
return (
gr.update(value="Working…", interactive=False),
gr.update(value=_SPINNER_HTML, visible=True),
gr.update(visible=True), # answer_group
gr.update(visible=False), # result_group (wait for the answer)
gr.update(open=True), # trace_acc (open while streaming)
gr.update(visible=False), # feedback_group (wait for the answer)
[], # chatbot — clear the old steps immediately
_empty_table(), # result_table — clear the old rows
"", # sql_code — clear the old query
gr.update(value=_EMPTY_PROVENANCE), # provenance reset
gr.update(visible=False), # chart_plot — hide the old chart
)
def _set_idle():
return (
gr.update(value="Ask →", interactive=True),
gr.update(value="", visible=False),
)
# When the selector is on, the chosen model travels into _ask via model_state
# (appended last so _ask's model_key default holds when the selector is off).
ask_inputs = [question, db_state, chatbot]
if enable_model_selector:
ask_inputs = [question, db_state, chatbot, model_state]
# busy (reveal block + open trace) → capture → stream → finish (answer leads,
# reveal results + feedback, collapse trace) → idle
_busy_outputs = [
ask_btn,
status,
answer_group,
result_group,
trace_acc,
feedback_group,
chatbot,
result_table,
sql_code,
provenance,
chart_plot,
]
_finish_outputs = [
asked_state,
result_group,
feedback_group,
fb_status,
]
ask_btn.click(_set_busy, outputs=_busy_outputs).then(
_capture_question, inputs=[question, db_state], outputs=[asked_state]
).then(_ask, inputs=ask_inputs, outputs=ask_outputs).then(
_finish_ask,
inputs=[chatbot, asked_state],
outputs=_finish_outputs,
).then(_set_idle, outputs=[ask_btn, status])
question.submit(_set_busy, outputs=_busy_outputs).then(
_capture_question, inputs=[question, db_state], outputs=[asked_state]
).then(_ask, inputs=ask_inputs, outputs=ask_outputs).then(
_finish_ask,
inputs=[chatbot, asked_state],
outputs=_finish_outputs,
).then(_set_idle, outputs=[ask_btn, status])
# Showcase "run this live": load the question, reset to the shop persona
# (so db_state points at retail_smb), force the fine-tuned model, then run
# the normal ask chain — so the judge watches our model answer it for real.
if enable_model_selector and run_live_btns:
def _run_live(q: str):
# Fill the question + switch to the retail demo dataset, but DO NOT
# touch model_state — the ask runs on whatever the "Advanced" dropdown
# has selected, so you can flip vanilla ↔ fine-tuned and compare on the
# same showcase questions.
new_state = personas.db_state_for("shop", _db_root)
card = personas.persona_card_html("shop", _db_root)
desc = f"_{personas.get_persona('shop').blurb}_"
labels = personas.starters_for("shop")
return (
q,
new_state,
card,
desc,
*[gr.update(value=t, visible=bool(t)) for t in labels],
)
_rl_outputs = [
question,
db_state,
demo_dataset_view,
persona_desc,
*starter_btns,
]
_cap_in = [question, db_state]
for _rb, _q in run_live_btns:
_rb.click(lambda q=_q: _run_live(q), outputs=_rl_outputs).then(
_set_busy, outputs=_busy_outputs
).then(
_capture_question, inputs=_cap_in, outputs=[asked_state]
).then(_ask, inputs=ask_inputs, outputs=ask_outputs).then(
_finish_ask,
inputs=[chatbot, asked_state],
outputs=_finish_outputs,
).then(_set_idle, outputs=[ask_btn, status])
# Persona switching: rebind the dataset, refresh the overview card, the
# one-line description, and the starter chips. One handler per persona card.
def _on_persona(persona_key: str):
new_state = personas.db_state_for(persona_key, _db_root)
card = personas.persona_card_html(persona_key, _db_root)
desc = f"_{personas.get_persona(persona_key).blurb}_"
labels = personas.starters_for(persona_key)
return (
new_state,
card,
desc,
*[gr.update(value=text, visible=bool(text)) for text in labels],
)
for _pkey, _pbtn in persona_btns.items():
_pbtn.click(
lambda key=_pkey: _on_persona(key),
outputs=[db_state, demo_dataset_view, persona_desc, *starter_btns],
)
# Each starter button fills the question box with its own current label.
for _starter in starter_btns:
_starter.click(lambda text: text, inputs=[_starter], outputs=[question])
# --- Feedback wiring (the flywheel) --------------------------------
def _fb_line() -> str:
n = feedback_summary().get("training_candidates", 0)
if not n:
return ""
return f" _{n} correction(s) collected so far — each sharpens the agent._"
def _on_thumbs_up(asked: dict, sql: str):
asked = asked or {}
record_feedback(
question=asked.get("question", ""),
dataset=asked.get("dataset", "?"),
shown_sql=sql or "",
result=asked.get("answer", ""),
verdict="up",
)
return (
f"✓ Thanks — logged.{_fb_line()}",
gr.update(visible=False),
gr.update(visible=False),
)
def _on_thumbs_down():
return (
"Tell me what it should be — it sharpens the agent (never your data).",
gr.update(visible=True),
gr.update(visible=True),
)
def _on_correction(asked: dict, sql: str, correction: str):
asked = asked or {}
record_feedback(
question=asked.get("question", ""),
dataset=asked.get("dataset", "?"),
shown_sql=sql or "",
result=asked.get("answer", ""),
verdict="down",
correction=correction or "",
)
return (
f"✓ Captured — this sharpens the agent.{_fb_line()}",
gr.update(visible=False, value=""),
gr.update(visible=False),
)
fb_up.click(
_on_thumbs_up,
inputs=[asked_state, sql_code],
outputs=[fb_status, fb_correction, fb_submit],
)
fb_down.click(
_on_thumbs_down,
outputs=[fb_status, fb_correction, fb_submit],
)
fb_submit.click(
_on_correction,
inputs=[asked_state, sql_code, fb_correction],
outputs=[fb_status, fb_correction, fb_submit],
)
# --- Upload + data-card wiring (Steps 3.1 / 3.2) -------------------
def _on_upload(file_obj, current_db: dict):
new_db, card, status = _handle_upload(file_obj, current_db)
card_markdown = _render_card_markdown(card) if card is not None else ""
return new_db, card, status, card_markdown
upload.upload(
_on_upload,
inputs=[upload, db_state],
outputs=[db_state, card_state, upload_status, data_card_view],
)
def _on_accept(card_obj, description: str):
confirmed, status = _handle_accept_card(card_obj, description)
card_markdown = (
_render_card_markdown(confirmed) if confirmed is not None else ""
)
return confirmed, status, card_markdown
accept_card_btn.click(
_on_accept,
inputs=[card_state, card_edit],
outputs=[card_state, upload_status, data_card_view],
)
# Keep references so introspection/tests can find every component.
demo.analyst_components = { # type: ignore[attr-defined]
"db_state": db_state,
"card_state": card_state,
"demo_dataset_view": demo_dataset_view,
"upload": upload,
"upload_status": upload_status,
"data_card_view": data_card_view,
"card_edit": card_edit,
"accept_card_btn": accept_card_btn,
"persona_btns": persona_btns,
"persona_desc": persona_desc,
"starter_btns": starter_btns,
"status": status,
"question": question,
"ask_btn": ask_btn,
"chatbot": chatbot,
"result_table": result_table,
"provenance": provenance,
"sql_code": sql_code,
"chart_plot": chart_plot,
"fb_up": fb_up,
"fb_down": fb_down,
"fb_correction": fb_correction,
"fb_submit": fb_submit,
"fb_status": fb_status,
"answer_group": answer_group,
"result_group": result_group,
"trace_acc": trace_acc,
"feedback_group": feedback_group,
"asked_state": asked_state,
}
if model_dropdown is not None:
demo.analyst_components["model_dropdown"] = model_dropdown
demo.analyst_policy_factory = factory # type: ignore[attr-defined]
# FR1: theme/css travel to .launch() (gradio 6.x ignores them on Blocks).
demo.analyst_theme = theme # type: ignore[attr-defined]
demo.analyst_css = _APP_CSS # type: ignore[attr-defined]
return demo
def _launch_demo(demo: gr.Blocks, **launch_kwargs) -> None:
"""Launch ``demo`` applying the stashed theme + CSS (FR1, gradio 6.x).
gradio 6.x ignores ``theme``/``css`` on the ``Blocks`` constructor and reads
them at ``.launch()`` time, so the single launch path threads the theme/css
that ``build_demo`` attached. Any extra ``launch_kwargs`` (e.g. ``server_port``)
pass through unchanged.
"""
demo.launch(
theme=getattr(demo, "analyst_theme", None),
css=getattr(demo, "analyst_css", None),
**launch_kwargs,
)
def _default_db_state() -> dict:
"""The preloaded demo dataset pointer used until the user uploads a CSV."""
return {
"db_id": _DEMO_DB_ID,
"db_path": str(_DB_ROOT / _DEMO_DB_ID / f"{_DEMO_DB_ID}.sqlite"),
"db_dir": str(_DB_ROOT),
"table": "orders",
}
def _build_environment(db_dir: str) -> SQLEnvironment:
"""Build a SQLEnvironment rooted at ``db_dir`` for non-gold (user) episodes.
``SQLEnvironment.__init__`` requires a questions file (the gold path), but the
demo only ever uses ``begin_episode`` + ``step`` — never ``reset()``. We point
it at the repo questions file when present, else the COMMITTED
``data/demo_questions.json`` (Q2: no per-ask write into the user-upload dir),
so the constructor's gold-path validation passes without affecting the
non-gold loop.
"""
questions_path = _QUESTIONS_PATH
if not questions_path.exists():
questions_path = _DEMO_QUESTIONS_PATH
from .mock_tokenizer import MockTokenizer # local import: dep-light path
return SQLEnvironment(
questions_path=str(questions_path),
db_dir=db_dir,
tokenizer=MockTokenizer(),
step_budget=_STEP_BUDGET,
)
if __name__ == "__main__":
_launch_demo(build_demo())