Spaces:
Running on Zero
Running on Zero
| """analyst-buddy — ask your data in plain English, locally (F004). | |
| The single-column Gradio demo (the judged deliverable). It streams the data | |
| exploration as collapsible chat steps, shows the result table, and keeps **the | |
| query it wrote one click away from every answer — on success AND on error** (the | |
| P0 element of the experience brief). It runs with no model by default via an | |
| injected deterministic demo-script policy; the real model swaps in at F006 through the | |
| ``build_demo(policy_factory=...)`` seam. | |
| This is the ONLY gradio-aware module. The streaming runner (``agent_loop``) and | |
| the SQL machinery stay gradio-free. Keep model/RL/exploration jargon OUT of the | |
| primary surface (Content Contract): the chat titles speak plain English. | |
| This is the single canonical implementation module (``sql_env.server.app_ui``). | |
| The repo-root ``app.py`` is a thin shim re-exporting ``build_demo`` so there is | |
| exactly ONE ``Step``/class identity in the process (no dual-import footgun). | |
| Run locally: ``uv run python app.py``. | |
| """ | |
| from __future__ import annotations | |
| from collections.abc import Callable, Iterable, Iterator | |
| import dataclasses | |
| import html | |
| import json | |
| from pathlib import Path | |
| import re | |
| import altair as alt | |
| import gradio as gr | |
| import pandas as pd | |
| try: | |
| from ..evaluation.policies import Policy | |
| from ..models import SQLAction, SQLObservation | |
| from .agent_loop import Step, run_agent_turn | |
| from .chart_intent import ChartSpec, parse_chart_intent, validate_intent | |
| from .data_card import DataCard, apply_card_edits, propose_data_card | |
| from .feedback import feedback_summary, record_feedback | |
| from .ingestion import ingest_csv | |
| from .sql_environment import SQLEnvironment | |
| from .sql_ident import quote_ident | |
| from . import personas | |
| except ImportError: # pragma: no cover - flat-layout / direct-run fallback | |
| from evaluation.policies import Policy # type: ignore[no-redef] | |
| from models import SQLAction, SQLObservation # type: ignore[no-redef] | |
| from server.agent_loop import Step, run_agent_turn # type: ignore[no-redef] | |
| from server.chart_intent import ( # type: ignore[no-redef] | |
| ChartSpec, | |
| parse_chart_intent, | |
| validate_intent, | |
| ) | |
| from server.data_card import ( # type: ignore[no-redef] | |
| DataCard, | |
| apply_card_edits, | |
| propose_data_card, | |
| ) | |
| from server.feedback import ( # type: ignore[no-redef] | |
| feedback_summary, | |
| record_feedback, | |
| ) | |
| from server.ingestion import ingest_csv # type: ignore[no-redef] | |
| from server.sql_environment import SQLEnvironment # type: ignore[no-redef] | |
| from server.sql_ident import quote_ident # type: ignore[no-redef] | |
| from server import personas # type: ignore[no-redef] | |
| # --------------------------------------------------------------------------- | |
| # Demo dataset + paths (the curated retail-SMB set from F009 — Maria's pet shop). | |
| # --------------------------------------------------------------------------- | |
| _REPO_ROOT = Path(__file__).resolve().parents[1] | |
| _DB_ROOT = _REPO_ROOT / "data" / "databases" | |
| _UPLOAD_ROOT = _REPO_ROOT / "data" / "uploads" | |
| # A small COMMITTED demo questions file (mirrors the env's questions-file schema) | |
| # so the env constructor's gold-path validation passes WITHOUT writing into the | |
| # user-upload dir (Q2). The demo only ever uses begin_episode/step (non-gold). | |
| _QUESTIONS_PATH = _REPO_ROOT / "data" / "questions.json" | |
| _DEMO_QUESTIONS_PATH = _REPO_ROOT / "data" / "demo_questions.json" | |
| _DEMO_DB_ID = "retail_smb" | |
| # Cap the agent's tool-call turns. Kept tight so a weak model (e.g. vanilla 0.6B) | |
| # that can't form a valid query can't spin for long on parse-failure retries — a | |
| # capable model answers single/2-table questions in ~3-6 steps (training used 10). | |
| _STEP_BUDGET = 8 | |
| # Example decision-questions (framed as owner decisions, no jargon), each scripted | |
| # by _DemoScriptPolicy below against the retail_smb dataset (Maria's pet shop). | |
| # One click each. Question 2 intentionally shows an error → self-correction beat | |
| # (the P0 "SQL shown on error" demo) — see _DEMO_SCRIPTS. | |
| _EXAMPLE_QUESTIONS = [ | |
| "Which store has the most orders?", | |
| "What was my revenue each month?", | |
| "Which product makes the most revenue?", | |
| ] | |
| _HEADLINE = "# Ask your data — in plain English, on your own machine" | |
| _SUBHEAD = ( | |
| "Type a question about your data and get an answer, the result table, and " | |
| "**the exact query it wrote** — in seconds. No SQL or spreadsheets needed." | |
| ) | |
| _PRIVACY_CUE = ( | |
| "🔒 **Runs locally — your data stays here.** Nothing is uploaded to the cloud." | |
| ) | |
| # Hosted-Space copy (F006/F008): on a ZeroGPU Space the model runs in the cloud, so | |
| # the "runs locally / nothing uploaded" claim above is NOT true. Keep the honest | |
| # differentiator — a small open model you COULD run yourself — without claiming this | |
| # hosted demo is local. Used only when enable_model_selector is on. | |
| _HEADLINE_HOSTED = "# Ask your data — in plain English" | |
| _PRIVACY_CUE_HOSTED = ( | |
| "🤗 **A small open model (Qwen3) — small enough to run on your own machine.** " | |
| "This is the hosted demo, running on Hugging Face so you can try it." | |
| ) | |
| _EMPTY_PROVENANCE = "_Ask a question to see the answer and the query it wrote._" | |
| # Hero (v3 design handover "Copy locked"): a segmented pill carrying the | |
| # differentiation (fine-tuned for SQL) + the two-line headline + Maria's tagline. | |
| # Rendered as ONE gr.HTML (static, no GPU); styling in _APP_CSS (.ab-hero/.ab-pill). | |
| # The pill claims are CAPABILITY-framed, not deployment claims: a "100% local / | |
| # data never leaves your laptop" badge is FALSE on the hosted ZeroGPU Space (the | |
| # question + data are sent to the cloud to run there). "Open & tiny / small enough | |
| # for your laptop" is true everywhere — it describes the MODEL, not where this demo | |
| # happens to run — so both variants share it. | |
| _TAGLINE = "A tiny AI analyst reads your tables, writes the SQL, and shows its work." | |
| _HEADLINE_HTML = "<h1>Ask your data,<br>get answers in plain English.</h1>" | |
| _HERO_PILL = ( | |
| '<div class="ab-pill">' | |
| "<span>Open & tiny</span>" | |
| '<span class="mid">Qwen3-1.7B, fine-tuned for SQL</span>' | |
| "<span>small enough for your laptop</span>" | |
| "</div>" | |
| ) | |
| _HERO_HTML_LOCAL = ( | |
| '<div class="ab-hero">' + _HERO_PILL + _HEADLINE_HTML + f"<p>{_TAGLINE}</p></div>" | |
| ) | |
| _HERO_HTML_HOSTED = _HERO_HTML_LOCAL | |
| # Judge-facing GRPO story — one click away (v3: "Why a tiny model is enough"), | |
| # rendered as a self-contained DARK card (theme-independent) per the designer's mock: | |
| # an amber eyebrow + light body with white emphasis. | |
| # "What we're building" card (v4, judge-facing): names the product, states the | |
| # HONEST work-in-progress status (incl. the v3 overfit regression), and carries the | |
| # measured 4%→49% proof. Copy from a judged design panel; hosted-honest (the laptop | |
| # line is a CAPABILITY — the cloud demo does NOT claim to run locally). | |
| _BUILD_HEADER = "A pocket-sized analyst, fine-tuned" | |
| _BUILD_WHAT = ( | |
| "We're building a tiny SQL analyst for shop owners: ask a question about your " | |
| "business data and it reads your tables, writes the SQL, checks its own answer, " | |
| "and shows its work. We took a small open model (Qwen3-1.7B) and fine-tuned it " | |
| "(SFT + GRPO) for this one job. It's small enough to run on your own laptop." | |
| ) | |
| _BUILD_PROOF = ( | |
| "On the app's sample business questions, an off-the-shelf Qwen3-1.7B gets " | |
| "<b>~4%</b> correct. After fine-tuning the same model gets <b>~49%</b>, an " | |
| "improvement of about 11×, and it improves on every sample dataset " | |
| "(pet-shop retail: 0% → 42%)." | |
| ) | |
| _BUILD_WIP = ( | |
| "Current status: one- and two-table questions are reliable, three-table joins " | |
| "are still a weak spot, and a longer training run showed signs of overfitting " | |
| "that we're analyzing." | |
| ) | |
| _HEAD2HEAD_INTRO = ( | |
| "A side-by-side on the same question: the same tiny model before and after " | |
| "fine-tuning. Off-the-shelf, it gives up or guesses wrong. Fine-tuned, it " | |
| "explores the data and answers." | |
| ) | |
| # The "what we're building" body (rendered inside the always-DARK card by | |
| # ``_why_card_html`` below, optionally with the head-to-head showcase nested in). | |
| _WHY_CARD_BODY = ( | |
| '<div class="eyebrow">What we\'re building</div>' | |
| f"<h3>{_BUILD_HEADER}</h3>" | |
| f"<p>{_BUILD_WHAT}</p>" | |
| f'<p class="ab-proof">{_BUILD_PROOF}</p>' | |
| f'<p class="ab-wip">{_BUILD_WIP}</p>' | |
| ) | |
| # Honest roadmap strip (v3 locked — deepen the agent, never train on user data). | |
| _ROADMAP_MD = ( | |
| "**What's next:** harder multi-table questions · more SQL dialects · the feedback " | |
| "loop sharpening the agent's SQL skill." | |
| ) | |
| # Single semantic accent (Visual Direction: "a single accent"). The shipped | |
| # orange-500 (#F97316) gives white-on-accent only 2.80:1 — fails WCAG AA | |
| # (needs >= 4.5:1). #C2410C (Tailwind orange-700) gives white-on-accent | |
| # 5.18:1 (computed via the WCAG relative-luminance formula), comfortably AA. | |
| # The hover shade (#9A3412, orange-800) is darker still (7.31:1), so the CTA | |
| # stays AA-compliant in every interaction state. | |
| _ACCENT = "#C2410C" | |
| _ACCENT_HOVER = "#9A3412" | |
| # Custom CSS: wrap the inline trace SQL on narrow viewports (FR5 — the SQL was | |
| # clipping on mobile inside the chat accordion; the dedicated panel already shows | |
| # it in full, this is polish so the inline copy wraps instead of overflowing). | |
| # Head-to-head showcase: pre-baked base-vs-fine-tuned traces (the judge proof), | |
| # generated offline (scripts/replay.py compare → data/showcase/…json) and rendered | |
| # STATIC (no GPU). One hero example shown in full + a compact "same pattern" list. | |
| _SHOWCASE_PATH = _REPO_ROOT / "data" / "showcase" / "retail_smb_base_vs_v2.json" | |
| def _load_showcase() -> list[dict]: | |
| """The curated base-vs-fine-tuned examples (empty list if the file is missing).""" | |
| try: | |
| data = json.loads(_SHOWCASE_PATH.read_text(encoding="utf-8")) | |
| return data if isinstance(data, list) else [] | |
| except (OSError, json.JSONDecodeError): | |
| return [] | |
| def _trace_html(trace: list[dict]) -> str: | |
| """One model's tool-call trace as compact rows (action · argument · result).""" | |
| rows = [] | |
| for s in trace: | |
| act = html.escape(str(s.get("action", ""))) | |
| arg = html.escape(str(s.get("argument", ""))[:64]) | |
| res = html.escape(str(s.get("result", ""))[:42]) | |
| rows.append( | |
| f'<div class="ab-step"><span class="act">{act}</span>' | |
| f'<code>{arg}</code><span class="res">{res}</span></div>' | |
| ) | |
| return "".join(rows) | |
| def _showcase_html(examples: list[dict]) -> str: | |
| """The head-to-head contrast: one hero example in full + a compact win list.""" | |
| if not examples: | |
| return "" | |
| hero = examples[0] | |
| def _col(side: dict, label: str, cls: str) -> str: | |
| ans = html.escape(str(side.get("final_answer"))) | |
| return ( | |
| f'<div class="ab-col {cls}"><div class="ab-col-h">{label}</div>' | |
| f'{_trace_html(side.get("trace", []))}' | |
| f'<div class="ab-final">answered <b>{ans}</b></div></div>' | |
| ) | |
| out = [f'<p class="ab-h2h-intro">{_HEAD2HEAD_INTRO}</p>'] | |
| out.append( | |
| '<div class="ab-h2h-hero">' | |
| f'<div class="ab-h2h-q">“{html.escape(hero["question"])}” ' | |
| '<span class="ab-gold">correct answer: ' | |
| f'{html.escape(str(hero["gold_answer"]))}</span></div>' | |
| '<div class="ab-h2h-cols">' | |
| + _col(hero["base"], "Off-the-shelf ✗", "ab-base") | |
| + _col(hero["v2"], "Fine-tuned ✓", "ab-v2") | |
| + "</div></div>" | |
| ) | |
| if len(examples) > 1: | |
| items = [] | |
| for ex in examples[1:]: | |
| base_ans = ex["base"].get("final_answer") | |
| base_ans = "gave up" if base_ans in (None, "—", "") else str(base_ans)[:22] | |
| v2_ans = html.escape(str(ex["v2"].get("final_answer"))[:22]) | |
| items.append( | |
| f'<li>“{html.escape(ex["question"])}” — ' | |
| f'off-the-shelf <span class="ab-x">{html.escape(base_ans)}</span> · ' | |
| f'ours <span class="ab-ok">{v2_ans} ✓</span></li>' | |
| ) | |
| out.append( | |
| '<div class="ab-h2h-more"><div class="ab-col-h">' | |
| "More questions, same pattern</div><ul>" + "".join(items) + "</ul></div>" | |
| ) | |
| return f'<div class="ab-h2h">{"".join(out)}</div>' | |
| def _why_card_html(examples: list[dict]) -> str: | |
| """The always-DARK "what we're building" card — the build copy PLUS (when present) | |
| the base-vs-fine-tuned head-to-head NESTED inside it. | |
| The showcase's contrast colours (white question, light intro, the win list) are | |
| tuned for a dark surface; nesting them in the dark card is what lets them read in | |
| BOTH light and dark themes (the card is intentionally dark in both — the same | |
| decision as the original "why a tiny model" card). Rendered outside the card, the | |
| light-on-light text vanished in light mode. | |
| """ | |
| body = _WHY_CARD_BODY | |
| if examples: | |
| body += _showcase_html(examples) | |
| return f'<div class="ab-why-card">{body}</div>' | |
| _APP_CSS = """ | |
| /* Theme-tuned accent: structural surfaces use Gradio's own theme variables (so they | |
| flip light/dark automatically); only the rust accent is hard-coded — and bumped | |
| BRIGHTER under .dark so it reads on navy. --ab-soft/--ab-soft-line are the warm | |
| translucent surface + border shared by the pill, accordions, data card + feedback | |
| bar. (Design handover: light+dark.) */ | |
| :root { | |
| --ab-rust: #bf3e11; --ab-rust-ink: #8f3b1c; | |
| --ab-soft: #fdf3ee; --ab-soft-line: #e7d2c7; | |
| } | |
| .dark { | |
| --ab-rust: #f0863a; --ab-rust-ink: #f0a868; | |
| --ab-soft: rgba(240, 134, 58, 0.10); --ab-soft-line: rgba(240, 134, 58, 0.28); | |
| } | |
| .gradio-container { | |
| width: 100% !important; | |
| max-width: 860px !important; | |
| margin: 0 auto !important; | |
| } | |
| footer { display: none !important; } | |
| .message-row pre, .message-row code { | |
| white-space: pre-wrap !important; | |
| word-break: break-word !important; | |
| overflow-wrap: anywhere !important; | |
| } | |
| @media (max-width: 768px) { | |
| .gradio-container { padding-left: 8px !important; padding-right: 8px !important; } | |
| } | |
| /* Subtle, theme-ADAPTIVE page background: a faint warm glow at the top, layered | |
| (translucent) over whatever the native light/dark background is — so the content | |
| "floats" without re-hardcoding light. Reads neat on both themes. */ | |
| body, gradio-app { | |
| background-image: radial-gradient( | |
| 130% 55% at 50% -8%, rgba(191, 62, 17, 0.06), transparent 60% | |
| ) !important; | |
| background-attachment: fixed !important; | |
| } | |
| /* 1. No double-box: neutralize the top "card" so the hero/ask/chips flow on the | |
| page — only the answer card + accordions stay framed. gr.Group wraps its content | |
| in an inner `.styler` that carries the slate group surface, so we transparent | |
| BOTH (the group div + its styler) or the panel lingers behind the hero. */ | |
| .ab-top-card, | |
| .ab-top-card > .styler { | |
| border: none !important; background: transparent !important; | |
| box-shadow: none !important; | |
| } | |
| .ab-top-card { padding: 8px 0 0 !important; } | |
| .ab-top-card > .styler { padding: 0 !important; } | |
| /* Animated "working" indicator — a real CSS spinner, shown while an ask runs. */ | |
| .ab-status { | |
| display: flex; align-items: center; gap: 10px; | |
| color: var(--ab-rust); font-weight: 500; padding: 6px 2px; | |
| } | |
| .ab-spinner { | |
| width: 18px; height: 18px; flex: none; | |
| border: 2.5px solid var(--ab-soft-line); | |
| border-top-color: var(--ab-rust); border-radius: 50%; | |
| animation: ab-spin 0.7s linear infinite; | |
| } | |
| @keyframes ab-spin { to { transform: rotate(360deg); } } | |
| /* Theme: we do NOT force light — the app follows the user's light/dark setting. | |
| Only RUST accents + a few translucent-warm surfaces (readable on either bg); | |
| structural text/surfaces use Gradio's own theme variables so nothing inverts. */ | |
| /* Hero: text adapts to the theme; the pill is a warm accent that reads on both. */ | |
| .ab-hero { text-align: center; padding: 6px 0 2px; } | |
| .ab-hero h1 { | |
| font-size: 32px; font-weight: 700; letter-spacing: -0.025em; | |
| color: var(--body-text-color); line-height: 1.12; margin: 2px 0 12px; | |
| } | |
| .ab-hero p { | |
| font-size: 17px; color: var(--body-text-color-subdued); line-height: 1.5; | |
| max-width: 540px; margin: 0 auto; | |
| } | |
| /* 2. Segmented pill — theme-tuned rust on the warm translucent surface. */ | |
| .ab-pill { | |
| display: flex; border: 1px solid var(--ab-soft-line); background: var(--ab-soft); | |
| border-radius: 999px; overflow: hidden; margin-bottom: 18px; | |
| } | |
| .ab-pill > span { | |
| flex: 1; display: flex; align-items: center; justify-content: center; | |
| text-align: center; padding: 10px 16px; font-size: 13px; font-weight: 600; | |
| color: var(--ab-rust-ink); border-right: 1px solid var(--ab-soft-line); | |
| } | |
| .ab-pill > span:last-child { border-right: 0; } | |
| .ab-pill > span.mid { color: var(--ab-rust); font-weight: 700; } | |
| /* Buttons: primary = AA rust; secondary chips get a rust hover/selected accent. */ | |
| .gradio-container button.primary { | |
| background: #C2410C !important; color: #ffffff !important; | |
| border: none !important; border-radius: 12px !important; | |
| } | |
| .gradio-container button.primary:hover { background: #9A3412 !important; } | |
| .gradio-container button.secondary:hover, | |
| .gradio-container button.secondary:focus, | |
| .gradio-container button.secondary:active { | |
| border-color: var(--ab-rust) !important; color: var(--ab-rust) !important; | |
| } | |
| /* Component labels ("Result", the SQL block label): quiet, not a loud chip. | |
| The SQL code label was a bright-orange filled pill — calm it to subdued text. */ | |
| .gradio-container .block-label, | |
| .gradio-container .ab-sql .block-label, | |
| .gradio-container .ab-sql label, | |
| .gradio-container .ab-sql [class*="label"] { | |
| background: var(--block-background-fill) !important; | |
| color: var(--body-text-color-subdued) !important; font-weight: 600; | |
| } | |
| .gradio-container .ab-sql .block-label svg { color: var(--body-text-color-subdued); } | |
| /* The SQL panel is one short query: "copy" (paste it into your own DB tool) is | |
| useful, but the built-in "download" just saves a stray file.txt — hide it, keep | |
| copy. */ | |
| .ab-sql button[title="Download"], .ab-sql a.download-link { display: none !important; } | |
| /* 4. Accordion headers: boxed, ONE rust chevron that rotates on open, connected | |
| panel. The warm translucent tint reads on light AND dark (theme-tuned vars). */ | |
| .gradio-container .label-wrap { | |
| border: 1px solid var(--ab-soft-line) !important; | |
| background: var(--ab-soft) !important; border-radius: 12px !important; | |
| padding: 13px 18px !important; font-weight: 600 !important; | |
| } | |
| .gradio-container .label-wrap .icon { | |
| color: var(--ab-rust) !important; transition: transform 0.2s ease; | |
| transform: rotate(0deg) !important; /* closed: clean ▼ (not a sideways ◄) */ | |
| } | |
| .gradio-container .label-wrap.open .icon { transform: rotate(180deg) !important; } | |
| .gradio-container .label-wrap.open { border-radius: 12px 12px 0 0 !important; } | |
| /* Kill the harsh default focus ring that rendered as a stark black/white box around | |
| the focused accordion (the whole block, via :focus-within) — keep a warm border. */ | |
| .gradio-container .label-wrap:focus, | |
| .gradio-container .label-wrap:focus-visible { outline: none !important; } | |
| .gradio-container .block:has(.label-wrap) { | |
| border-color: var(--ab-soft-line) !important; | |
| } | |
| .gradio-container .block:has(.label-wrap):focus-within { | |
| box-shadow: none !important; outline: none !important; | |
| } | |
| .gradio-container .label-wrap + div { | |
| border: 1px solid var(--border-color-primary); border-top: none; | |
| border-radius: 0 0 12px 12px; padding: 14px 18px; | |
| } | |
| /* Ask row: a roomy input + an inline rust button. A subtle shaded fill + clearer | |
| border + inner shadow make it obviously the place to type (it was blending into | |
| the page). */ | |
| .ab-ask-row { | |
| gap: 10px !important; align-items: stretch !important; margin: 4px 0 6px; | |
| } | |
| .ab-ask-row textarea { | |
| border-radius: 14px !important; padding: 16px 18px !important; | |
| font-size: 16px !important; min-height: 76px !important; | |
| line-height: 1.4 !important; | |
| background: var(--background-fill-secondary) !important; | |
| border: 1px solid var(--border-color-primary) !important; | |
| box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.05) !important; | |
| } | |
| .ab-ask-row textarea:focus { | |
| border-color: var(--ab-rust) !important; | |
| box-shadow: 0 0 0 3px rgba(240, 134, 58, 0.15) !important; | |
| } | |
| .ab-ask-row button.primary { | |
| padding: 0 30px !important; font-size: 16px !important; font-weight: 700; | |
| white-space: nowrap; min-height: 76px; | |
| } | |
| /* 3. Chips: real gaps + pill borders (stop the fused bar). */ | |
| .ab-starter-row { | |
| justify-content: center !important; flex-wrap: wrap; gap: 8px !important; | |
| margin: 6px 0 14px !important; | |
| } | |
| .ab-starter-row button.secondary { | |
| border: 1px solid var(--border-color-primary) !important; | |
| border-radius: 20px !important; background: var(--block-background-fill) !important; | |
| } | |
| .ab-center, .ab-center p { text-align: center !important; } | |
| /* Persona chips: centered, roomy, individually bordered; CSV "soon" dashed. */ | |
| .ab-persona-row { | |
| justify-content: center !important; gap: 10px !important; flex-wrap: wrap; | |
| margin: 8px 0 !important; | |
| } | |
| .ab-persona-row button { | |
| flex: 0 0 auto !important; min-width: 0 !important; width: auto !important; | |
| padding: 11px 18px !important; font-size: 15px !important; | |
| border-radius: 11px !important; | |
| } | |
| .gradio-container .ab-csv-soon button, | |
| .gradio-container .ab-csv-soon button:hover, | |
| .gradio-container .ab-csv-soon button:focus { | |
| border: 1px dashed var(--ab-soft-line) !important; | |
| background: var(--ab-soft) !important; | |
| color: var(--body-text-color-subdued) !important; | |
| cursor: default !important; border-radius: 11px; | |
| } | |
| /* Data overview ("what's in this data"): warm panel; tiles adapt to the theme. */ | |
| .ab-data-card { | |
| background: var(--ab-soft); border: 1px solid var(--ab-soft-line); | |
| border-radius: 12px; padding: 16px 18px; | |
| } | |
| .ab-data-card .ttl { font-size: 13px; font-weight: 700; color: var(--ab-rust); } | |
| .ab-data-card .sub { | |
| font-size: 13px; color: var(--body-text-color-subdued); font-style: italic; | |
| margin: 2px 0 12px; | |
| } | |
| .ab-tiles { display: grid; grid-template-columns: repeat(3, 1fr); gap: 8px; } | |
| .ab-tile { | |
| background: var(--block-background-fill); | |
| border: 1px solid var(--ab-soft-line); border-radius: 9px; padding: 10px 12px; | |
| } | |
| .ab-tile .nm { | |
| font-family: "IBM Plex Mono", monospace; font-size: 12.5px; | |
| font-weight: 700; color: var(--body-text-color); | |
| } | |
| .ab-tile .ct { font-size: 17px; font-weight: 700; color: var(--ab-rust); } | |
| .ab-tile .ct small { | |
| font-size: 10px; color: var(--body-text-color-subdued); font-weight: 500; | |
| } | |
| @media (max-width: 640px) { .ab-tiles { grid-template-columns: repeat(2, 1fr); } } | |
| /* Answer card: ONE clean framed panel with breathing room. Gradio nests gray | |
| block-fills (the card group, the plot container) that read as muddy boxes — flatten | |
| them to a single surface, then give the chart + table a real gap (they were 1px | |
| apart) and pad the card so nothing crowds its edges. */ | |
| .ab-answer-card { | |
| border: 1px solid var(--ab-soft-line) !important; | |
| border-radius: 14px !important; | |
| background: var(--background-fill-primary) !important; | |
| padding: 20px 22px !important; margin-top: 14px !important; | |
| } | |
| .ab-answer-card .gr-group, | |
| .ab-answer-card > .styler { | |
| background: transparent !important; border: none !important; | |
| } | |
| /* Chart | table: a real column gap so the result is not glued to the graph; TOP | |
| aligned so a short chart beside a tall table doesn't leave a gray half-column. */ | |
| .ab-answer-card .row { gap: 28px !important; align-items: flex-start !important; } | |
| .ab-answer-card .row > .block { | |
| background: transparent !important; border: none !important; | |
| box-shadow: none !important; | |
| } | |
| .ab-answer-card table { font-size: 13px !important; } | |
| .ab-answer-card td, .ab-answer-card th { padding: 7px 12px !important; } | |
| .ab-answer-card .ab-prov { margin-top: 14px !important; } | |
| .ab-prov { | |
| font-size: 12.5px; color: var(--body-text-color-subdued); | |
| background: var(--background-fill-secondary); border-radius: 8px; padding: 8px 12px; | |
| } | |
| /* 5. Feedback bar: a tidy footer; buttons hug the question, the reassurance note | |
| sits on the right (theme-tuned warm surface + border). */ | |
| .ab-feedback-bar { | |
| background: var(--ab-soft) !important; | |
| border: 1px solid var(--ab-soft-line) !important; | |
| border-radius: 12px !important; padding: 14px 16px !important; | |
| margin-top: 10px !important; | |
| } | |
| /* gr.Group wraps content in a `.styler` (and a nested group) that paints the slate | |
| block surface OVER our warm fill — make those transparent so --ab-soft shows. */ | |
| .ab-feedback-bar .styler { background: transparent !important; } | |
| .ab-feedback-bar .gr-group { | |
| border: none !important; background: transparent !important; | |
| } | |
| .ab-feedback-row { | |
| align-items: center !important; gap: 10px !important; flex-wrap: wrap; | |
| } | |
| /* Gradio wraps each Markdown/Button in a width:100% block, which would make the | |
| label fill the row and wrap the buttons below (and collapse the note to 0). Force | |
| the label + note to size to their own content so the row reads: label · 👍 · 👎 … | |
| note(right). */ | |
| .ab-feedback-row > .ab-fb-label, | |
| .ab-feedback-row > .ab-fb-note { | |
| flex: 0 0 auto !important; width: auto !important; min-width: 0 !important; | |
| } | |
| .ab-feedback-row > .ab-fb-label p, .ab-feedback-row > .ab-fb-note p { | |
| margin: 0 !important; white-space: nowrap; | |
| } | |
| .ab-feedback-row > button { flex: 0 0 auto !important; } | |
| .ab-feedback-row > .ab-fb-note { | |
| margin-left: auto !important; font-size: 12px !important; | |
| color: var(--body-text-color-subdued) !important; | |
| } | |
| .ab-feedback-bar button.secondary { | |
| background: var(--block-background-fill) !important; | |
| border: 1px solid var(--ab-soft-line) !important; border-radius: 9px !important; | |
| } | |
| /* Hide the chatbot's per-message utility icons (copy/share/trash) in the trace. */ | |
| .ab-trace .icon-button-wrapper, .ab-trace .copy-button { display: none !important; } | |
| /* "Why a tiny model is enough": full-width boxed accordion, aligned with | |
| "Advanced — choose the model" (user request), opening an always-DARK card. */ | |
| #why-tiny { margin: 12px 0 0 !important; } | |
| #why-tiny .label-wrap + div { border: none !important; padding: 10px 0 0 !important; } | |
| .ab-why-card { background: #161b24; border-radius: 12px; padding: 18px 22px; } | |
| .ab-why-card .eyebrow { | |
| color: #ee9a6c; text-transform: uppercase; font-size: 12px; font-weight: 700; | |
| letter-spacing: 0.05em; margin-bottom: 8px; | |
| } | |
| .ab-why-card p { color: #d6dae0; font-size: 14.5px; line-height: 1.6; margin: 0; } | |
| .ab-why-card b { color: #ffffff; } | |
| .ab-why-card h3 { color: #fff; font-size: 18px; margin: 2px 0 8px; font-weight: 650; } | |
| .ab-why-card .ab-proof { color: #eef1f4; font-size: 15px; margin-top: 12px; | |
| border-left: 3px solid #f6c453; padding-left: 12px; } | |
| .ab-why-card .ab-wip { color: #9aa3ad; font-size: 13px; margin-top: 12px; | |
| font-style: italic; } | |
| /* Head-to-head: off-the-shelf vs fine-tuned (the judge proof) */ | |
| .ab-h2h { margin-top: 18px; border-top: 1px solid #2c3038; padding-top: 14px; } | |
| .ab-h2h-intro { color: #c8ccd2; font-size: 13.5px; margin: 0 0 10px !important; } | |
| .ab-h2h-q { color: #fff; font-weight: 600; font-size: 14.5px; margin-bottom: 8px; } | |
| .ab-h2h-q .ab-gold { color: #7ee0a2; font-weight: 500; font-size: 13px; } | |
| .ab-h2h-cols { display: flex; gap: 10px; flex-wrap: wrap; } | |
| .ab-col { flex: 1 1 240px; background: #13161c; border-radius: 8px; | |
| padding: 10px 12px; border: 1px solid #2c3038; } | |
| .ab-col.ab-base { border-color: #5b2b30; } | |
| .ab-col.ab-v2 { border-color: #27513a; } | |
| .ab-col-h { font-weight: 650; font-size: 12.5px; margin-bottom: 6px; color: #e6e9ee; | |
| text-transform: uppercase; letter-spacing: .03em; } | |
| .ab-step { font-size: 11.5px; color: #aeb4bd; margin: 2px 0; display: flex; | |
| gap: 6px; align-items: baseline; } | |
| .ab-step .act { color: #8ea0b6; font-weight: 600; min-width: 62px; } | |
| .ab-step code { color: #d6dae0; background: #0e1014; padding: 0 4px; | |
| border-radius: 4px; font-size: 11px; } | |
| .ab-step .res { color: #6f7884; font-style: italic; } | |
| .ab-final { margin-top: 8px; font-size: 13px; color: #e6e9ee; } | |
| .ab-col.ab-base .ab-final b { color: #ff8f8f; } | |
| .ab-col.ab-v2 .ab-final b { color: #7ee0a2; } | |
| .ab-h2h-more { margin-top: 14px; } | |
| .ab-h2h-more ul { margin: 6px 0 0; padding-left: 18px; } | |
| .ab-h2h-more li { color: #c8ccd2; font-size: 13px; margin: 4px 0; } | |
| .ab-x { color: #ff8f8f; } | |
| .ab-ok { color: #7ee0a2; font-weight: 600; } | |
| /* comfortable touch targets for the "run live" chips (WCAG 2.5.5) */ | |
| #why-tiny .ab-starter-row button { min-height: 38px; } | |
| .ab-footer-note { margin-top: 26px !important; } | |
| /* SQL block: keep gradio's native code theme (readable in light + dark). */ | |
| .gradio-container .cm-editor { font-size: 13px; border-radius: 9px; } | |
| """ | |
| # The animated working indicator (shown in a gr.HTML while an ask streams). The CSS | |
| # spinner above does the spinning — no static glyph. | |
| _SPINNER_HTML = ( | |
| '<div class="ab-status" role="status" aria-live="polite">' | |
| '<span class="ab-spinner"></span> Reading your data…</div>' | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Default deterministic demo-script policy (NO model). Recognizes the 3 example | |
| # decision-questions and walks a fixed, dataset-appropriate plan against the | |
| # committed retail_smb DB — DESCRIBE -> QUERY(s) -> ANSWER — with ONE question | |
| # (the revenue-each-month one) scripted to hit a wrong-column error and then | |
| # self-correct, so the no-model demo still shows the P0 "SQL on error + recovery" | |
| # beat (Primary Journey step 4 / demo script beat 3). For an arbitrary TYPED | |
| # (non-example) question it falls back to an HONEST, clearly-labeled placeholder | |
| # turn (DESCRIBE a table + a simple count) — it does NOT pretend to answer | |
| # questions it wasn't scripted for. Pure Policy (select_action only); the real | |
| # ModelPolicy still swaps in at F006 via build_demo(policy_factory=...). | |
| # --------------------------------------------------------------------------- | |
| # Each scripted step is an (action_type, argument) pair. A QUERY whose SQL | |
| # references a non-existent column produces an env error step (status="error"), | |
| # which is exactly the self-correction beat we want to show for question 2. | |
| _DemoStep = tuple[str, str] | |
| # Scripts keyed by the normalized example question. Each script ends in ANSWER. | |
| # Every ANSWER's stated number/name equals the real gold result on retail_smb | |
| # (computed against the committed DB, not guessed). Chart blocks use bar/line only; | |
| # x/y are column names the QUERY actually returns. | |
| _DEMO_SCRIPTS: dict[str, list[_DemoStep]] = { | |
| "which store has the most orders?": [ | |
| ("DESCRIBE", "orders"), | |
| ( | |
| "QUERY", | |
| "SELECT s.name AS store, COUNT(*) AS orders " | |
| "FROM orders o JOIN stores s ON o.store_id = s.store_id " | |
| "GROUP BY s.name ORDER BY orders DESC", | |
| ), | |
| ( | |
| "ANSWER", | |
| "Maple Street Pets is your busiest store with 60 orders — well ahead " | |
| "of Old Town Pet & Feed (49) and Harbourside Grooming (41). That is " | |
| "where an extra weekend hire would help most.\n" | |
| '```chart\n{"chart_type": "bar", "x": "store", "y": "orders"}\n```', | |
| ), | |
| ], | |
| "what was my revenue each month?": [ | |
| ("DESCRIBE", "orders"), | |
| # Intentional mistake: `total` is not a column (the real one is `amount`). | |
| # This yields a visible error step + the failed SQL (P0), then the next | |
| # step recovers with `amount` AND `WHERE status = 1` (refunds excluded) — | |
| # the demo's self-correction beat plus the refund-aware revenue beat. | |
| ( | |
| "QUERY", | |
| "SELECT strftime('%Y-%m', order_date) AS month, SUM(total) AS revenue " | |
| "FROM orders WHERE status = 1 GROUP BY month ORDER BY month", | |
| ), | |
| ( | |
| "QUERY", | |
| "SELECT strftime('%Y-%m', order_date) AS month, " | |
| "ROUND(SUM(amount), 2) AS revenue " | |
| "FROM orders WHERE status = 1 GROUP BY month ORDER BY month", | |
| ), | |
| ( | |
| "ANSWER", | |
| "Paid-order revenue ran from about £166 in your quietest month " | |
| "(September) up to roughly £828 in your best (June), starting the year " | |
| "at £558.91 in January and finishing at £583.47 in December — about " | |
| "£6,552 across the year (refunds excluded).\n" | |
| '```chart\n{"chart_type": "line", "x": "month", "y": "revenue"}\n```', | |
| ), | |
| ], | |
| "which product makes the most revenue?": [ | |
| ("DESCRIBE", "order_items"), | |
| ( | |
| "QUERY", | |
| "SELECT p.name AS product, ROUND(SUM(oi.line_amount), 2) AS revenue " | |
| "FROM order_items oi JOIN products p ON oi.product_id = p.product_id " | |
| "GROUP BY p.name ORDER BY revenue DESC", | |
| ), | |
| ( | |
| "ANSWER", | |
| "Grain-Free Dog Food 5kg is your top earner at £1,168.50, ahead of the " | |
| "Reflective Lead (£910.00) and the Fleece Pet Blanket (£819.59). Keep " | |
| "it well stocked.\n" | |
| '```chart\n{"chart_type": "bar", "x": "product", "y": "revenue"}\n```', | |
| ), | |
| ], | |
| } | |
| _GENERIC_FALLBACK_ANSWER = "See the result table above." | |
| def _any_query_succeeded(observation: SQLObservation) -> bool: | |
| """True if any prior QUERY in this episode ran without error. | |
| Reads ``observation.action_history`` (the env logs each step as e.g. | |
| ``"QUERY -> ERROR: ..."`` on failure or ``"QUERY -> <result preview>"`` on | |
| success). A scripted plan run against the wrong dataset produces only | |
| ``QUERY -> ERROR`` lines; the retail happy path always lands at least one | |
| successful QUERY (the recovery query on the revenue script). This lets the | |
| policy refuse to emit a scripted ANSWER unless the plan actually executed on | |
| the active DB. | |
| """ | |
| for entry in observation.action_history: | |
| text = entry.strip() | |
| if text.startswith("QUERY ->") and "QUERY -> ERROR" not in text: | |
| return True | |
| return False | |
| class _DemoScriptPolicy: | |
| """Deterministic, model-free policy driven by a per-question script. | |
| Reads ``observation.question`` (the env surfaces the user's question there) | |
| and, when it matches one of the 3 example decision-questions AND the ACTIVE | |
| dataset IS the retail demo DB, walks that question's scripted | |
| ``DESCRIBE -> QUERY(s) -> ANSWER`` plan against the demo dataset. The | |
| revenue script includes a wrong-column QUERY so the env returns an error | |
| step (the failed SQL is then shown), followed by a recovery QUERY — the | |
| visible self-correction beat. For any other (typed) question — OR an example | |
| question asked against a DIFFERENT (e.g. uploaded) dataset — it falls back to | |
| an honest, labeled placeholder turn rather than fabricating a retail answer. | |
| Deterministic and instant; no torch/transformers. The real ``ModelPolicy`` | |
| swaps in at F006. | |
| The scripted DESCRIBE/QUERY/ANSWER plans hard-code retail table and column | |
| names, so they only make sense on the retail demo DB. ``active_db_id`` carries | |
| which dataset the current ask runs against (``_handle_ask`` sets it from | |
| ``db_state["db_id"]``); when it does not equal ``demo_db_id`` the scripted | |
| plan is never used. As belt-and-suspenders (and to cover any future db_id | |
| that happens to match by name but not by schema), a scripted ANSWER is only | |
| emitted once at least one QUERY has actually SUCCEEDED on the active DB; if | |
| every scripted QUERY errored, the honest fallback answer is returned instead. | |
| """ | |
| def __init__( | |
| self, | |
| *, | |
| demo_db_id: str = _DEMO_DB_ID, | |
| active_db_id: str = _DEMO_DB_ID, | |
| ) -> None: | |
| self._plan_index = 0 | |
| self._demo_db_id = demo_db_id | |
| # Which dataset the current ask runs against. Defaults to the demo DB so | |
| # the policy scripts the retail examples out of the box (the shipped | |
| # default + every direct-construction test). ``_handle_ask`` overrides | |
| # this per-ask from ``db_state["db_id"]`` so an example chip clicked over | |
| # an uploaded dataset falls through to the honest path. | |
| self.active_db_id = active_db_id | |
| def select_action(self, observation: SQLObservation) -> SQLAction: | |
| plan = self._plan_for(observation) | |
| # Answer once budget is nearly spent regardless of plan position, so a | |
| # script never runs the episode out of steps without a final answer. | |
| if observation.budget_remaining <= 1: | |
| return SQLAction( | |
| action_type="ANSWER", argument=self._fallback_answer(observation, plan) | |
| ) | |
| step = self._plan_index | |
| self._plan_index += 1 | |
| if step < len(plan): | |
| action_type, argument = plan[step] | |
| if action_type == "ANSWER": | |
| # Only emit a scripted ANSWER if a QUERY actually succeeded on the | |
| # active DB — otherwise the plan ran against the wrong dataset and | |
| # the scripted literal would contradict the empty table. | |
| return SQLAction( | |
| action_type="ANSWER", | |
| argument=self._answer_for(observation, argument), | |
| ) | |
| return SQLAction(action_type=action_type, argument=argument) | |
| return SQLAction( | |
| action_type="ANSWER", argument=self._fallback_answer(observation, plan) | |
| ) | |
| def _plan_for(self, observation: SQLObservation) -> list[_DemoStep]: | |
| """The scripted plan for this question, or an honest fallback plan. | |
| Scripts are retail-schema-specific, so they are ONLY used when the active | |
| dataset is the retail demo DB; on any other dataset (an uploaded CSV) the | |
| same example question falls through to the honest placeholder plan. | |
| """ | |
| if self.active_db_id != self._demo_db_id: | |
| return self._fallback_plan(observation) | |
| key = observation.question.strip().lower() | |
| scripted = _DEMO_SCRIPTS.get(key) | |
| if scripted is not None: | |
| return scripted | |
| return self._fallback_plan(observation) | |
| def _answer_for(self, observation: SQLObservation, scripted_answer: str) -> str: | |
| """The scripted ANSWER, but only if a QUERY actually succeeded. | |
| Belt-and-suspenders against a scripted plan whose QUERYs all errored on | |
| the active DB (the script's tables/columns don't exist there): emitting | |
| the scripted literal then would contradict the empty result table. When no | |
| QUERY succeeded, return the honest generic note instead. | |
| """ | |
| if _any_query_succeeded(observation): | |
| return scripted_answer | |
| return _GENERIC_FALLBACK_ANSWER | |
| def _fallback_plan(self, observation: SQLObservation) -> list[_DemoStep]: | |
| """Honest placeholder turn for a non-example typed question. | |
| Looks at the first available table and counts its rows, then answers with | |
| a LABELED disclaimer — this is a no-model stub, so it never pretends to | |
| have answered an arbitrary question. | |
| """ | |
| table = self._first_table(observation.schema_info) | |
| return [ | |
| ("DESCRIBE", table), | |
| ("QUERY", f"SELECT COUNT(*) AS n FROM {quote_ident(table)}"), | |
| ( | |
| "ANSWER", | |
| "Demo mode (no model loaded): I can't answer free-form questions " | |
| f"here yet — but `{table}` has the row count shown above. Try one " | |
| "of the example questions to see a full answer.", | |
| ), | |
| ] | |
| def _first_table(schema_info: str) -> str: | |
| for line in schema_info.splitlines(): | |
| stripped = line.strip() | |
| if not stripped.startswith("- "): | |
| continue | |
| candidate = stripped[2:] | |
| if ":" in candidate: | |
| candidate = candidate.split(":", maxsplit=1)[0] | |
| candidate = candidate.strip() | |
| if candidate: | |
| return candidate | |
| return "data" | |
| def _fallback_answer(observation: SQLObservation, plan: list[_DemoStep]) -> str: | |
| """The plan's final ANSWER argument, or a generic 'see the table' note. | |
| Budget/overrun path. A scripted ANSWER literal is only safe to surface | |
| here once a QUERY actually SUCCEEDED on the active DB (FIX 1b: otherwise | |
| the retail literal could leak via budget exhaustion when the script ran | |
| against the wrong dataset). When no QUERY succeeded, fall back to the | |
| honest generic note rather than the scripted prose. | |
| """ | |
| if _any_query_succeeded(observation): | |
| for action_type, argument in reversed(plan): | |
| if action_type == "ANSWER": | |
| return argument | |
| return observation.result.strip() or _GENERIC_FALLBACK_ANSWER | |
| def _default_policy_factory() -> Policy: | |
| """The default policy factory: a fresh deterministic demo-script policy.""" | |
| return _DemoScriptPolicy() | |
| # --------------------------------------------------------------------------- | |
| # Chat-history normalization — gradio 6.x preprocesses prior turns to list[dict] | |
| # (the messages Chatbot), but this handler appends gr.ChatMessage objects. To | |
| # keep turn 2+ internally consistent (C3) we coerce the WHOLE incoming history to | |
| # a uniform list[gr.ChatMessage] before appending, so the list never mixes types. | |
| # --------------------------------------------------------------------------- | |
| def _coerce_history(chat_history: list) -> list[gr.ChatMessage]: | |
| """Normalize a Chatbot history to a uniform ``list[gr.ChatMessage]``. | |
| Gradio 6.x preprocesses prior turns into ``list[dict]`` (``{"role","content", | |
| "metadata"}``) while this handler appends ``gr.ChatMessage`` objects; without | |
| normalization, turn 2 would carry a mixed ``[dict, ..., ChatMessage]`` list | |
| that only renders by serializer tolerance. Coercing every entry to | |
| ``gr.ChatMessage`` keeps the list one type end to end (C3). | |
| """ | |
| normalized: list[gr.ChatMessage] = [] | |
| for entry in chat_history: | |
| if isinstance(entry, gr.ChatMessage): | |
| normalized.append(entry) | |
| continue | |
| if isinstance(entry, dict): | |
| normalized.append( | |
| gr.ChatMessage( | |
| role=str(entry.get("role", "assistant")), | |
| content=entry.get("content", ""), | |
| metadata=dict(entry.get("metadata") or {}), | |
| ) | |
| ) | |
| continue | |
| # Unknown shape — render it as plain assistant text rather than drop it. | |
| normalized.append(gr.ChatMessage(role="assistant", content=str(entry))) | |
| return normalized | |
| # Terminal filler messages the agent loop appends after the real answer — skipped | |
| # when extracting "the model's answer" for the feedback record. | |
| _FILLER_ANSWERS = {"task complete.", "done.", "ended session.", ""} | |
| def _content_text(content: object) -> str: | |
| """Flatten a gradio message ``content`` (str, or 6.x list-of-parts) to text. | |
| Gradio 6.x can deserialize a message's ``content`` as a LIST of content parts | |
| (e.g. ``[{"type": "text", "text": "..."}]``) rather than a plain string, so | |
| calling string ops on it directly raised ``'list' object has no attribute | |
| 'strip'``. This collapses any shape to a plain string. | |
| """ | |
| if isinstance(content, str): | |
| return content | |
| if isinstance(content, list): | |
| parts: list[str] = [] | |
| for part in content: | |
| if isinstance(part, str): | |
| parts.append(part) | |
| elif isinstance(part, dict): | |
| parts.append(str(part.get("text") or part.get("value") or "")) | |
| else: | |
| parts.append(str(getattr(part, "text", "") or "")) | |
| return "".join(parts) | |
| return "" if content is None else str(content) | |
| def _last_answer_text(history: list) -> str: | |
| """The model's final answer text from a chat history (for the feedback record). | |
| Returns the last substantive assistant message (skipping terminal filler like | |
| 'Task complete.'), with surrounding markdown bold stripped. Best-effort context | |
| for the training candidate — never raises. | |
| """ | |
| for msg in reversed(_coerce_history(history)): | |
| if getattr(msg, "role", None) != "assistant": | |
| continue | |
| content = _content_text(getattr(msg, "content", "")).strip() | |
| if content.lower() not in _FILLER_ANSWERS: | |
| return content.strip("*").strip() | |
| return "" | |
| # --------------------------------------------------------------------------- | |
| # Step -> chat message mapper (the only gradio-coupling of the loop; pure). | |
| # --------------------------------------------------------------------------- | |
| # Plain-language titles per action type — NO model/RL/agent jargon (Content | |
| # Contract). DESCRIBE/SAMPLE speak about looking at the data, not "exploring". | |
| _DONE_TITLES = { | |
| "DESCRIBE": "Looked at the data", | |
| "SAMPLE": "Looked at a few rows", | |
| "QUERY": "Ran a query", | |
| "ANSWER": "Answer", | |
| } | |
| def _step_to_chat_message( | |
| step: Step, | |
| *, | |
| step_number: int | None = None, | |
| pending: bool = False, | |
| will_retry: bool = True, | |
| ) -> gr.ChatMessage: | |
| """Map ONE ``Step`` to ONE ``gr.ChatMessage`` accordion (pure on its inputs). | |
| Produces ``role="assistant"`` and a plain-language ``metadata["title"]`` (no | |
| model/RL jargon), prefixed with the 1-based ``step_number`` ("1 · …") so a long | |
| trace is easy to follow. The title names the table for DESCRIBE/SAMPLE so repeated | |
| look-ups are distinguishable, and the content shows WHAT the step returned (so the | |
| user can inspect it). ``metadata["status"]`` is ``"pending"`` when ``pending``, | |
| else ``"done"`` (with ``metadata["duration"]``). The error title only says | |
| "retrying" when a retry actually follows (``will_retry``); else neutral "Query | |
| failed" (N3). | |
| """ | |
| if step.status == "error": | |
| base = "Query failed — retrying" if will_retry else "Query failed" | |
| content = _error_content(step) | |
| else: | |
| base = _done_title(step) | |
| content = _done_content(step) | |
| title = f"{step_number} · {base}" if step_number else base | |
| metadata: dict[str, object] = {"title": title} | |
| if pending: | |
| metadata["status"] = "pending" | |
| else: | |
| metadata["status"] = "done" | |
| metadata["duration"] = round(step.duration_s, 2) | |
| return gr.ChatMessage(role="assistant", content=content, metadata=metadata) | |
| _RESULT_PREVIEW_CHARS = 1200 | |
| def _truncate(text: str, limit: int = _RESULT_PREVIEW_CHARS) -> str: | |
| """Cap long result text for the inline step preview.""" | |
| text = (text or "").strip() | |
| if len(text) <= limit: | |
| return text | |
| return text[:limit].rstrip() + "\n… (truncated)" | |
| def _done_title(step: Step) -> str: | |
| """Informative collapsed-trace title — names the table so repeated DESCRIBE | |
| steps are distinguishable ("Looked at `orders`", not just "Looked at the data").""" | |
| if step.action_type == "DESCRIBE": | |
| return f"Looked at `{step.argument}`" | |
| if step.action_type == "SAMPLE": | |
| return f"Sampled `{step.argument}`" | |
| return _DONE_TITLES.get(step.action_type, "Step") | |
| def _done_content(step: Step) -> str: | |
| """Human-readable detail for a successful step — including WHAT it returned.""" | |
| if step.action_type == "QUERY": | |
| body = f"The query it ran:\n```sql\n{step.argument}\n```" | |
| if step.result_text: | |
| body += f"\n\nWhat came back:\n```\n{_truncate(step.result_text)}\n```" | |
| return body | |
| if step.action_type == "ANSWER": | |
| return f"**{step.argument}**" | |
| if step.action_type in ("DESCRIBE", "SAMPLE"): | |
| body = f"Looked at `{step.argument}`." | |
| if step.result_text: | |
| body += f"\n\n```\n{_truncate(step.result_text)}\n```" | |
| return body | |
| return step.result_text or "" | |
| def _error_content(step: Step) -> str: | |
| """Human-readable detail for a failed step — always shows the failed SQL.""" | |
| detail = step.error or "That query didn't work." | |
| return f"{detail}\n\nThe query it tried:\n```sql\n{step.argument}\n```" | |
| # --------------------------------------------------------------------------- | |
| # Provenance ("show your work") — one quiet PLAIN-LANGUAGE line under the answer. | |
| # It paraphrases the table/filter of the final SELECT rather than repeating the | |
| # raw SQL (which already lives in the Code panel) — Q1. | |
| # --------------------------------------------------------------------------- | |
| _FROM_PATTERN = re.compile( | |
| r"\bFROM\s+([`\"\[]?[A-Za-z_][A-Za-z0-9_]*[`\"\]]?)", re.IGNORECASE | |
| ) | |
| _WHERE_PATTERN = re.compile( | |
| r"\bWHERE\s+(.+?)(?:\bGROUP\s+BY\b|\bORDER\s+BY\b|\bLIMIT\b|;|$)", | |
| re.IGNORECASE | re.DOTALL, | |
| ) | |
| def _strip_identifier_quotes(identifier: str) -> str: | |
| """Strip surrounding SQL identifier quotes (`` ` ``, ``"``, ``[]``).""" | |
| cleaned = identifier.strip() | |
| if len(cleaned) >= 2 and cleaned[0] in '`"[' and cleaned[-1] in '`"]': | |
| return cleaned[1:-1] | |
| return cleaned | |
| def _provenance_line(sql: str, *, row_count: int | None = None) -> str: | |
| """A quiet one-line PLAIN-LANGUAGE 'show your work' note (Q1). | |
| Paraphrases the primary table and WHERE filter of the final SELECT (e.g. | |
| "Looked at `orders`, filtered where `status = 1`, returned 4 rows") instead of | |
| repeating the raw SQL — that already lives in the Code panel. Falls back to a | |
| generic line when the SQL can't be parsed. | |
| """ | |
| stripped = sql.strip() | |
| if not stripped: | |
| return _EMPTY_PROVENANCE | |
| from_match = _FROM_PATTERN.search(stripped) | |
| if from_match is None: | |
| return "_Show your work: ran a read-only query on your data._" | |
| table = _strip_identifier_quotes(from_match.group(1)) | |
| parts = [f"looked at `{table}`"] | |
| where_match = _WHERE_PATTERN.search(stripped) | |
| if where_match is not None: | |
| condition = " ".join(where_match.group(1).split()) | |
| if condition: | |
| parts.append(f"filtered where `{condition}`") | |
| if row_count is not None: | |
| plural = "row" if row_count == 1 else "rows" | |
| parts.append(f"returned {row_count} {plural}") | |
| body = ", ".join(parts) | |
| return f"_Show your work: {body} (read-only)._" | |
| # --------------------------------------------------------------------------- | |
| # _handle_ask — the streaming generator behind 'Ask your data' / example chips. | |
| # --------------------------------------------------------------------------- | |
| def _empty_table() -> dict: | |
| """An empty gr.Dataframe payload (no rows, no headers).""" | |
| return {"headers": [], "data": []} | |
| def _table_payload(rows: list[tuple], columns: list[str]) -> dict: | |
| """A gr.Dataframe payload from re-executed rows + real headers.""" | |
| return {"headers": list(columns), "data": [list(row) for row in rows]} | |
| # Chart styling — the app UI is dark, so a default white vega-lite chart looks out | |
| # of place. We render on a TRANSPARENT background with light axis/label/title | |
| # colours and a single indigo accent for the bars/line/markers (Visual Direction: | |
| # "a single accent"). Many-category x axes stay readable via vega-lite's own | |
| # label handling: a -45° label angle plus a labelLimit that truncates long labels | |
| # with a client-side ellipsis — no manual thinning/truncation needed. | |
| # Warm-light chart palette (v3 design): rust marks on a white card with quiet axes. | |
| # Best practice for "which is biggest?" charts: highlight the focal (max) bar in a | |
| # saturated colour and mute the rest, so the eye lands on the answer (the designer's | |
| # two-tone rust). The exact answer values live in the table beside the chart. | |
| _CHART_ACCENT = "#bf3e11" # rust — the leader bar / line / point | |
| _CHART_ACCENT_MUTED = "#eaa97f" # light rust — the runner-up bars | |
| _CHART_FG = "#8a93a0" # mid gray — legible on both light and dark surfaces | |
| _CHART_GRID = "#eceef1" # light grid lines | |
| _CHART_DOMAIN = "#d1d5db" # gray-300 (axis domain line) | |
| _CHART_LABEL_LIMIT = 90 # px budget before vega-lite ellipsis-truncates a label | |
| # Many-category bar charts (e.g. 20 products) render as a forest of unreadable | |
| # slivers. Cap the BAR chart to the top-N by value so the eye lands on the leaders; | |
| # the table beside it still carries every row. 6 dovetails with the value-label | |
| # threshold below, so the capped bars also get their numbers. | |
| _CHART_MAX_BARS = 6 | |
| # Bar x-axis labels (category names like "Grain-Free Dog Food 5kg") don't fit under a | |
| # narrow bar, so abbreviate the DISPLAYED tick to its first N chars (digits too) with a | |
| # trailing ellipsis, angled for density — the FULL label stays on hover (tooltip) and in | |
| # the table. Abbreviating only the tick (via vega labelExpr), NOT the x field, keeps two | |
| # names that share a prefix as distinct bars. | |
| _CHART_LABEL_ABBR = 4 | |
| def _render_chart( | |
| spec: ChartSpec | None, | |
| rows: list[tuple], | |
| columns: list[str], | |
| ) -> "alt.Chart | None": | |
| """Render a chart ``ChartSpec`` to an interactive Altair (vega-lite) chart. | |
| Returns ``None`` — meaning "show the table only, no chart" — when ``spec`` is | |
| None, ``rows`` is empty, or the resolved ``spec.y`` column can't be coerced to | |
| numeric (a chart of a non-numeric y axis is degenerate; the column-presence | |
| validation already happened upstream in ``validate_intent`` — dtype is enforced | |
| only here, at render time, where the DataFrame exists). | |
| Otherwise it builds an Altair chart for ``spec.chart_type`` (bar/line/scatter) | |
| with a per-point ``tooltip`` encoding — that hover interactivity is the whole | |
| point of using vega-lite over a static image. The x axis is treated as | |
| nominal/ordinal so string x values (product names, "2024-01" months) plot | |
| correctly; a ``-45°`` label angle plus a ``labelLimit`` keep many-category axes | |
| readable (vega-lite truncates long labels with a client-side ellipsis — no | |
| manual thinning needed). Styled for the dark app UI: a transparent background | |
| with light axis colours and a single indigo accent (or ``spec.color`` when set). | |
| Rendering through ONE ``gr.Plot`` (fed this single chart) — rather than the | |
| 3 toggled vega ``gr.*Plot`` of the original seam — is what fixes the multi-turn | |
| crash: hiding a previously-visible plot on a type switch threw a | |
| ``ResizeObserver`` ``offsetWidth`` error and aborted the new chart. A single | |
| component is never hidden-then-reshown mid-stream, so hover is restored WITHOUT | |
| reintroducing the disappear bug. | |
| """ | |
| if spec is None or not rows: | |
| return None | |
| df = pd.DataFrame([list(row) for row in rows], columns=list(columns)) | |
| # Hide rather than draw a degenerate chart when y can't be plotted numerically. | |
| if pd.to_numeric(df[spec.y], errors="coerce").isna().all(): | |
| return None | |
| use_color = spec.color is not None and spec.color in columns | |
| tooltip = [spec.x, spec.y] + ([spec.color] if use_color else []) | |
| # Minimal axes: horizontal category labels (angled ones read as clutter), long | |
| # names ellipsis-truncate and keep full text in the tooltip; no ticks/domain. | |
| x_axis = alt.Axis( | |
| labelAngle=0, labelLimit=_CHART_LABEL_LIMIT, labelPadding=8, | |
| ticks=False, domain=False, | |
| ) | |
| if spec.chart_type == "line": | |
| x = alt.X(f"{spec.x}:N", title=None, axis=x_axis) | |
| # Lines need a y scale to read; keep it quiet (no domain/ticks, faint grid). | |
| y = alt.Y( | |
| f"{spec.y}:Q", | |
| title=None, | |
| axis=alt.Axis(grid=True, ticks=False, domain=False, labelPadding=6), | |
| ) | |
| enc: dict = {"x": x, "y": y, "tooltip": tooltip} | |
| if use_color: | |
| enc["color"] = alt.Color(spec.color) | |
| chart = alt.Chart(df).mark_line( | |
| color=_CHART_ACCENT, | |
| strokeWidth=2.5, | |
| point=alt.OverlayMarkDef(color=_CHART_ACCENT, size=55), | |
| ).encode(**enc) | |
| elif spec.chart_type == "scatter": | |
| x = alt.X(f"{spec.x}:N", sort="-y", title=None, axis=x_axis) | |
| y = alt.Y(f"{spec.y}:Q", title=None, axis=alt.Axis(ticks=False, domain=False)) | |
| enc = {"x": x, "y": y, "tooltip": tooltip} | |
| if use_color: | |
| enc["color"] = alt.Color(spec.color) | |
| chart = alt.Chart(df).mark_point( | |
| filled=True, color=_CHART_ACCENT, size=90 | |
| ).encode(**enc) | |
| else: # bar — minimal: no axes/grid, value labels on bars, the leader in rust | |
| # Cap to the top-N categories by value (the table still shows every row), so a | |
| # 20-category result is a clean ranking of leaders, not a forest of slivers. | |
| if len(df) > _CHART_MAX_BARS: | |
| order = pd.to_numeric(df[spec.y], errors="coerce") | |
| df = ( | |
| df.assign(_ab_order=order) | |
| .sort_values("_ab_order", ascending=False) | |
| .drop(columns="_ab_order") | |
| .head(_CHART_MAX_BARS) | |
| ) | |
| # Angled, abbreviated tick labels (first N chars + "…"); full name on hover. | |
| n = _CHART_LABEL_ABBR | |
| bar_x_axis = alt.Axis( | |
| labelAngle=-40, labelPadding=6, ticks=False, domain=False, | |
| labelExpr=( | |
| f"length(datum.label) > {n} ? slice(datum.label, 0, {n}) + '…' " | |
| ": datum.label" | |
| ), | |
| ) | |
| x = alt.X(f"{spec.x}:N", sort="-y", title=None, axis=bar_x_axis) | |
| y = alt.Y(f"{spec.y}:Q", title=None, axis=None) # values shown on the bars | |
| if use_color: | |
| color = alt.Color(spec.color) | |
| else: | |
| # Two-tone highlight: the max bar saturated, the rest muted (best practice). | |
| max_val = float(pd.to_numeric(df[spec.y], errors="coerce").max()) | |
| color = alt.condition( | |
| alt.datum[spec.y] >= max_val, | |
| alt.value(_CHART_ACCENT), | |
| alt.value(_CHART_ACCENT_MUTED), | |
| ) | |
| base = alt.Chart(df).encode(x=x, tooltip=tooltip) | |
| bars = base.mark_bar( | |
| cornerRadiusTopLeft=5, cornerRadiusTopRight=5 | |
| ).encode(y=y, color=color) | |
| # Value labels above each bar make the insight readable without the table — | |
| # but ONLY for a few bars; with many categories they overlap into mush (the | |
| # table carries the exact numbers then). | |
| if len(df) <= _CHART_MAX_BARS: | |
| labels = base.mark_text( | |
| dy=-7, fontSize=12, fontWeight="bold", color="#8a93a0" | |
| ).encode(y=y, text=alt.Text(f"{spec.y}:Q", format="~s")) | |
| chart = bars + labels | |
| else: | |
| chart = bars | |
| chart = ( | |
| # Compact + fixed: gradio's gr.Plot container does not propagate a resolved | |
| # width to vega (width="container" collapses to a sliver), so size it to fit | |
| # the answer card's chart column with room to spare. | |
| chart.properties(height=170, width=300) | |
| .configure(background="transparent", font="IBM Plex Sans, system-ui") | |
| .configure_axis( | |
| labelColor=_CHART_FG, | |
| titleColor=_CHART_FG, | |
| gridColor=_CHART_GRID, | |
| domainColor=_CHART_DOMAIN, | |
| labelFontSize=12, | |
| ) | |
| .configure_view(stroke=None) | |
| ) | |
| return chart | |
| def _chart_update( | |
| spec: ChartSpec | None, | |
| rows: list[tuple], | |
| columns: list[str], | |
| ) -> dict: | |
| """The single ``gr.update()`` for the one ``chart_plot`` ``gr.Plot``. | |
| Renders the spec to an interactive Altair chart via ``_render_chart``: a chart | |
| → ``gr.update(visible=True, value=chart)``; ``None`` (no spec / empty rows / | |
| non-numeric y) → ``gr.update(visible=False, value=None)`` (table only — no | |
| degenerate empty chart). | |
| """ | |
| chart = _render_chart(spec, rows, columns) | |
| if chart is None: | |
| return _hidden_chart() | |
| return gr.update(visible=True, value=chart) | |
| def _hidden_chart() -> dict: | |
| """The single hidden ``chart_plot`` update (pre-ANSWER / no-valid-intent).""" | |
| return gr.update(visible=False, value=None) | |
| def _apply_step( | |
| step: Step, | |
| history: list[gr.ChatMessage], | |
| table_value: dict, | |
| sql_value: str, | |
| last_sql: str, | |
| last_row_count: int | None, | |
| answered: bool, | |
| *, | |
| will_retry: bool, | |
| step_number: int | None = None, | |
| ) -> tuple[list[gr.ChatMessage], dict, str, str, int | None, bool]: | |
| """Fold ONE ``Step`` into the streaming UI state (pure on its inputs). | |
| Appends the step's chat message and, for a QUERY, pipes the SQL to the panel | |
| and (on success) refreshes the result table from the LAST successful QUERY — | |
| regardless of row count (C1/C2). ``will_retry`` chooses the error title (N3); | |
| ``step_number`` numbers the trace entry. Returns the updated ``(history, table, | |
| sql, last_sql, last_row_count, answered)`` tuple; never mutates its inputs. | |
| """ | |
| history = [ | |
| *history, | |
| _step_to_chat_message(step, step_number=step_number, will_retry=will_retry), | |
| ] | |
| if step.action_type == "QUERY": | |
| # P0: the query it wrote reaches the dedicated panel on success AND error. | |
| sql_value = step.argument | |
| last_sql = step.argument | |
| if step.status == "done": | |
| # C1/C2: update on EVERY successful QUERY, even a 0-row result. An | |
| # empty result renders real headers + zero data rows (honest). | |
| table_value = _table_payload(step.rows, step.columns) | |
| last_row_count = len(step.rows) | |
| if step.action_type == "ANSWER" and step.status == "done": | |
| answered = True | |
| return history, table_value, sql_value, last_sql, last_row_count, answered | |
| def _handle_ask( | |
| question: str, | |
| db_state: dict, | |
| chat_history: list, | |
| policy_factory: Callable[[], Policy], | |
| *, | |
| rollout: Callable[[], Iterable[Step]] | None = None, | |
| ) -> Iterator[tuple]: | |
| """Generator handler for 'Ask your data' / an example chip. | |
| Seeds a NON-GOLD episode (``env.begin_episode``), iterates | |
| ``run_agent_turn``, and after EVERY step yields the growing UI tuple | |
| ``(chat_history, table_value, sql_code_value, provenance_text)`` so the chat | |
| accordions stream and the table/SQL appear as soon as available. | |
| P0: pipes the QUERY argument to ``sql_code_value`` on success AND whenever a | |
| step has ``status="error"`` (the failed query is shown before the retry). | |
| Append-only (ADR 0006): ``chat_history`` only grows. The incoming history is | |
| normalized to a uniform ``list[gr.ChatMessage]`` (C3) so multi-turn "Ask | |
| another" never mixes dicts and ChatMessage objects. | |
| The result table tracks the LAST successful QUERY regardless of row count | |
| (C1/C2): a zero-row final QUERY renders real headers + an empty body, so the | |
| shown SQL and the table never disagree. | |
| F005: on the ANSWER step, ``parse_chart_intent`` strips an optional | |
| `````chart {…}````` block from the answer text (so the user sees clean | |
| prose) and ``validate_intent`` checks it against the LAST successful QUERY's | |
| columns; a valid intent renders an interactive Altair (vega-lite) chart into | |
| the single ``chart_plot`` ``gr.Plot`` fed that QUERY's DataFrame | |
| (``_render_chart``). Absent/invalid/non-numeric-y intent → the single plot | |
| hidden (table only — the F004 behaviour preserved). Rendering through ONE | |
| ``gr.Plot`` (not the 3 toggled vega ``gr.*Plot``) fixes the multi-turn crash | |
| where hiding a previously-visible vega plot on a type switch threw a | |
| ``ResizeObserver`` ``offsetWidth`` error and made the new chart disappear — a | |
| single component is never hidden-then-reshown mid-stream. Using Altair keeps | |
| per-point hover tooltips (the bug was the multiple toggled components, not vega | |
| itself), and a ``labelAngle`` + ``labelLimit`` keep many-category axes readable. | |
| Every yield is a 5-tuple | |
| ``(chatbot, result_table, sql_code, provenance, chart_plot)``. | |
| """ | |
| history = _coerce_history(chat_history) | |
| table_value = _empty_table() | |
| sql_value = "" | |
| last_sql = "" | |
| last_row_count: int | None = None | |
| # F005: the LAST successful QUERY's rows/columns feed the chart + validate the | |
| # intent's x/y against the real result columns. | |
| last_rows: list[tuple] = [] | |
| last_columns: list[str] = [] | |
| answered = False | |
| chart = _hidden_chart() | |
| step_no = 0 # 1-based step counter for the numbered trace titles ("1 ·", "2 ·"). | |
| if not question.strip(): | |
| yield (history, table_value, sql_value, _EMPTY_PROVENANCE, chart) | |
| return | |
| if rollout is not None: | |
| # Real-model path (F006): the whole agentic loop already ran inside ONE | |
| # window — a ``@spaces.GPU`` call on a ZeroGPU Space (one GPU allocation | |
| # per question; ZeroGPU has no cross-call KV cache). The env + ModelPolicy | |
| # are built INSIDE that window from primitives (db_dir/db_id/question) so no | |
| # live SQLite connection crosses the fork, and the resolved ``Step`` list is | |
| # returned. We stream those steps through the SAME UI mapping below — format | |
| # parity is guaranteed because the rollout drives ``ModelPolicy`` (which goes | |
| # through ``server/tooling.py``), never a second parser. | |
| steps_iter: Iterable[Step] = rollout() | |
| else: | |
| env = _build_environment(db_state["db_dir"]) | |
| env.begin_episode(db_state["db_id"], question) | |
| policy = policy_factory() | |
| # Thread the ACTIVE dataset to the demo-script policy so it only walks the | |
| # retail scripts when the retail demo DB is loaded — an example chip clicked | |
| # over an UPLOADED dataset must NOT fabricate the retail answer (it falls | |
| # through to the honest "Demo mode" placeholder). Other (real-model) policies | |
| # ignore this; the attribute set is a no-op for them. | |
| if isinstance(policy, _DemoScriptPolicy): | |
| policy.active_db_id = db_state["db_id"] | |
| steps_iter = run_agent_turn(question, env, policy, budget=_STEP_BUDGET) | |
| def _process(step: Step, *, will_retry: bool) -> dict: | |
| """Fold one step into the closed-over UI state; return the chart update. | |
| Keeps the ANSWER block-strip + chart render in ONE place (the only chart | |
| parse site) and mutates the enclosing ``_handle_ask`` locals so both the | |
| mid-loop and final-step call sites stay thin. | |
| """ | |
| nonlocal history, table_value, sql_value, last_sql, last_row_count | |
| nonlocal last_rows, last_columns, answered, step_no | |
| step_no += 1 | |
| # Track the last successful QUERY's rows/columns for the chart + validator. | |
| if step.action_type == "QUERY" and step.status == "done": | |
| last_rows = list(step.rows) | |
| last_columns = list(step.columns) | |
| # ANSWER: strip the chart block so the chat shows clean prose; validate the | |
| # intent against the last successful QUERY columns. | |
| step_chart = _hidden_chart() | |
| if step.action_type == "ANSWER" and step.status == "done": | |
| clean_text, intent = parse_chart_intent(step.argument) | |
| step = dataclasses.replace(step, argument=clean_text) | |
| spec = validate_intent(intent, last_columns) if intent is not None else None | |
| step_chart = _chart_update(spec, last_rows, last_columns) | |
| history, table_value, sql_value, last_sql, last_row_count, answered = ( | |
| _apply_step( | |
| step, | |
| history, | |
| table_value, | |
| sql_value, | |
| last_sql, | |
| last_row_count, | |
| answered, | |
| will_retry=will_retry, | |
| step_number=step_no, | |
| ) | |
| ) | |
| return step_chart | |
| # One-step lookahead: an error step only says "retrying" when another step | |
| # actually follows it in this turn (N3). We hold the current step until we | |
| # know whether the turn continues, then render it with the correct title. | |
| pending_step: Step | None = None | |
| for next_step in steps_iter: | |
| if pending_step is not None: | |
| chart = _process(pending_step, will_retry=True) | |
| yield ( | |
| history, | |
| table_value, | |
| sql_value, | |
| _provenance_line(sql_value, row_count=last_row_count), | |
| chart, | |
| ) | |
| pending_step = next_step | |
| if pending_step is not None: | |
| # The final step of the turn: no step follows, so no "retrying" suffix. | |
| chart = _process(pending_step, will_retry=False) | |
| yield ( | |
| history, | |
| table_value, | |
| sql_value, | |
| _provenance_line(sql_value, row_count=last_row_count), | |
| chart, | |
| ) | |
| if not answered: | |
| # Budget exhausted / no ANSWER — be honest, keep the last SQL it tried. | |
| # No ANSWER → no intent → the chart stays hidden. | |
| history = [ | |
| *history, | |
| gr.ChatMessage( | |
| role="assistant", | |
| content=( | |
| "I couldn't finish that one — here's the last query I tried " | |
| "(have a look, or rephrase the question)." | |
| ), | |
| metadata={"title": "Couldn't finish", "status": "done"}, | |
| ), | |
| ] | |
| yield ( | |
| history, | |
| table_value, | |
| last_sql, | |
| _provenance_line(last_sql, row_count=last_row_count), | |
| _hidden_chart(), | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # _handle_upload / _handle_accept_card — CSV upload -> editable assumptions card. | |
| # The card is DISPLAY-ONLY (ADR 0007): it is shown + confirmed, never injected | |
| # into get_system_prompt at F004. render_data_context is the deferred F006 seam. | |
| # --------------------------------------------------------------------------- | |
| def _render_card_markdown(card: DataCard, *, editable: bool = True) -> str: | |
| """Render a confirmed/proposed ``DataCard`` as a plain-language summary. | |
| DISPLAY-ONLY: this is the 'here's what I think your data is' orientation beat, | |
| NOT the prompt-injection path (``render_data_context`` stays deferred to F006). | |
| ``editable=False`` drops the "edit, then Accept" line for the read-only demo | |
| accordion (which has no Accept control); the upload flow keeps it (default). | |
| """ | |
| lines = [ | |
| f"**Here's what's in your data** ({card.row_count} rows in `{card.table}`):", | |
| ] | |
| if card.table_description: | |
| lines.append(f"- {card.table_description}") | |
| for column in card.columns: | |
| detail = column.description or column.profile.duckdb_type | |
| lines.append(f"- `{column.profile.safe_name}` — {detail}") | |
| lines.append("") | |
| if editable: | |
| lines.append("_These are best-guess assumptions you can edit, then Accept._") | |
| else: | |
| lines.append("_Best-guess column descriptions (read-only)._") | |
| return "\n".join(lines) | |
| # --------------------------------------------------------------------------- | |
| # Preloaded-dataset orientation card (FR3 — Primary Journey step 2). On app load | |
| # the demo dataset is named + oriented on screen BEFORE the user asks: its name, | |
| # a compact rows/columns profile, and the "here's what I think your data is" | |
| # assumptions. Reuses the F003 profiler (`propose_data_card`) and is | |
| # DISPLAY-ONLY — it is NOT injected into `get_system_prompt` (ADR 0007; the | |
| # trained policy never saw a data-context block — that wiring is the F006 seam). | |
| # --------------------------------------------------------------------------- | |
| # A friendly display name for the shipped demo dataset (the raw db_id is jargon). | |
| _DEMO_DATASET_NAME = "Maria's Pet Shop — orders & inventory (demo dataset)" | |
| def _build_demo_card() -> DataCard | None: | |
| """Profile the preloaded demo dataset into a display-only ``DataCard``. | |
| Runs the deterministic F003 profiler over the committed ``retail_smb`` DB's | |
| ``orders`` table so the app can orient the user ("here's your data") on load. | |
| Returns ``None`` (never raises) if the demo DB is missing, so a build can't | |
| fail on a packaging gap — the card region just stays empty in that case. | |
| """ | |
| db_state = _default_db_state() | |
| db_path = Path(db_state["db_path"]) | |
| if not db_path.exists(): | |
| return None | |
| try: | |
| return propose_data_card( | |
| db_path, | |
| db_id=db_state["db_id"], | |
| table=db_state["table"], | |
| ) | |
| except (ValueError, FileNotFoundError): | |
| return None | |
| def _render_demo_card_markdown(card: DataCard | None) -> str: | |
| """Render the loaded-dataset orientation card (name + profile + assumptions). | |
| DISPLAY-ONLY (FR3): names the demo dataset, shows its row count, and lists the | |
| per-column best-guess assumptions via ``_render_card_markdown``. Falls back to | |
| a quiet line if the demo card couldn't be built (missing DB). | |
| """ | |
| if card is None: | |
| return f"**Loaded dataset:** {_DEMO_DATASET_NAME}." | |
| header = ( | |
| f"**Loaded dataset:** {_DEMO_DATASET_NAME} — " | |
| f"{card.row_count} rows in `{card.table}`, " | |
| f"{len(card.columns)} columns." | |
| ) | |
| return header + "\n\n" + _render_card_markdown(card, editable=False) | |
| def _handle_upload(file_obj, db_state: dict) -> tuple: | |
| """Upload handler: ``ingest_csv(...)`` -> ``propose_data_card(...)``. | |
| Returns ``(new_db_state, proposed_card_or_None, status_text)``. The new | |
| db_state points the next ask at the uploaded DB. Catches | |
| ``ValueError``/``FileNotFoundError`` from ``ingest_csv`` and returns a plain | |
| 'couldn't read that file' message with the db_state UNCHANGED (never crashes). | |
| """ | |
| source = _file_source(file_obj) | |
| if source is None: | |
| return db_state, None, "Couldn't read that file — is it a CSV?" | |
| try: | |
| result = ingest_csv(source, root=str(_UPLOAD_ROOT), if_exists="replace") | |
| card = propose_data_card( | |
| result.db_path, | |
| db_id=result.db_id, | |
| table=result.table, | |
| column_mapping=result.column_mapping, | |
| ) | |
| except (ValueError, FileNotFoundError): | |
| return db_state, None, "Couldn't read that file — is it a CSV?" | |
| new_db_state = { | |
| "db_id": result.db_id, | |
| "db_path": str(result.db_path), | |
| "db_dir": str(_UPLOAD_ROOT), | |
| "table": result.table, | |
| } | |
| status = f"Loaded **{result.row_count} rows** from your file. Ask away below." | |
| return new_db_state, card, status | |
| def _file_source(file_obj) -> str | bytes | None: | |
| """Normalize a gr.File value (path str, a temp-file object, or None).""" | |
| if file_obj is None: | |
| return None | |
| if isinstance(file_obj, str): | |
| return file_obj | |
| name = getattr(file_obj, "name", None) | |
| return name if isinstance(name, str) else None | |
| def _handle_accept_card(card_state: DataCard | None, table_description: str) -> tuple: | |
| """Accept/edit handler: ``apply_card_edits`` -> confirmed ``DataCard``. | |
| Merges the user's one-line description over the proposed card (user wins; the | |
| deterministic profile is never overwritten) and returns | |
| ``(confirmed_card, status_text)``. DISPLAY-ONLY: the confirmed card is stored | |
| for the UI, NOT injected into ``get_system_prompt`` (ADR 0007 — the trained | |
| policy never saw a data-context block; that wiring is deferred to F006 via | |
| ``render_data_context``). | |
| """ | |
| if card_state is None: | |
| return None, "Upload a CSV first to see and confirm its summary." | |
| edits: dict = {} | |
| if table_description.strip(): | |
| edits["table_description"] = table_description.strip() | |
| confirmed = apply_card_edits(card_state, edits) | |
| return confirmed, "Saved — your summary is confirmed (it's advisory only)." | |
| # --------------------------------------------------------------------------- | |
| # build_demo — construct (do NOT launch) the single-column Blocks demo. | |
| # --------------------------------------------------------------------------- | |
| # ZeroGPU: the whole agentic loop runs in ONE @spaces.GPU window per question | |
| # (ZeroGPU spawns a fresh GPU process per call with NO cross-call KV cache, so we | |
| # never split the 4-6 tool-call turns across calls). 120s is generous headroom for | |
| # the multi-turn loop incl. first-call model materialization; the GPU time actually | |
| # billed is real usage, not the reservation. | |
| _GPU_DURATION_S = 120 | |
| def _run_rollout_gpu( | |
| question: str, db_dir: str, db_id: str, model_key: str | |
| ) -> Iterator[Step]: | |
| """Stream one full real-model agentic rollout, yielding a ``Step`` per tool call. | |
| A GENERATOR so the UI shows steps LIVE as the model produces them. When wrapped | |
| by ``@spaces.GPU`` the ZeroGPU allocation is held for the whole generator — one | |
| GPU window for the 4-6 turns (ZeroGPU has no cross-call KV cache), but each step | |
| streams back as it resolves instead of all at once at the end. | |
| Built from PRIMITIVES (no live objects) so it is safe inside the fork: it builds | |
| the env + a cache-backed ``ModelPolicy`` here and drives ``run_agent_turn`` (the | |
| SAME loop the demo + eval use). ``serving.get_policy`` reuses ``ModelPolicy`` -> | |
| ``server/tooling.py`` so the tool-call format matches training exactly (the | |
| "never write a second parser" invariant). | |
| """ | |
| try: | |
| from .serving import get_policy | |
| except ImportError: # pragma: no cover - flat-layout fallback | |
| from server.serving import get_policy # type: ignore[no-redef] | |
| env = _build_environment(db_dir) | |
| env.begin_episode(db_id, question) | |
| policy = get_policy(model_key) | |
| yield from run_agent_turn(question, env, policy, budget=_STEP_BUDGET) | |
| def build_demo( | |
| policy_factory: Callable[[], Policy] | None = None, | |
| *, | |
| enable_model_selector: bool = False, | |
| ) -> gr.Blocks: | |
| """Construct (do NOT launch) the single-column Blocks demo. | |
| Injects the policy via ``policy_factory`` (called once per ask). Defaults to a | |
| deterministic demo-script policy (NO torch import) so the app builds and tests | |
| run headless with no model. Returns a ``gr.Blocks`` instance; the caller | |
| launches it via ``_launch_demo`` so the stashed theme + CSS reach ``.launch()`` | |
| (gradio 6.x). | |
| ``enable_model_selector`` (F006): when True, render a model dropdown and route a | |
| real-model ask through ``serving.get_policy`` inside one ``@spaces.GPU`` window | |
| (on a ZeroGPU Space; a no-op decorator off-Space). The default (False) is | |
| byte-identical to the no-model demo so the existing headless tests are unchanged. | |
| """ | |
| factory = policy_factory or _default_policy_factory | |
| # Real-model serving is opt-in (the Space passes enable_model_selector=True). | |
| # Import the serving registry + wrap the rollout with ZeroGPU LAZILY here so the | |
| # default headless build stays torch/spaces-free (the dep-light invariant). | |
| serving = None | |
| gpu_rollout = _run_rollout_gpu | |
| if enable_model_selector: | |
| try: | |
| from . import serving as serving # type: ignore[no-redef] | |
| except ImportError: # pragma: no cover - flat-layout fallback | |
| from server import serving as serving # type: ignore[no-redef] | |
| try: | |
| import spaces # noqa: PLC0415 | |
| gpu_rollout = spaces.GPU(duration=_GPU_DURATION_S)(_run_rollout_gpu) | |
| except ImportError: | |
| # Not on a ZeroGPU Space (e.g. local dev): run the rollout undecorated. | |
| gpu_rollout = _run_rollout_gpu | |
| # FR1 (WCAG AA): override the primary-button accent to a darker orange so the | |
| # white CTA label clears 4.5:1 (the default orange-500 was 2.80:1). #C2410C = | |
| # 5.18:1, hover #9A3412 = 7.31:1 (white-on-accent). Single semantic accent. | |
| # NOTE: gradio 6.x deprecated `theme`/`css` on the Blocks constructor in favour | |
| # of `.launch()` — BUT the constructor still ACCEPTS them via a back-compat shim | |
| # that stows them in `self._deprecated_{css,theme}`, and `.launch()` falls back | |
| # to exactly those when its own `css`/`theme` args are None. So passing them to | |
| # the constructor (below) makes the theme + CSS apply no matter HOW the app is | |
| # launched — `_launch_demo`, `gradio app.py` hot-reload, a deploy target, or HF | |
| # Spaces' own auto-`.launch()` — instead of only when our launch site threads | |
| # them. (A launch-path that bypassed `_launch_demo` was shipping an unstyled, | |
| # un-themed page.) `_launch_demo` still passes them too; identical value, no-op. | |
| theme = gr.themes.Soft( | |
| primary_hue="orange", | |
| neutral_hue="slate", | |
| spacing_size="sm", | |
| radius_size="sm", | |
| # IBM Plex Sans/Mono — the design handover's typeface (warmer + more | |
| # characterful than system-ui), loaded via GoogleFont with a system-ui | |
| # fallback so a blocked CDN degrades gracefully rather than breaking paint. | |
| font=[gr.themes.GoogleFont("IBM Plex Sans"), "system-ui", "sans-serif"], | |
| font_mono=[gr.themes.GoogleFont("IBM Plex Mono"), "monospace"], | |
| ).set( | |
| # Tighten the bare-Soft "prototype" look: flat blocks, lighter titles. | |
| block_shadow="none", | |
| block_title_text_weight="500", | |
| # Single semantic accent; white label clears AA on #C2410C (5.18:1). | |
| button_primary_background_fill=_ACCENT, | |
| button_primary_background_fill_hover=_ACCENT_HOVER, | |
| button_primary_text_color="white", | |
| button_primary_text_color_hover="white", | |
| ) | |
| with gr.Blocks( | |
| title="analyst-buddy", fill_width=False, theme=theme, css=_APP_CSS | |
| ) as demo: | |
| # Per-session DB pointer: which data the next ask runs against. | |
| db_state = gr.State(_default_db_state()) | |
| # Confirmed advisory data card (display-only; never injected into the prompt). | |
| card_state: gr.State = gr.State(None) | |
| # Snapshot of the answer currently on screen — {question, dataset, answer} | |
| # captured at ask time so feedback is bound to the answer being voted on, | |
| # NOT the live (possibly chip-edited) question textbox (staff review M2/M3). | |
| asked_state: gr.State = gr.State({}) | |
| # F006: which model the next ask runs against. Hosted default = the | |
| # FINE-TUNED Qwen3-1.7B (published to the Hub 2026-06-14), so judges see the | |
| # actual product, not the 4%-correct off-the-shelf model. Demo + vanilla | |
| # stay selectable. Only created when the selector is enabled. | |
| model_state: gr.State | None = ( | |
| gr.State("sqlenv-1.7b-grpo-v2") if enable_model_selector else None | |
| ) | |
| _default_persona = personas.default_persona_key() | |
| _db_root = str(_DB_ROOT) | |
| # --- Top "board": hero + ask + examples + personas + data overview wrapped | |
| # in ONE bordered card, so the whole "value prop → ask → samples" flow reads | |
| # as a single unit (user request). ---------------------------------------- | |
| with gr.Group(elem_classes="ab-top-card"): | |
| gr.HTML(_HERO_HTML_HOSTED if enable_model_selector else _HERO_HTML_LOCAL) | |
| # Ask box (chat-composer style: a roomy input + a simple send button). | |
| # No autofocus: it scrolled the hero off-screen on load (01-ux-polish S1). | |
| with gr.Row(elem_classes="ab-ask-row"): | |
| question = gr.Textbox( | |
| show_label=False, | |
| container=False, | |
| placeholder="e.g. Which store has the most orders?", | |
| lines=2, | |
| max_lines=6, | |
| scale=6, | |
| ) | |
| ask_btn = gr.Button("Ask →", variant="primary", scale=1) | |
| # Animated working indicator (CSS spinner) — hidden until an ask runs. | |
| status = gr.HTML("", visible=False) | |
| # Per-persona example questions — relabelled on persona change; each fills | |
| # the question box. No heading (it's obvious these are examples). | |
| with gr.Row(elem_classes="ab-starter-row"): | |
| starter_btns = [ | |
| gr.Button(text, variant="secondary", size="sm") | |
| for text in personas.starters_for(_default_persona) | |
| ] | |
| # Persona picker (samples to try; CSV "soon" as a dashed chip). | |
| gr.Markdown( | |
| "**Try it on a sample business — it works on multiple databases:**", | |
| elem_classes="ab-center", | |
| ) | |
| with gr.Row(elem_classes="ab-persona-row"): | |
| persona_btns = { | |
| persona.key: gr.Button( | |
| persona.label, variant="secondary", size="sm" | |
| ) | |
| for persona in personas.PERSONAS.values() | |
| } | |
| gr.Button( | |
| "Your data (CSV) · soon", | |
| variant="secondary", | |
| size="sm", | |
| interactive=False, | |
| elem_classes="ab-csv-soon", | |
| ) | |
| persona_desc = gr.Markdown( | |
| f"_{personas.get_persona(_default_persona).blurb}_", | |
| elem_classes="ab-center", | |
| ) | |
| with gr.Accordion("What's in this data", open=False): | |
| demo_dataset_view = gr.HTML( | |
| personas.persona_card_html(_default_persona, _db_root) | |
| ) | |
| # Upload is "coming soon" (cued by the dashed CSV chip above). The real | |
| # upload + data-card components live here, hidden, until it's ready — kept in | |
| # the tree so the wiring + upload-flow tests still resolve. | |
| with gr.Group(visible=False): | |
| upload = gr.File( | |
| label="Upload your CSV (coming soon)", | |
| file_types=[".csv"], | |
| file_count="single", | |
| interactive=False, | |
| ) | |
| upload_status = gr.Markdown("") | |
| data_card_view = gr.Markdown("") | |
| card_edit = gr.Textbox( | |
| label="What's in this data? (optional one-line description)", | |
| placeholder="e.g. weekly sales rows, one per store", | |
| visible=False, | |
| ) | |
| accept_card_btn = gr.Button( | |
| "Accept this summary", variant="secondary", visible=False | |
| ) | |
| # The model picker lives in the quiet "settings" footer at the very bottom | |
| # (built after the answer zone, next to "Why a tiny model is enough"). | |
| model_dropdown = None | |
| # Answer zone: ONE block, revealed at ask START (see _set_busy). The live | |
| # step-trace streams FIRST (above), then the Result appears below it — the | |
| # layout the user preferred. _set_busy also clears the previous trace/result | |
| # the moment Ask is clicked, so stale content never lingers. | |
| # - trace_acc (the numbered steps + the exact SQL): streams above; stays | |
| # open so the steps are visible (no auto-collapse). | |
| # - result_group (the Result table + chart + provenance): hidden until the | |
| # answer completes, so no empty table flickers during streaming. | |
| # - feedback_group: revealed only once an answer exists. | |
| with gr.Group(visible=False) as answer_group: | |
| # "Show its work": the live numbered trace (layout="panel" — a tool-call | |
| # trace, not a chat) + the exact SQL it wrote. Streams above the answer. | |
| with gr.Accordion( | |
| "🔎 How it figured this out — its steps + the exact SQL", open=True | |
| ) as trace_acc: | |
| chatbot = gr.Chatbot( | |
| label="How it figured this out", | |
| show_label=False, | |
| height=300, | |
| layout="panel", | |
| elem_classes="ab-trace", | |
| ) | |
| sql_code = gr.Code( | |
| label="The exact query it wrote", | |
| language="sql", | |
| elem_classes="ab-sql", | |
| ) | |
| with gr.Group( | |
| visible=False, elem_classes="ab-answer-card" | |
| ) as result_group: | |
| # Chart + table side by side (v3 answer card). ONE gr.Plot fed an | |
| # interactive Altair (vega-lite) chart, hidden until a valid chart- | |
| # intent renders (FR4) — a single component avoids the multi-turn | |
| # ResizeObserver crash while keeping hover tooltips. | |
| with gr.Row(): | |
| chart_plot = gr.Plot(show_label=False, visible=False) | |
| result_table = gr.Dataframe( | |
| label="Result", interactive=False, wrap=True | |
| ) | |
| provenance = gr.Markdown(_EMPTY_PROVENANCE, elem_classes="ab-prov") | |
| # --- Feedback flywheel: the verdict + correction sharpens the AGENT'S SQL | |
| # skill (a labelled example) — we never train on the user's data (v3 lock). | |
| with gr.Group( | |
| visible=False, elem_classes="ab-feedback-bar" | |
| ) as feedback_group: | |
| with gr.Row(elem_classes="ab-feedback-row"): | |
| gr.Markdown("**Was this right?**", elem_classes="ab-fb-label") | |
| fb_up = gr.Button("👍 Yes", variant="secondary", scale=0) | |
| fb_down = gr.Button("👎 No", variant="secondary", scale=0) | |
| # The reassurance, said ONCE — pushed to the right of the row. | |
| gr.Markdown( | |
| "_👍 / 👎 sharpens the agent locally · never trains on " | |
| "your data._", | |
| elem_classes="ab-fb-note", | |
| ) | |
| # The prompt lives in the placeholder (a long `label=` rendered as a | |
| # loud orange block); the reassurance is the note on the right above. | |
| fb_correction = gr.Textbox( | |
| show_label=False, | |
| container=False, | |
| lines=1, | |
| placeholder=( | |
| "What should it be? e.g. it should join orders to stores " | |
| "on store_id" | |
| ), | |
| visible=False, | |
| ) | |
| fb_submit = gr.Button( | |
| "Submit correction", variant="secondary", visible=False | |
| ) | |
| fb_status = gr.Markdown("") | |
| # --- Footer "settings": roadmap + model picker + the GRPO explainer ---- | |
| # Quiet, secondary controls grouped at the very bottom (the Tone-Bridge | |
| # bottom-"Settings" pattern): the rarely-used model picker and the judge- | |
| # facing "Why a tiny model is enough" live here, out of the main flow. | |
| gr.Markdown(_ROADMAP_MD, elem_classes="ab-center ab-footer-note") | |
| if enable_model_selector: | |
| with gr.Accordion("Advanced — choose the model", open=False): | |
| model_dropdown = gr.Dropdown( | |
| choices=serving.dropdown_choices(), | |
| value="sqlenv-1.7b-grpo-v2", | |
| label="Model", | |
| info=( | |
| "Defaults to our fine-tuned Qwen3-1.7B (the real product). " | |
| "Switch to the vanilla model to see the difference, or 'Demo' " | |
| "for an instant offline walkthrough." | |
| ), | |
| ) | |
| model_dropdown.change( | |
| lambda k: k, inputs=[model_dropdown], outputs=[model_state] | |
| ) | |
| _showcase_examples = _load_showcase() | |
| run_live_btns: list[tuple] = [] | |
| with gr.Accordion( | |
| "✨ What we're building: off-the-shelf 4% vs our fine-tune 49%", | |
| open=False, | |
| elem_id="why-tiny", | |
| ): | |
| # ONE dark card: build copy + (when present) the head-to-head nested | |
| # inside it, so the showcase's dark-tuned colours read in both themes. | |
| gr.HTML(_why_card_html(_showcase_examples)) | |
| if _showcase_examples: | |
| if enable_model_selector: | |
| gr.Markdown( | |
| "**Try it yourself — runs on the model picked in " | |
| "'Advanced' above. Switch it to compare:**", | |
| elem_classes="ab-center", | |
| ) | |
| with gr.Row(elem_classes="ab-starter-row"): | |
| for _ex in _showcase_examples[:3]: | |
| _rb = gr.Button( | |
| f"{_ex['question'][:38]}…", | |
| size="sm", | |
| variant="secondary", | |
| ) | |
| run_live_btns.append((_rb, _ex["question"])) | |
| # --- Ask wiring (Step 2.2) ----------------------------------------- | |
| def _ask( | |
| question_text: str, | |
| current_db: dict, | |
| history: list, | |
| model_key: str = "demo", | |
| ): | |
| # Demo key (or selector off) -> the existing deterministic demo path. | |
| # A real, available model -> drive ModelPolicy inside one GPU window | |
| # (serving.get_policy reuses tooling.py -> training-format parity). | |
| if ( | |
| enable_model_selector | |
| and model_key | |
| and model_key != serving.default_model_key() | |
| ): | |
| spec = serving.get_spec(model_key) | |
| if spec is not None and spec.available: | |
| current = current_db or {} | |
| db_dir = current.get("db_dir", "") | |
| db_id = current.get("db_id", "") | |
| def _rollout(): | |
| return gpu_rollout(question_text, db_dir, db_id, model_key) | |
| yield from _handle_ask( | |
| question_text, current_db, history, factory, rollout=_rollout | |
| ) | |
| return | |
| # Selected a not-yet-published model -> honest demo fallback. | |
| yield from _handle_ask(question_text, current_db, history, factory) | |
| ask_outputs = [ | |
| chatbot, | |
| result_table, | |
| sql_code, | |
| provenance, | |
| chart_plot, | |
| ] | |
| def _capture_question(question_text: str, current_db: dict) -> dict: | |
| """Snapshot the asked question + dataset BEFORE streaming, so feedback | |
| binds to this answer even if the textbox is later chip-edited (M2).""" | |
| return { | |
| "question": question_text or "", | |
| "dataset": (current_db or {}).get("db_id", "?"), | |
| "answer": "", | |
| } | |
| def _finish_ask(history: list, asked: dict): | |
| """After streaming: reveal the result + feedback and collapse the live | |
| trace into its expander (m3 clears any stale feedback status). The answer | |
| text is still captured into the snapshot for the feedback record, but it | |
| is NOT shown as a prose line — the Result table is the answer surface.""" | |
| answer = _last_answer_text(history) | |
| snapshot = {**(asked or {}), "answer": answer} | |
| return ( | |
| snapshot, | |
| gr.update(visible=True), # result_group (table + chart) appears below | |
| gr.update(visible=True), # feedback_group | |
| "", # fb_status | |
| ) | |
| # Make "working" unmissable: disable + relabel the button AND show an animated | |
| # CSS spinner. Reveal the answer block + open the trace so the live steps stream | |
| # ABOVE; keep result + feedback hidden until _finish_ask. CRUCIAL: clear the | |
| # PREVIOUS trace/result NOW so the old answer vanishes the moment Ask is clicked | |
| # (rather than lingering until the next answer completes). | |
| def _set_busy(): | |
| return ( | |
| gr.update(value="Working…", interactive=False), | |
| gr.update(value=_SPINNER_HTML, visible=True), | |
| gr.update(visible=True), # answer_group | |
| gr.update(visible=False), # result_group (wait for the answer) | |
| gr.update(open=True), # trace_acc (open while streaming) | |
| gr.update(visible=False), # feedback_group (wait for the answer) | |
| [], # chatbot — clear the old steps immediately | |
| _empty_table(), # result_table — clear the old rows | |
| "", # sql_code — clear the old query | |
| gr.update(value=_EMPTY_PROVENANCE), # provenance reset | |
| gr.update(visible=False), # chart_plot — hide the old chart | |
| ) | |
| def _set_idle(): | |
| return ( | |
| gr.update(value="Ask →", interactive=True), | |
| gr.update(value="", visible=False), | |
| ) | |
| # When the selector is on, the chosen model travels into _ask via model_state | |
| # (appended last so _ask's model_key default holds when the selector is off). | |
| ask_inputs = [question, db_state, chatbot] | |
| if enable_model_selector: | |
| ask_inputs = [question, db_state, chatbot, model_state] | |
| # busy (reveal block + open trace) → capture → stream → finish (answer leads, | |
| # reveal results + feedback, collapse trace) → idle | |
| _busy_outputs = [ | |
| ask_btn, | |
| status, | |
| answer_group, | |
| result_group, | |
| trace_acc, | |
| feedback_group, | |
| chatbot, | |
| result_table, | |
| sql_code, | |
| provenance, | |
| chart_plot, | |
| ] | |
| _finish_outputs = [ | |
| asked_state, | |
| result_group, | |
| feedback_group, | |
| fb_status, | |
| ] | |
| ask_btn.click(_set_busy, outputs=_busy_outputs).then( | |
| _capture_question, inputs=[question, db_state], outputs=[asked_state] | |
| ).then(_ask, inputs=ask_inputs, outputs=ask_outputs).then( | |
| _finish_ask, | |
| inputs=[chatbot, asked_state], | |
| outputs=_finish_outputs, | |
| ).then(_set_idle, outputs=[ask_btn, status]) | |
| question.submit(_set_busy, outputs=_busy_outputs).then( | |
| _capture_question, inputs=[question, db_state], outputs=[asked_state] | |
| ).then(_ask, inputs=ask_inputs, outputs=ask_outputs).then( | |
| _finish_ask, | |
| inputs=[chatbot, asked_state], | |
| outputs=_finish_outputs, | |
| ).then(_set_idle, outputs=[ask_btn, status]) | |
| # Showcase "run this live": load the question, reset to the shop persona | |
| # (so db_state points at retail_smb), force the fine-tuned model, then run | |
| # the normal ask chain — so the judge watches our model answer it for real. | |
| if enable_model_selector and run_live_btns: | |
| def _run_live(q: str): | |
| # Fill the question + switch to the retail demo dataset, but DO NOT | |
| # touch model_state — the ask runs on whatever the "Advanced" dropdown | |
| # has selected, so you can flip vanilla ↔ fine-tuned and compare on the | |
| # same showcase questions. | |
| new_state = personas.db_state_for("shop", _db_root) | |
| card = personas.persona_card_html("shop", _db_root) | |
| desc = f"_{personas.get_persona('shop').blurb}_" | |
| labels = personas.starters_for("shop") | |
| return ( | |
| q, | |
| new_state, | |
| card, | |
| desc, | |
| *[gr.update(value=t, visible=bool(t)) for t in labels], | |
| ) | |
| _rl_outputs = [ | |
| question, | |
| db_state, | |
| demo_dataset_view, | |
| persona_desc, | |
| *starter_btns, | |
| ] | |
| _cap_in = [question, db_state] | |
| for _rb, _q in run_live_btns: | |
| _rb.click(lambda q=_q: _run_live(q), outputs=_rl_outputs).then( | |
| _set_busy, outputs=_busy_outputs | |
| ).then( | |
| _capture_question, inputs=_cap_in, outputs=[asked_state] | |
| ).then(_ask, inputs=ask_inputs, outputs=ask_outputs).then( | |
| _finish_ask, | |
| inputs=[chatbot, asked_state], | |
| outputs=_finish_outputs, | |
| ).then(_set_idle, outputs=[ask_btn, status]) | |
| # Persona switching: rebind the dataset, refresh the overview card, the | |
| # one-line description, and the starter chips. One handler per persona card. | |
| def _on_persona(persona_key: str): | |
| new_state = personas.db_state_for(persona_key, _db_root) | |
| card = personas.persona_card_html(persona_key, _db_root) | |
| desc = f"_{personas.get_persona(persona_key).blurb}_" | |
| labels = personas.starters_for(persona_key) | |
| return ( | |
| new_state, | |
| card, | |
| desc, | |
| *[gr.update(value=text, visible=bool(text)) for text in labels], | |
| ) | |
| for _pkey, _pbtn in persona_btns.items(): | |
| _pbtn.click( | |
| lambda key=_pkey: _on_persona(key), | |
| outputs=[db_state, demo_dataset_view, persona_desc, *starter_btns], | |
| ) | |
| # Each starter button fills the question box with its own current label. | |
| for _starter in starter_btns: | |
| _starter.click(lambda text: text, inputs=[_starter], outputs=[question]) | |
| # --- Feedback wiring (the flywheel) -------------------------------- | |
| def _fb_line() -> str: | |
| n = feedback_summary().get("training_candidates", 0) | |
| if not n: | |
| return "" | |
| return f" _{n} correction(s) collected so far — each sharpens the agent._" | |
| def _on_thumbs_up(asked: dict, sql: str): | |
| asked = asked or {} | |
| record_feedback( | |
| question=asked.get("question", ""), | |
| dataset=asked.get("dataset", "?"), | |
| shown_sql=sql or "", | |
| result=asked.get("answer", ""), | |
| verdict="up", | |
| ) | |
| return ( | |
| f"✓ Thanks — logged.{_fb_line()}", | |
| gr.update(visible=False), | |
| gr.update(visible=False), | |
| ) | |
| def _on_thumbs_down(): | |
| return ( | |
| "Tell me what it should be — it sharpens the agent (never your data).", | |
| gr.update(visible=True), | |
| gr.update(visible=True), | |
| ) | |
| def _on_correction(asked: dict, sql: str, correction: str): | |
| asked = asked or {} | |
| record_feedback( | |
| question=asked.get("question", ""), | |
| dataset=asked.get("dataset", "?"), | |
| shown_sql=sql or "", | |
| result=asked.get("answer", ""), | |
| verdict="down", | |
| correction=correction or "", | |
| ) | |
| return ( | |
| f"✓ Captured — this sharpens the agent.{_fb_line()}", | |
| gr.update(visible=False, value=""), | |
| gr.update(visible=False), | |
| ) | |
| fb_up.click( | |
| _on_thumbs_up, | |
| inputs=[asked_state, sql_code], | |
| outputs=[fb_status, fb_correction, fb_submit], | |
| ) | |
| fb_down.click( | |
| _on_thumbs_down, | |
| outputs=[fb_status, fb_correction, fb_submit], | |
| ) | |
| fb_submit.click( | |
| _on_correction, | |
| inputs=[asked_state, sql_code, fb_correction], | |
| outputs=[fb_status, fb_correction, fb_submit], | |
| ) | |
| # --- Upload + data-card wiring (Steps 3.1 / 3.2) ------------------- | |
| def _on_upload(file_obj, current_db: dict): | |
| new_db, card, status = _handle_upload(file_obj, current_db) | |
| card_markdown = _render_card_markdown(card) if card is not None else "" | |
| return new_db, card, status, card_markdown | |
| upload.upload( | |
| _on_upload, | |
| inputs=[upload, db_state], | |
| outputs=[db_state, card_state, upload_status, data_card_view], | |
| ) | |
| def _on_accept(card_obj, description: str): | |
| confirmed, status = _handle_accept_card(card_obj, description) | |
| card_markdown = ( | |
| _render_card_markdown(confirmed) if confirmed is not None else "" | |
| ) | |
| return confirmed, status, card_markdown | |
| accept_card_btn.click( | |
| _on_accept, | |
| inputs=[card_state, card_edit], | |
| outputs=[card_state, upload_status, data_card_view], | |
| ) | |
| # Keep references so introspection/tests can find every component. | |
| demo.analyst_components = { # type: ignore[attr-defined] | |
| "db_state": db_state, | |
| "card_state": card_state, | |
| "demo_dataset_view": demo_dataset_view, | |
| "upload": upload, | |
| "upload_status": upload_status, | |
| "data_card_view": data_card_view, | |
| "card_edit": card_edit, | |
| "accept_card_btn": accept_card_btn, | |
| "persona_btns": persona_btns, | |
| "persona_desc": persona_desc, | |
| "starter_btns": starter_btns, | |
| "status": status, | |
| "question": question, | |
| "ask_btn": ask_btn, | |
| "chatbot": chatbot, | |
| "result_table": result_table, | |
| "provenance": provenance, | |
| "sql_code": sql_code, | |
| "chart_plot": chart_plot, | |
| "fb_up": fb_up, | |
| "fb_down": fb_down, | |
| "fb_correction": fb_correction, | |
| "fb_submit": fb_submit, | |
| "fb_status": fb_status, | |
| "answer_group": answer_group, | |
| "result_group": result_group, | |
| "trace_acc": trace_acc, | |
| "feedback_group": feedback_group, | |
| "asked_state": asked_state, | |
| } | |
| if model_dropdown is not None: | |
| demo.analyst_components["model_dropdown"] = model_dropdown | |
| demo.analyst_policy_factory = factory # type: ignore[attr-defined] | |
| # FR1: theme/css travel to .launch() (gradio 6.x ignores them on Blocks). | |
| demo.analyst_theme = theme # type: ignore[attr-defined] | |
| demo.analyst_css = _APP_CSS # type: ignore[attr-defined] | |
| return demo | |
| def _launch_demo(demo: gr.Blocks, **launch_kwargs) -> None: | |
| """Launch ``demo`` applying the stashed theme + CSS (FR1, gradio 6.x). | |
| gradio 6.x ignores ``theme``/``css`` on the ``Blocks`` constructor and reads | |
| them at ``.launch()`` time, so the single launch path threads the theme/css | |
| that ``build_demo`` attached. Any extra ``launch_kwargs`` (e.g. ``server_port``) | |
| pass through unchanged. | |
| """ | |
| demo.launch( | |
| theme=getattr(demo, "analyst_theme", None), | |
| css=getattr(demo, "analyst_css", None), | |
| **launch_kwargs, | |
| ) | |
| def _default_db_state() -> dict: | |
| """The preloaded demo dataset pointer used until the user uploads a CSV.""" | |
| return { | |
| "db_id": _DEMO_DB_ID, | |
| "db_path": str(_DB_ROOT / _DEMO_DB_ID / f"{_DEMO_DB_ID}.sqlite"), | |
| "db_dir": str(_DB_ROOT), | |
| "table": "orders", | |
| } | |
| def _build_environment(db_dir: str) -> SQLEnvironment: | |
| """Build a SQLEnvironment rooted at ``db_dir`` for non-gold (user) episodes. | |
| ``SQLEnvironment.__init__`` requires a questions file (the gold path), but the | |
| demo only ever uses ``begin_episode`` + ``step`` — never ``reset()``. We point | |
| it at the repo questions file when present, else the COMMITTED | |
| ``data/demo_questions.json`` (Q2: no per-ask write into the user-upload dir), | |
| so the constructor's gold-path validation passes without affecting the | |
| non-gold loop. | |
| """ | |
| questions_path = _QUESTIONS_PATH | |
| if not questions_path.exists(): | |
| questions_path = _DEMO_QUESTIONS_PATH | |
| from .mock_tokenizer import MockTokenizer # local import: dep-light path | |
| return SQLEnvironment( | |
| questions_path=str(questions_path), | |
| db_dir=db_dir, | |
| tokenizer=MockTokenizer(), | |
| step_budget=_STEP_BUDGET, | |
| ) | |
| if __name__ == "__main__": | |
| _launch_demo(build_demo()) | |