""" Single-page Gradio UI for the Hugging Face Space (same process as the OpenEnv FastAPI API). Playground uses POST /reset and POST /step via loopback HTTP with X-Session-Id. """ from __future__ import annotations import json import os import uuid from pathlib import Path from typing import Any, Optional, Tuple import httpx COLAB_FIRST_TRAINING = ( "https://colab.research.google.com/drive/1H6SLfCBhHzRJtnymLgevjfyytWUximF5" "#scrollTo=j-9MptXvmPk8" ) COLAB_TRAINING_ROOT = ( "https://colab.research.google.com/drive/1H6SLfCBhHzRJtnymLgevjfyytWUximF5" "#scrollTo=x5YuvatGyyu_" ) HF_SPACE = "https://huggingface.co/spaces/md896/sql-debug-env" HF_SAMPLE_REWARDS = ( "https://huggingface.co/spaces/md896/sql-debug-env/tree/main/" "artifacts/runs/20260426-064318-sample-rewards-32eval" ) HF_EVAL_32 = ( "https://huggingface.co/spaces/md896/sql-debug-env/tree/main/" "artifacts/runs/20260426-060502-final-pass-32eval" ) HF_MODEL = "https://huggingface.co/md896/sql-debug-agent-qwen25-05b-grpo-wandb-continue-v2" GITHUB_REPO = "https://github.com/mdayan8/sql-debug-env.git" WANDB_TRAINING_RUN = "https://wandb.ai/mdayanbag-pesitm/sql-debug-grpo-best-budget/workspace?nw=nwusermdayanbag" GCLOUD_TEXT2SQL_BLOG = "https://cloud.google.com/blog/products/databases/techniques-for-improving-text-to-sql" OURBENCH_PAPER = "https://arxiv.org/abs/2601.18119" PREDEFINED_QUERIES: dict[str, list[tuple[str, str]]] = { "easy_syntax_fix": [ ("Broken baseline: typo table", "SELECT * FROM userss;"), ("Simple lookup", "SELECT id, name FROM users ORDER BY id LIMIT 10;"), ("Potential invalid write", "UPDATE users SET name='test';"), ], "medium_logic_fix": [ ("Broken: missing GROUP BY", "SELECT department, COUNT(*) FROM employees;"), ("Revenue by month", "SELECT strftime('%Y-%m', order_date) AS ym, SUM(amount) FROM orders GROUP BY ym ORDER BY ym;"), ("Top entities", "SELECT customer_id, SUM(total) AS spend FROM invoices GROUP BY customer_id ORDER BY spend DESC LIMIT 5;"), ], "hard_multi_bug": [ ("Broken join alias", "SELECT u.name, o.total FROM users u JOIN orders o ON user.id = o.user_id;"), ("Join + aggregate", "SELECT p.category, AVG(p.price) AS avg_price FROM products p GROUP BY p.category ORDER BY avg_price DESC;"), ("Nested query", "SELECT name FROM customers WHERE id IN (SELECT customer_id FROM orders GROUP BY customer_id HAVING COUNT(*) > 2);"), ], "hard_finance_explosion": [ ("Broken finance calc", "SELECT account_id, SUM(amount) / COUNT(*) AS risk FROM txn GROUP BY account;"), ("PnL-style aggregate", "SELECT symbol, SUM(CASE WHEN side='BUY' THEN -notional ELSE notional END) AS pnl FROM trades GROUP BY symbol ORDER BY pnl DESC;"), ("Daily exposure", "SELECT date(trade_ts) AS d, SUM(abs(notional)) AS exposure FROM trades GROUP BY d ORDER BY d;"), ], } GRADIO_CSS = """ :root { --sde-ink: #0f172a; --sde-muted: #64748b; --sde-line: #e2e8f0; --sde-card: #ffffff; --sde-glow: radial-gradient(120% 140% at 0% 0%, rgba(45, 212, 191, 0.22) 0%, rgba(45, 212, 191, 0) 52%), radial-gradient(120% 140% at 100% 0%, rgba(147, 197, 253, 0.22) 0%, rgba(147, 197, 253, 0) 55%), linear-gradient(132deg, #0f172a 0%, #1e293b 45%, #0f766e 100%); /* HF Space embed is often dark; keep prose readable (theme alone used near-black on black). */ --sde-body-text: #e2e8f0; --sde-heading-text: #f8fafc; } .gradio-container { max-width: 1180px !important; margin-left: auto !important; margin-right: auto !important; color: #e2e8f0 !important; } /* Gradio 5 + HF: markdown / prose defaults can match a dark shell and disappear */ .gradio-container .prose, .gradio-container .prose p, .gradio-container .prose li, .gradio-container .prose td, .gradio-container .prose th { color: var(--sde-body-text) !important; } .gradio-container .prose h1, .gradio-container .prose h2, .gradio-container .prose h3, .gradio-container .prose h4, .gradio-container .prose strong { color: var(--sde-heading-text) !important; } .gradio-container .prose a { color: #7dd3fc !important; } .gradio-container .prose code { color: #fef3c7 !important; background: rgba(15, 23, 42, 0.55) !important; } .sde-hero-wrap { background: var(--sde-glow); color: #f8fafc; border-radius: 20px; padding: 1.75rem 1.5rem 1.5rem; margin-bottom: 1.25rem; border: 1px solid rgba(148, 163, 184, 0.24); box-shadow: 0 18px 40px rgba(15, 23, 42, 0.20), inset 0 1px 0 rgba(255, 255, 255, 0.12); } .sde-hero-wrap .sde-hero-title { margin: 0 0 0.35rem 0; font-size: 1.85rem; font-weight: 800; letter-spacing: -0.03em; line-height: 1.2; color: #f8fafc !important; } .sde-hero-wrap .sde-hero-lede { margin: 0 0 0.5rem 0; color: #e2e8f0 !important; font-size: 0.95rem; line-height: 1.55; } .sde-hero-subnav { margin-bottom: 0.75rem; font-size: 0.88rem; } .sde-hero-wrap .sde-hero-subnav a { color: #a5f3fc !important; font-weight: 600; } .sde-pill-row { display: flex; flex-wrap: wrap; gap: 0.5rem; margin-top: 1rem; } .sde-pill { display: inline-block; padding: 0.35rem 0.75rem; border-radius: 999px; font-size: 0.72rem; font-weight: 700; letter-spacing: 0.06em; text-transform: uppercase; background: rgba(15, 23, 42, 0.26); border: 1px solid rgba(226, 232, 240, 0.34); color: #f8fafc; } .sde-section-title { font-size: 1.05rem; font-weight: 700; color: var(--sde-heading-text) !important; margin: 1.5rem 0 0.75rem 0; letter-spacing: -0.02em; } .sde-muted-caption { color: #94a3b8 !important; font-size: 0.9rem; } .sde-link-row a { color: #7dd3fc !important; font-weight: 600; margin-right: 1rem; } .sde-kpi-grid { display: grid; grid-template-columns: repeat(4, minmax(0, 1fr)); gap: 0.75rem; margin: 0.5rem 0 1rem; } .sde-kpi { background: #ffffff; border: 1px solid #dbe3f0; border-radius: 14px; padding: 0.85rem 0.95rem; box-shadow: 0 10px 24px rgba(15, 23, 42, 0.06); } .sde-kpi .v { font-size: 1.25rem; font-weight: 800; letter-spacing: -0.02em; color: #0f172a; } .sde-kpi .k { margin-top: 0.15rem; font-size: 0.73rem; text-transform: uppercase; letter-spacing: 0.06em; color: #64748b; } .sde-callout { border-left: 4px solid #2563eb; background: #eff6ff; color: #1e3a8a; padding: 0.7rem 0.8rem; border-radius: 8px; margin: 0.5rem 0 0.75rem; font-size: 0.86rem; } @media (max-width: 900px) { .sde-kpi-grid { grid-template-columns: repeat(2, minmax(0, 1fr)); } } """ def _api_base() -> str: return os.environ.get( "INTERNAL_API_BASE", f"http://127.0.0.1:{os.environ.get('PORT', '7860')}", ).rstrip("/") def _blog_url() -> str: return (os.environ.get("BLOG_URL") or "").strip() def _http() -> httpx.Client: return httpx.Client(timeout=120.0) def _img_path(static_dir: Path, *names: str) -> Optional[str]: for n in names: p = static_dir / n if p.is_file(): return str(p.resolve()) return None def _preset_options(task_id: str) -> list[str]: return [name for name, _ in PREDEFINED_QUERIES.get(task_id, [])] def _preset_query(task_id: str, preset_name: str) -> str: for name, query in PREDEFINED_QUERIES.get(task_id, []): if name == preset_name: return query return "" def _safe_reward(value: Any) -> float: try: return float(value) except Exception: return 0.0 def build_blocks(static_dir: Path) -> Any: import gradio as gr wf = _img_path(static_dir, "diagram-end-to-end-workflow.png", "environment-workflow.png") chart_leap = _img_path(static_dir, "chart-performance-leap.png", "hero_performance_leap.png") chart_dual = _img_path(static_dir, "chart-comparison-shift.png", "hero_dual_benchmark.png") chart_spider = _img_path(static_dir, "chart-spider-benchmark.png", "hero_spider_sota.png") proof_combo = _img_path(static_dir, "proof-combo.png") proof_dist = _img_path(static_dir, "proof-distribution-shift.png") final_gallery_paths = [ "training_reward_curve_final.png", "training_diagnostics_dual_axis_final.png", "baseline_vs_trained_by_task_final.png", "task_delta_post_minus_base_final.png", "reward_distribution_shift_red_green_final.png", "presentation_combo_final.png", "benchmark_style_summary_final.png", "checkpoint_leaderboard_step_vs_reward_final.png", "cost_vs_performance_final.png", ] final_gallery: list[tuple[str, str]] = [] for filename in final_gallery_paths: path = _img_path(static_dir, filename) if path: title = filename.replace("_final.png", "").replace("_", " ").title() final_gallery.append((path, title)) blog = _blog_url() blog_md = ( f"### Blog\n[Read the write-up]({blog})" if blog else "### Blog\nAdd a **Space secret** named `BLOG_URL` with your post URL (e.g. Medium, personal site, or Hugging Face blog)." ) task_choices = [ "easy_syntax_fix", "medium_logic_fix", "hard_multi_bug", "hard_finance_explosion", ] def reset_fn( task_id: str, session_id: Optional[str] ) -> Tuple[str, str, str, str]: sid = session_id or str(uuid.uuid4()) try: with _http() as client: r = client.post( f"{_api_base()}/reset", json={"task_id": task_id}, headers={"X-Session-Id": sid}, ) r.raise_for_status() data = r.json() except Exception as e: err = {"error": str(e), "hint": "Is the server listening on PORT?"} return json.dumps(err, indent=2), "", sid, f"Session: `{sid}` · **error**" obs = json.dumps(data, indent=2) q = (data.get("observation") or {}).get("original_query") or "" return obs, q, sid, f"Session: `{sid}`" def submit_fn( query: str, session_id: Optional[str] ) -> Tuple[str, str]: if not session_id: return ( json.dumps({"error": "Click “Reset task” first to create a session."}, indent=2), "", ) payload = {"action": {"action_type": "submit_query", "query": query or ""}} try: with _http() as client: r = client.post( f"{_api_base()}/step", json=payload, headers={"X-Session-Id": session_id}, ) r.raise_for_status() data = r.json() except httpx.HTTPStatusError as e: try: detail = e.response.json() except Exception: detail = e.response.text return json.dumps({"error": str(e), "detail": detail}, indent=2), "" except Exception as e: return json.dumps({"error": str(e)}, indent=2), "" out = json.dumps(data, indent=2) reward = data.get("reward") done = data.get("done") return out, f"**reward** `{reward}` · **done** `{done}`" def run_preset_suite( task_id: str, session_id: Optional[str] ) -> Tuple[str, str, str, str]: sid = session_id or str(uuid.uuid4()) presets = PREDEFINED_QUERIES.get(task_id, []) if not presets: return "No presets for selected task.", "{}", sid, f"Session: `{sid}`" rows: list[str] = [] rewards: list[float] = [] done_count = 0 error_count = 0 with _http() as client: for idx, (name, query) in enumerate(presets, start=1): try: client.post( f"{_api_base()}/reset", json={"task_id": task_id}, headers={"X-Session-Id": sid}, ).raise_for_status() step_resp = client.post( f"{_api_base()}/step", json={"action": {"action_type": "submit_query", "query": query}}, headers={"X-Session-Id": sid}, ) step_resp.raise_for_status() data = step_resp.json() reward = _safe_reward(data.get("reward")) done = bool(data.get("done")) info = data.get("info") or {} label = "pass" if reward >= 0.5 else "check" rewards.append(reward) done_count += int(done) note = "review_rejected" if info.get("review_rejected") else "" rows.append( f"| {idx} | {name} | `{reward:.3f}` | `{done}` | {label} {note} |" ) except Exception as e: error_count += 1 rows.append( f"| {idx} | {name} | `0.000` | `False` | error: {str(e)[:120]} |" ) avg_reward = (sum(rewards) / len(rewards)) if rewards else 0.0 max_reward = max(rewards) if rewards else 0.0 min_reward = min(rewards) if rewards else 0.0 suite_md = ( "#### Preset suite report\n" "| # | Preset | Reward | Done | Note |\n" "|---|---|---:|:---:|---|\n" + "\n".join(rows) + "\n\n" + f"**Summary:** avg reward `{avg_reward:.3f}` · min `{min_reward:.3f}` · max `{max_reward:.3f}` · " f"done count `{done_count}` · errors `{error_count}`" ) suite_json = json.dumps( { "task_id": task_id, "session_id": sid, "n_presets": len(presets), "avg_reward": round(avg_reward, 4), "min_reward": round(min_reward, 4), "max_reward": round(max_reward, 4), "done_count": done_count, "error_count": error_count, }, indent=2, ) return suite_md, suite_json, sid, f"Session: `{sid}`" font = gr.themes.GoogleFont("Plus Jakarta Sans") mono = gr.themes.GoogleFont("JetBrains Mono") theme = gr.themes.Soft( primary_hue="indigo", secondary_hue="slate", neutral_hue="slate", font=(font, "ui-sans-serif", "system-ui"), font_mono=(mono, "ui-monospace", "monospace"), ) with gr.Blocks( title="SQL Debug Environment", analytics_enabled=False, theme=theme, css=GRADIO_CSS, ) as demo: gr.HTML( """
HTML demo (main Space page) · OpenEnv API
SQL Debug Environment

OpenEnv-compliant SQL repair · live SQLite rewards · TRL / GRPO training on this same Space. One page: benchmarks, artifacts, architecture, and a live playground.

OpenEnv FastAPI Gradio TRL · GRPO
""".strip(), elem_id="sde-hero", ) gr.Markdown( "### First context: training proof first\n" f"- **Source code:** [GitHub — mdayan8/sql-debug-env]({GITHUB_REPO})\n" f"- **First training notebook (auto-install cell):** [Open in Colab]({COLAB_FIRST_TRAINING})\n" f"- **Full training Colab (root anchor):** [Open in Colab]({COLAB_TRAINING_ROOT})\n" f"- **Weights & Biases (example run):** [Dashboard]({WANDB_TRAINING_RUN})\n" f"- **Sample-reward eval artifacts (32-run JSON on Hub):** [Browse files]({HF_SAMPLE_REWARDS})\n" f"- **Earlier 32-eval pass folder:** [Browse files]({HF_EVAL_32})\n" f"- **Trained model card:** [md896/sql-debug-agent…]({HF_MODEL})\n" f"- **This Space:** [{HF_SPACE}]({HF_SPACE})" ) gr.HTML( """
0.5B → 7B
Model progression
32-run eval
Final artifact pass
78.5%
Spider-style headline
Execution reward
Primary training signal
""".strip() ) gr.HTML( '
Notebook vibe: this page is intentionally written as field notes + reproducible cells, not a static deck. Every number should map to an artifact.
' ) gr.Code( label="First training context cell (from your Colab)", language="python", interactive=False, value=( "# 🏆 SQL Debug Env: FINAL REAL-WORLD BRIDGE\n" "import os\n" "print('📦 Checking libraries...')\n" "os.system('pip install trl accelerate wandb -U')\n\n" "import httpx\n" "import torch\n" ), lines=8, ) gr.Markdown( "### Lab notebook stats (TL;DR)\n" "- First training pass started with **Qwen/Qwen2.5-Coder-0.5B-Instruct** for environment wiring and fast iteration.\n" "- Main training/eval track used **Qwen/Qwen2.5-Coder-7B-Instruct** with execution-grounded reward loops.\n" "- Final reporting is tied to run artifacts and static charts committed under `server/static/`." ) gr.Markdown( "| Track | Model | Role | Evidence |\n" "|---|---|---|---|\n" "| First bridge run | Qwen/Qwen2.5-Coder-0.5B-Instruct | Fast validation of API/reward loop and notebook flow | First training context + W&B run |\n" "| Base reference | Qwen/Qwen2.5-Coder-7B-Instruct | Baseline behavior before RL updates | Spider/comparison charts |\n" "| Current agent | RL-updated checkpoint on 7B track | Improved execution-grounded SQL fixing | HF model + eval artifacts + sample rewards |\n" ) gr.Markdown( "### Run timeline (quick history)\n" "| Stage | What happened | Why it matters |\n" "|---|---|---|\n" "| Bridge run | Fast setup with **Qwen2.5-Coder-0.5B** | Validated API + reward wiring quickly |\n" "| Main baseline | Moved to **Qwen2.5-Coder-7B-Instruct** | Better capacity for SQL structure + joins |\n" "| RL iterations | Session-consistent reset/step reward loop | Converted text quality into runtime behavior |\n" "| Hard-test reporting | `presentation_graphs_out_final` committed under `server/static/` | Keeps evaluation auditable on-page |\n" ) gr.Code( label="Notebook cell: baseline evaluator sketch (7B track)", language="python", interactive=False, value=( "from transformers import AutoTokenizer, AutoModelForCausalLM\n" "MODEL = 'Qwen/Qwen2.5-Coder-7B-Instruct'\n" "tokenizer = AutoTokenizer.from_pretrained(MODEL)\n" "model = AutoModelForCausalLM.from_pretrained(MODEL, device_map='auto')\n" "# generate SQL -> POST /reset -> POST /step -> score by execution reward\n" ), lines=7, ) gr.Code( label="Notebook cell: regenerate presentation plots from real artifacts", language="shell", interactive=False, value=( "python presentation_graphs.py \\\n" " --sample-rewards-json artifacts/runs/20260426-064318-sample-rewards-32eval/sample_rewards_final.json \\\n" " --output-dir presentation_graphs_out_final\n" "cp presentation_graphs_out_final/*.png server/static/\n" ), lines=5, ) gr.Code( label="Notebook cell: live reward loop (execution-grounded)", language="python", interactive=False, value=( "with httpx.Client(base_url=ENV_URL, timeout=30.0) as client:\n" " client.post('/reset', json={'task_id': task}, headers={'X-Session-Id': sid})\n" " resp = client.post('/step', json={'action': {'action_type': 'submit_query', 'query': sql}},\n" " headers={'X-Session-Id': sid})\n" " reward = resp.json().get('reward', 0.0)\n" " # reward drives policy updates and eval comparisons\n" ), lines=7, ) gr.Markdown( "### Failure taxonomy (from runtime debugging)\n" "| Failure type | Typical symptom | Why execution feedback helps |\n" "|---|---|---|\n" "| Schema mismatch | unknown table/column | reward drops immediately and error details guide correction |\n" "| Join logic bug | duplicated or missing rows | execution reveals semantic mismatch not visible in text quality |\n" "| Aggregation bug | incorrect GROUP BY totals | deterministic graders expose numerical drift |\n" "| Risky query behavior | unsafe or invalid action | reviewer path blocks while preserving learning signal |\n" ) gr.Markdown('

Benchmark visuals

') gr.Markdown( "| Metric snapshot | Value |\n" "|---|---|\n" "| Spider chart: Industry baseline | **48.2%** |\n" "| Spider chart: Qwen-7B base | **52.4%** |\n" "| Spider chart: RL agent | **78.5%** |\n" "| Performance leap chart | **0.0% -> 25.0%** (base to RL in that run view) |\n" ) with gr.Row(equal_height=True): if chart_leap: gr.Image(value=chart_leap, label="Performance leap (Spider-style)", type="filepath", scale=1) if chart_dual: gr.Image(value=chart_dual, label="Comparison + reward shift", type="filepath", scale=2) if chart_spider: gr.Image(value=chart_spider, label="Spider-style headline chart", type="filepath", scale=1) gr.Markdown( '

Training run charts (repo static)

' 'Training plots from real runs. Regenerate with `presentation_graphs.py`; commit PNGs under `server/static/`.' ) with gr.Row(): if proof_combo: gr.Image(value=proof_combo, label="Presentation combo", type="filepath", scale=1) if proof_dist: gr.Image(value=proof_dist, label="Reward distribution shift", type="filepath", scale=1) if final_gallery: gr.Markdown( '

Hard-testing proof set (presentation_graphs_out_final)

' "All generated graphs from the final evaluation set." ) gr.Gallery( value=final_gallery, label="Final hard-testing charts", preview=True, columns=3, height="auto", object_fit="contain", ) gr.Markdown('

Environment architecture

') if wf: gr.Image(value=wf, label="End-to-end workflow", type="filepath", show_label=True) else: gr.Markdown("*Add `server/static/diagram-end-to-end-workflow.png`*") gr.Markdown( '

OpenEnv HTTP API

' f"`GET /health` · `GET /tasks` · `POST /reset` · `POST /step` · `POST /step_with_review` · `GET /state` · `GET /benchmark` · " f"loopback base `{_api_base()}` (override with **INTERNAL_API_BASE**)." ) gr.Markdown('

Live playground

') session = gr.State(None) session_md = gr.Markdown("Session: *click “Reset task”*") with gr.Row(): task = gr.Dropdown( choices=task_choices, value="easy_syntax_fix", label="Task", scale=1, ) btn_reset = gr.Button("Reset task", variant="primary", scale=0, min_width=140) btn_submit = gr.Button("Submit query", variant="secondary", scale=0, min_width=140) btn_run_suite = gr.Button("Run preset suite", variant="secondary", scale=0, min_width=160) preset_name = gr.Dropdown( choices=_preset_options("easy_syntax_fix"), value=_preset_options("easy_syntax_fix")[0], label="Predefined test query", ) btn_load_preset = gr.Button("Load predefined query", variant="secondary") sql = gr.Code(label="Candidate SQL", language="sql", lines=12) result_hint = gr.Markdown("") with gr.Row(): obs_json = gr.Code( language="json", label="Observation (/reset)", lines=12, interactive=False, scale=1, ) step_json = gr.Code( language="json", label="Step (/step)", lines=12, interactive=False, scale=1, ) suite_md = gr.Markdown("") suite_json = gr.Code( label="Preset suite summary", language="json", lines=10, interactive=False, ) btn_reset.click( reset_fn, inputs=[task, session], outputs=[obs_json, sql, session, session_md], ) btn_submit.click( submit_fn, inputs=[sql, session], outputs=[step_json, result_hint], ) task.change( lambda t: gr.Dropdown( choices=_preset_options(t), value=_preset_options(t)[0] if _preset_options(t) else None, ), inputs=[task], outputs=[preset_name], ) btn_load_preset.click( lambda t, p: _preset_query(t, p or ""), inputs=[task, preset_name], outputs=[sql], ) btn_run_suite.click( run_preset_suite, inputs=[task, session], outputs=[suite_md, suite_json, session, session_md], ) gr.Markdown('

Blog

') gr.Markdown(blog_md) gr.Markdown( "### Why I picked SQL debugging and why this architecture exists\n" "“The goal is not to generate beautiful SQL text. The goal is to produce SQL fixes that survive execution, repeatedly, under changing runtime conditions.”\n\n" "### The cost of “almost right” SQL\n" "Industry time-use reporting commonly puts **roughly a quarter to a third** of analytics and data-engineering work into fixing queries and pipelines—" "**not** shipping net-new insights, **not** launching features, but **debugging SQL that already looked reasonable** in a notebook or PR.\n\n" "### Benchmarks vs production\n" "On Spider-style leaderboards, headline numbers often sit in the **high 80s to low 90s (%)**. In messy enterprise warehouses—drifting schemas, implicit business rules, " "join explosions, permissioned views—teams routinely describe effective success rates closer to the **10–30%** band unless the system closes the loop with " "**execution-grounded feedback** (run the SQL, read the error or result, attribute reward to what changed).\n\n" "SQL debugging is one of the few tasks where *language quality* and *system quality* diverge sharply: a query can be neat, plausible, and still fail in production. " "This project forces the agent to optimize for **behavior under execution**, not only fluency under prompting." ) gr.HTML( """
0.5B -> 7B
Model track from first bridge run to main baseline.
32-run eval
Final artifact path with sample rewards and run logs.
Execution-first
Reward is computed from runtime outcomes, not prompt resemblance.
Traceable claims
Metrics should map back to run files and checkpoints.
""".strip() ) gr.Markdown( "#### What leaderboards hide\n" "Canonical text-to-SQL suites are valuable scientific instruments: they keep comparisons honest. They are also cleaner than most corporate warehouses. " "That is why two statements can both be true: models can score **very high** on Spider-style tasks while practitioners still report **low tens to low thirties** " "effective reliability in production unless they invest in harnesses, guardrails, and iterative repair grounded in execution.\n\n" "- **Latency of truth**: prose feedback is fast; execution feedback is slower—and decisive.\n" "- **Credit assignment**: without runtime signal you reward plausible text; with it you reward joins, aggregates, and safe rewrites that actually run.\n" "- **Drift**: schemas evolve; the training surface must stay repeatable even when the world is messy.\n\n" "#### OpenEnv framing (why this is not just a demo UI)\n" "The environment follows an OpenEnv-style interface: `reset -> observation`, `step(action) -> observation, reward, done, info`. " "This is important because it gives the training loop a stable contract. Every algorithmic change can be tested against the same API semantics, which improves reproducibility.\n\n" "#### Reward math (what is actually optimized)\n" "At a high level, each step reward is composed from executed outcomes:\n\n" "\\[\n" "R_t = w_c C_t + w_e E_t + w_p P_t + w_s S_t - \\lambda \\cdot \\text{Penalty}_t\n" "\\]\n\n" "- \\(C_t\\): correctness signal (did query satisfy the task objective)\n" "- \\(E_t\\): execution quality (valid execution / error handling)\n" "- \\(P_t\\): progress toward a valid fix\n" "- \\(S_t\\): schema-aware behavior bonus\n" "- Penalty: unsafe / invalid / degenerate behavior\n\n" "Episode objective:\n\n" "\\[\n" "J(\\pi) = \\mathbb{E}_{\\tau \\sim \\pi}\\left[\\sum_{t=0}^{T} \\gamma^t R_t\\right]\n" "\\]\n\n" "This makes the optimization target explicit: not token similarity, but expected runtime return.\n\n" "#### Architecture decisions that matter technically\n" "1. **Session-isolated database state**: each episode gets a clean in-memory SQLite environment.\n" "2. **Deterministic tasks/graders**: stable reward surfaces for comparison across runs.\n" "3. **Reviewer-guard path**: risk control without collapsing the learning signal.\n" "4. **Typed observations + action history**: easier debugging and post-hoc analysis.\n\n" "#### Data and reporting stats on this page\n" "| Metric | Value | Source |\n" "|---|---:|---|\n" "| Spider-style industry baseline | 48.2% | chart-spider-benchmark |\n" "| Qwen-7B base | 52.4% | chart-spider-benchmark |\n" "| RL agent headline | 78.5% | chart-spider-benchmark |\n" "| Performance leap view | 0.0% -> 25.0% | chart-performance-leap |\n" "| Eval artifact pass | 32-run | HF run folder + sample rewards |\n\n" "#### Why start with 0.5B then move to 7B\n" "The first bridge run on **Qwen2.5-Coder-0.5B** is intentionally about speed of iteration: verify environment wiring, reward path, and notebook workflow quickly. " "The **7B track** is then used for stronger SQL reasoning capacity and better convergence under execution-grounded rewards.\n\n" "#### How to read this Space\n" "- **Diagram** — client → API → env core → data/reward → training and artifacts.\n" "- **Playground** — same `POST /reset` and `POST /step` loop as training, with explicit `X-Session-Id`.\n" "- **Charts + static PNGs** — committed under `server/static/` so claims stay diffable and auditable.\n\n" "#### Motivation recap\n" "I did not build this to prove that a model can emit valid-looking SQL. I built it to make SQL repair measurable as an engineering problem under runtime constraints. " "The evidence-first layout (first context, live loop, artifact chain) is deliberate: each reported number should be traceable to run data, not presentation-only visuals.\n\n" "*Note: percentage ranges summarize common practitioner reporting and public benchmark narratives; your organization’s numbers will differ—treat them as motivation to measure, not as universal constants.*" ) gr.Markdown( f"- [Google Cloud: techniques for improving text-to-SQL]({GCLOUD_TEXT2SQL_BLOG})\n" f"- [OurBench / Squirrel: enterprise SQL debugging benchmark]({OURBENCH_PAPER})\n" f"- [GitHub repository]({GITHUB_REPO})" ) return demo def mount_gradio(app: Any, static_dir: Path) -> Any: """Mount single-page Gradio at `/` (Space home) while API routes stay on the same app.""" import gradio as gr blocks = build_blocks(static_dir) return gr.mount_gradio_app( app, blocks, path="/gradio", allowed_paths=[str(static_dir.resolve())], )