Spaces:

md896
/

sql-debug-env

Running

md896 committed 12 days ago

Commit

029f9cf

1 Parent(s): 4724001

Ship polished Space UI with Gradio dashboard and evidence-rich demo.

Add the upgraded Gradio experience, structured blog/reporting sections, and curated artifact images while keeping OpenEnv API endpoints intact.

Made-with: Cursor

Files changed (16) hide show

requirements.txt +1 -0
server/demo_page.html +900 -0
server/gradio_ui.py +710 -0
server/main.py +50 -1
server/static/baseline_vs_trained_by_task_final.png +0 -0
server/static/benchmark_style_summary_final.png +0 -0
server/static/chart-comparison-shift.png +0 -0
server/static/chart-performance-leap.png +0 -0
server/static/chart-spider-benchmark.png +0 -0
server/static/checkpoint_leaderboard_step_vs_reward_final.png +0 -0
server/static/cost_vs_performance_final.png +0 -0
server/static/presentation_combo_final.png +0 -0
server/static/proof-combo.png +0 -0
server/static/proof-distribution-shift.png +0 -0
server/static/reward_distribution_shift_red_green_final.png +0 -0
server/static/task_delta_post_minus_base_final.png +0 -0

requirements.txt CHANGED Viewed

@@ -5,4 +5,5 @@ openenv-core>=0.1.0
 openai>=2.0.0
 httpx>=0.27.0
 python-multipart==0.0.9

 openai>=2.0.0
 httpx>=0.27.0
 python-multipart==0.0.9
+gradio>=4.44.0

server/demo_page.html ADDED Viewed

	@@ -0,0 +1,900 @@

+<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1, viewport-fit=cover" />
+  <meta name="color-scheme" content="light" />
+  <meta name="theme-color" content="#f6f7fb" />
+  <meta name="description" content="SQL Debug OpenEnv: architecture, live /reset and /step playground, and training evidence. Hugging Face Space." />
+  <meta property="og:title" content="SQL Debug Environment — Space Demo" />
+  <meta property="og:description" content="OpenEnv-compliant SQL debugging environment with live rewards, GRPO training hooks, and reproducible artifacts." />
+  <meta property="og:type" content="website" />
+  <title>SQL Debug Environment · Hugging Face Space</title>
+  <link rel="preconnect" href="https://fonts.googleapis.com" />
+  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
+  <link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,400;0,9..40,500;0,9..40,600;0,9..40,700;1,9..40,500&family=Fraunces:ital,opsz,wght@0,9..144,500;0,9..144,600;0,9..144,700;1,9..144,500&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet" />
+  <style>
+    /* Design tokens: custom properties that the rules below read via var(). */
+    :root {
+      /* Page background, raised surfaces and border strengths. */
+      --space-bg: #f0f2f6;
+      --space-bg-elevated: #fafbfe;
+      --space-border: #e2e6ef;
+      --space-border-strong: #cdd5e5;
+      /* Text colors, darkest to lightest. */
+      --ink: #111827;
+      --ink-soft: #374151;
+      --muted: #6b7280;
+      --muted-light: #9ca3af;
+      /* Card surface plus its resting and hover shadows. */
+      --card: #ffffff;
+      --card-shadow: 0 1px 2px rgba(16, 24, 40, 0.04), 0 8px 28px rgba(16, 24, 40, 0.06);
+      --card-shadow-hover: 0 1px 2px rgba(16, 24, 40, 0.06), 0 12px 36px rgba(16, 24, 40, 0.08);
+      /* Accent colors: amber for section eyebrows, blue for links and focus rings. */
+      --hf-amber: #f59e0b;
+      --hf-amber-soft: #fff7ed;
+      --accent: #2563eb;
+      --accent-soft: #eff6ff;
+      /* Dark frame used behind the architecture diagram image. */
+      --diagram-bg: #0c1222;
+      --diagram-border: #1e293b;
+      /* Corner radii. */
+      --radius: 14px;
+      --radius-lg: 20px;
+      /* Font stacks; the first family in each is loaded by the Google Fonts link in <head>. */
+      --font: "DM Sans", system-ui, -apple-system, sans-serif;
+      --font-display: "Fraunces", Georgia, serif;
+      --font-mono: "JetBrains Mono", ui-monospace, monospace;
+      /* Safe-area insets, falling back to 0px where env() has no value. */
+      --safe-top: env(safe-area-inset-top, 0px);
+      --safe-bottom: env(safe-area-inset-bottom, 0px);
+    }
+    /* Base rules: box model, smooth anchor scrolling, body typography and keyboard focus ring. */
+    * { box-sizing: border-box; }
+    html {
+      scroll-behavior: smooth;
+      /* Offsets in-page anchor targets so they do not land under the sticky banner and nav.
+         NOTE(review): 112px presumably approximates their combined height; confirm if either changes. */
+      scroll-padding-top: 112px;
+    }
+    body {
+      margin: 0;
+      font-family: var(--font);
+      color: var(--ink);
+      background: var(--space-bg);
+      /* 100vh is the fallback; the later 100dvh declaration wins where dynamic viewport units are supported. */
+      min-height: 100vh;
+      min-height: 100dvh;
+      line-height: 1.55;
+      -webkit-font-smoothing: antialiased;
+    }
+    a { color: var(--accent); }
+    /* Keyboard focus ring. A later select/textarea :focus rule of equal specificity overrides this outline for those two controls. */
+    a:focus-visible, button:focus-visible, select:focus-visible, textarea:focus-visible {
+      outline: 2px solid var(--accent);
+      outline-offset: 2px;
+    }
+    /* Page shell: a full-height column so the footer (margin-top: auto) can sit at the bottom. */
+    .space-shell {
+      min-height: 100vh;
+      min-height: 100dvh;
+      display: flex;
+      flex-direction: column;
+    }
+    /* Sticky top banner; z-index 40 keeps it above the sticky nav (z-index 30). */
+    .space-banner {
+      position: sticky;
+      top: 0;
+      z-index: 40;
+      /* Top padding grows by the safe-area inset. */
+      padding: calc(10px + var(--safe-top)) 16px 10px;
+      background: linear-gradient(180deg, rgba(255,255,255,0.96) 0%, rgba(250,251,254,0.94) 100%);
+      /* Blur content scrolling underneath; the -webkit- line is the prefixed duplicate. */
+      backdrop-filter: blur(12px);
+      -webkit-backdrop-filter: blur(12px);
+      border-bottom: 1px solid var(--space-border);
+      box-shadow: 0 4px 24px rgba(15, 23, 42, 0.04);
+    }
+    /* Centered content row; wraps when brand and actions do not fit on one line. */
+    .space-banner-inner {
+      max-width: 1120px;
+      margin: 0 auto;
+      display: flex;
+      flex-wrap: wrap;
+      align-items: center;
+      justify-content: space-between;
+      gap: 12px 20px;
+    }
+    .space-brand {
+      display: flex;
+      align-items: center;
+      gap: 12px;
+      flex: 1 1 auto;
+      /* Allows this flex item to shrink below its content width. */
+      min-width: 0;
+    }
+    /* Logo is a pure-CSS amber gradient tile (no image). */
+    .space-logo {
+      width: 38px;
+      height: 38px;
+      border-radius: 10px;
+      background: linear-gradient(135deg, #fbbf24, #f59e0b);
+      box-shadow: 0 2px 8px rgba(245, 158, 11, 0.35);
+      flex-shrink: 0;
+    }
+    .space-brand h1 {
+      margin: 0;
+      font-family: var(--font-display);
+      font-size: 1.05rem;
+      font-weight: 600;
+      letter-spacing: -0.02em;
+      color: var(--ink);
+      line-height: 1.2;
+    }
+    .space-brand p {
+      margin: 2px 0 0;
+      font-size: 0.75rem;
+      color: var(--muted);
+      font-weight: 500;
+    }
+    .space-actions {
+      display: flex;
+      flex-wrap: wrap;
+      align-items: center;
+      gap: 8px;
+    }
+    /* Secondary pill button; styled to work on both <a> and <button> (font-family: inherit, text-decoration: none). */
+    .btn-ghost {
+      display: inline-flex;
+      align-items: center;
+      justify-content: center;
+      gap: 6px;
+      padding: 8px 14px;
+      font-size: 0.8125rem;
+      font-weight: 600;
+      font-family: inherit;
+      color: var(--ink-soft);
+      background: var(--card);
+      border: 1px solid var(--space-border-strong);
+      border-radius: 999px;
+      text-decoration: none;
+      cursor: pointer;
+      transition: border-color 0.15s, box-shadow 0.15s, background 0.15s;
+    }
+    .btn-ghost:hover {
+      border-color: var(--muted-light);
+      box-shadow: var(--card-shadow);
+    }
+    /* Primary pill button with an amber gradient fill. */
+    .btn-primary {
+      display: inline-flex;
+      align-items: center;
+      justify-content: center;
+      gap: 6px;
+      padding: 8px 16px;
+      font-size: 0.8125rem;
+      font-weight: 700;
+      font-family: inherit;
+      color: #1c1917;
+      background: linear-gradient(180deg, #fde68a, #fbbf24);
+      border: 1px solid #d97706;
+      border-radius: 999px;
+      text-decoration: none;
+      cursor: pointer;
+      box-shadow: 0 1px 0 rgba(255,255,255,0.5) inset;
+      /* Transitions match the two properties changed by :hover (filter) and :active (transform). */
+      transition: filter 0.15s, transform 0.1s;
+    }
+    .btn-primary:hover { filter: brightness(1.03); }
+    .btn-primary:active { transform: scale(0.98); }
+    /* In-page navigation that sticks below the banner. */
+    .sticky-nav {
+      position: sticky;
+      /* NOTE(review): 58px presumably matches the single-row banner height. The banner row can wrap
+         (flex-wrap: wrap), which would make it taller than this offset; confirm on narrow screens. */
+      top: calc(58px + var(--safe-top));
+      z-index: 30;
+      margin: 0 auto;
+      max-width: 1120px;
+      padding: 0 16px 8px;
+    }
+    /* Pill container that hugs its links (fit-content) but never exceeds the available width. */
+    .sticky-nav-inner {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 6px;
+      padding: 6px;
+      background: var(--card);
+      border: 1px solid var(--space-border);
+      border-radius: 999px;
+      box-shadow: var(--card-shadow);
+      width: fit-content;
+      max-width: 100%;
+    }
+    .sticky-nav a {
+      padding: 8px 14px;
+      font-size: 0.78rem;
+      font-weight: 600;
+      color: var(--muted);
+      text-decoration: none;
+      border-radius: 999px;
+      transition: background 0.15s, color 0.15s;
+      white-space: nowrap;
+    }
+    .sticky-nav a:hover {
+      color: var(--ink);
+      background: var(--space-bg);
+    }
+    /* Main content column: grows to fill the shell and is capped at the same 1120px as banner and nav. */
+    .main {
+      flex: 1;
+      max-width: 1120px;
+      margin: 0 auto;
+      /* Bottom padding grows by the safe-area inset. */
+      padding: 8px 16px calc(32px + var(--safe-bottom));
+      width: 100%;
+    }
+    /* Row of monospace chips listing API endpoints. */
+    .api-strip {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 8px;
+      margin-bottom: 20px;
+    }
+    .api-chip {
+      font-family: var(--font-mono);
+      font-size: 0.68rem;
+      font-weight: 500;
+      padding: 5px 10px;
+      border-radius: 8px;
+      background: var(--card);
+      border: 1px solid var(--space-border);
+      color: var(--ink-soft);
+    }
+    /* The nested span holds the HTTP method and is dimmed relative to the path. */
+    .api-chip span { color: var(--muted); margin-right: 6px; }
+    .section {
+      margin-bottom: 28px;
+    }
+    /* Small uppercase amber eyebrow shown above section headings. */
+    .section-id {
+      font-size: 0.65rem;
+      font-weight: 700;
+      letter-spacing: 0.18em;
+      text-transform: uppercase;
+      color: var(--hf-amber);
+      margin-bottom: 8px;
+    }
+    /* Display heading; font size scales with viewport width between 1.75rem and 2.5rem. */
+    .hero-title {
+      font-family: var(--font-display);
+      font-weight: 600;
+      font-size: clamp(1.75rem, 4.2vw, 2.5rem);
+      line-height: 1.12;
+      margin: 0 0 12px;
+      letter-spacing: -0.02em;
+      color: var(--ink);
+    }
+    .hero-title em {
+      font-style: italic;
+      color: var(--accent);
+    }
+    /* Intro paragraph, limited to 54 characters per line. */
+    .lede {
+      max-width: 54ch;
+      color: var(--muted);
+      font-size: 1rem;
+      margin: 0 0 18px;
+    }
+    /* Row of uppercase chips naming the architecture layers. */
+    .layer-strip {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 8px;
+      margin-bottom: 20px;
+    }
+    .layer {
+      font-size: 0.68rem;
+      font-weight: 700;
+      letter-spacing: 0.05em;
+      text-transform: uppercase;
+      padding: 6px 11px;
+      border-radius: 8px;
+      border: 1px solid var(--space-border);
+      background: var(--space-bg-elevated);
+      color: var(--muted);
+    }
+    /* The bold part of each chip uses the primary text color. */
+    .layer b { color: var(--ink); }
+    /* White card container whose shadow deepens on hover. */
+    .panel {
+      background: var(--card);
+      border: 1px solid var(--space-border);
+      border-radius: var(--radius-lg);
+      padding: 20px;
+      box-shadow: var(--card-shadow);
+      margin-bottom: 20px;
+      transition: box-shadow 0.2s;
+    }
+    .panel:hover { box-shadow: var(--card-shadow-hover); }
+    /* Title on one side, caption on the other; wraps when they do not fit on one row. */
+    .panel-header {
+      display: flex;
+      flex-wrap: wrap;
+      align-items: flex-start;
+      justify-content: space-between;
+      gap: 12px;
+      margin-bottom: 14px;
+    }
+    .panel-header h2 {
+      margin: 0;
+      font-size: 1.1rem;
+      font-weight: 700;
+      color: var(--ink);
+    }
+    .panel-header .caption {
+      margin: 0;
+      font-size: 0.8125rem;
+      color: var(--muted);
+      max-width: 38ch;
+      line-height: 1.45;
+    }
+    /* Dark rounded frame around the diagram; overflow: hidden clips the image to the radius. */
+    .diagram-wrap {
+      border-radius: var(--radius);
+      overflow: hidden;
+      background: var(--diagram-bg);
+      border: 1px solid var(--diagram-border);
+      box-shadow: inset 0 1px 0 rgba(255,255,255,0.06);
+    }
+    .diagram-wrap img {
+      display: block;
+      width: 100%;
+      height: auto;
+      /* Height is capped at the smaller of 68vh and 820px; object-fit: contain scales the image without cropping. */
+      max-height: min(68vh, 820px);
+      object-fit: contain;
+      object-position: center top;
+    }
+    /* Row under the diagram: legend on one side, a short label on the other. */
+    .figure-footer {
+      display: flex;
+      flex-wrap: wrap;
+      justify-content: space-between;
+      gap: 10px;
+      padding-top: 14px;
+      margin-top: 4px;
+      font-size: 0.75rem;
+      color: var(--muted);
+    }
+    .legend {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 12px;
+    }
+    /* Each legend entry gets a 7px square swatch drawn by an empty ::before box. */
+    .legend span::before {
+      content: "";
+      display: inline-block;
+      width: 7px;
+      height: 7px;
+      border-radius: 2px;
+      margin-right: 5px;
+      vertical-align: middle;
+    }
+    /* Swatch color per legend modifier class. */
+    .legend .l-api::before { background: #22c55e; }
+    .legend .l-env::before { background: #a78bfa; }
+    .legend .l-data::before { background: #fb923c; }
+    .legend .l-train::before { background: #2dd4bf; }
+    /* Row of small uppercase pill badges. */
+    .badges {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 8px;
+    }
+    .badge {
+      font-size: 0.65rem;
+      font-weight: 700;
+      letter-spacing: 0.05em;
+      text-transform: uppercase;
+      padding: 5px 10px;
+      border-radius: 999px;
+      border: 1px solid var(--space-border);
+      color: var(--muted);
+      background: var(--space-bg-elevated);
+    }
+    /* Heading block (title + intro paragraph) at the top of a section. */
+    .section-head {
+      margin-bottom: 14px;
+    }
+    .section-head h2 {
+      margin: 0 0 6px;
+      font-family: var(--font-display);
+      font-size: 1.35rem;
+      font-weight: 600;
+      color: var(--ink);
+    }
+    .section-head p {
+      margin: 0;
+      color: var(--muted);
+      font-size: 0.9375rem;
+    }
+    /* Grid defaults to a single column; multi-column layouts only apply from 860px up. */
+    .grid {
+      display: grid;
+      gap: 16px;
+      grid-template-columns: 1fr;
+    }
+    @media (min-width: 860px) {
+      .grid.cols-2 { grid-template-columns: 1fr 1fr; }
+      .grid.cols-12 { grid-template-columns: repeat(12, 1fr); }
+      /* Column spans are declared only inside this query, so below 860px items stack. */
+      .span-4 { grid-column: span 4; }
+      .span-8 { grid-column: span 8; }
+    }
+    /* Card used for playground and reproduce columns; same surface as .panel but without the hover shadow. */
+    .play-card {
+      background: var(--card);
+      border: 1px solid var(--space-border);
+      border-radius: var(--radius-lg);
+      padding: 20px;
+      box-shadow: var(--card-shadow);
+    }
+    /* Form controls: uppercase field labels, shared select/textarea chrome, and a focus ring. */
+    label {
+      display: block;
+      font-size: 0.7rem;
+      font-weight: 700;
+      letter-spacing: 0.07em;
+      text-transform: uppercase;
+      color: var(--muted);
+      margin-top: 14px;
+      margin-bottom: 6px;
+    }
+    /* The first label in a container sits flush with the top. */
+    label:first-of-type { margin-top: 0; }
+    select, textarea {
+      width: 100%;
+      font-family: inherit;
+      font-size: 0.9375rem;
+      border-radius: 10px;
+      border: 1px solid var(--space-border-strong);
+      background: var(--space-bg-elevated);
+      color: var(--ink);
+      padding: 12px 14px;
+      transition: border-color 0.15s, box-shadow 0.15s;
+    }
+    select:focus, textarea:focus {
+      /* Keep a transparent outline instead of `outline: none`. Forced-colors (high-contrast)
+         modes drop box-shadow and repaint outlines in a system color, so this keeps a visible
+         focus indicator there while looking unchanged in normal rendering. */
+      outline: 2px solid transparent;
+      outline-offset: 2px;
+      border-color: var(--accent);
+      box-shadow: 0 0 0 3px var(--accent-soft);
+    }
+    /* Textarea overrides: monospace for SQL, vertical resize only. */
+    textarea {
+      min-height: 140px;
+      resize: vertical;
+      font-family: var(--font-mono);
+      font-size: 0.8125rem;
+      line-height: 1.5;
+    }
+    /* Full-width primary action button used in the playground. */
+    .btn-action {
+      margin-top: 12px;
+      width: 100%;
+      min-height: 46px;
+      font-family: inherit;
+      font-size: 0.9375rem;
+      font-weight: 700;
+      cursor: pointer;
+      border-radius: 10px;
+      border: none;
+      color: #fff;
+      background: linear-gradient(135deg, #2563eb, #4f46e5);
+      box-shadow: 0 4px 14px rgba(37, 99, 235, 0.35);
+      /* Transition every property the state rules below change: opacity (:disabled),
+         filter (:hover) and transform (:active). filter was previously missing, so the
+         hover brightening snapped instead of easing like .btn-primary. */
+      transition: opacity 0.15s, filter 0.15s, transform 0.1s;
+    }
+    .btn-action:hover:not(:disabled) { filter: brightness(1.05); }
+    .btn-action:active:not(:disabled) { transform: scale(0.99); }
+    .btn-action:disabled {
+      opacity: 0.55;
+      cursor: not-allowed;
+    }
+    /* Blue pill that displays the session identifier in the playground. */
+    .session-pill {
+      display: inline-flex;
+      align-items: center;
+      gap: 8px;
+      font-size: 0.75rem;
+      color: var(--muted);
+      margin-bottom: 10px;
+      padding: 6px 12px;
+      background: var(--accent-soft);
+      border-radius: 999px;
+      border: 1px solid #bfdbfe;
+    }
+    .session-pill strong {
+      color: var(--accent);
+      font-family: var(--font-mono);
+      font-weight: 500;
+      font-size: 0.72rem;
+    }
+    /* Block-level code box: preserves line breaks, wraps long lines, and scrolls once content exceeds max-height. */
+    code.pre {
+      display: block;
+      white-space: pre-wrap;
+      font-family: var(--font-mono);
+      font-size: 0.72rem;
+      line-height: 1.5;
+      background: #f8fafc;
+      border: 1px solid var(--space-border);
+      border-radius: 10px;
+      padding: 12px 14px;
+      color: #1e293b;
+      min-height: 72px;
+      max-height: 260px;
+      overflow: auto;
+    }
+    /* Evidence figures: one column by default, two columns from 720px up. */
+    .proof-grid {
+      display: grid;
+      gap: 16px;
+      grid-template-columns: 1fr;
+    }
+    @media (min-width: 720px) {
+      .proof-grid { grid-template-columns: 1fr 1fr; }
+    }
+    /* Figure card; overflow: hidden clips the image to the rounded corners. */
+    .proof-card {
+      border-radius: var(--radius);
+      overflow: hidden;
+      border: 1px solid var(--space-border);
+      background: var(--card);
+      box-shadow: var(--card-shadow);
+    }
+    .proof-card figcaption {
+      padding: 10px 14px;
+      font-size: 0.8125rem;
+      color: var(--muted);
+      border-top: 1px solid var(--space-border);
+      background: var(--space-bg-elevated);
+    }
+    .proof-card img {
+      display: block;
+      width: 100%;
+      height: auto;
+    }
+    /* Vertical list of links, one per row, separated by hairlines (none after the last). */
+    .link-list a {
+      color: var(--accent);
+      text-decoration: none;
+      font-weight: 600;
+      display: block;
+      padding: 10px 0;
+      border-bottom: 1px solid var(--space-border);
+      font-size: 0.9rem;
+    }
+    .link-list a:last-child { border-bottom: 0; }
+    .link-list a:hover { text-decoration: underline; }
+    /* Page footer; margin-top: auto pushes it to the bottom of the flex-column shell. */
+    .space-footer {
+      margin-top: auto;
+      /* Bottom padding grows by the safe-area inset. */
+      padding: 20px 16px calc(16px + var(--safe-bottom));
+      border-top: 1px solid var(--space-border);
+      background: linear-gradient(180deg, var(--space-bg-elevated), var(--space-bg));
+    }
+    .space-footer-inner {
+      max-width: 1120px;
+      margin: 0 auto;
+      display: flex;
+      flex-wrap: wrap;
+      align-items: center;
+      justify-content: space-between;
+      gap: 12px;
+      font-size: 0.8125rem;
+      color: var(--muted);
+    }
+    /* Footer links are muted rather than accent-colored and darken on hover. */
+    .space-footer a { color: var(--muted); font-weight: 600; }
+    .space-footer a:hover { color: var(--ink); }
+    /* Pull quote with a blue left rule; colors are hard-coded rather than read from the tokens. */
+    .blog-quote {
+      border-left: 4px solid #2563eb;
+      background: #eff6ff;
+      color: #1e3a8a;
+      padding: 10px 12px;
+      border-radius: 8px;
+      font-size: 0.9rem;
+      margin: 0 0 12px;
+    }
+    /* Three equal summary tiles; minmax(0, 1fr) lets columns shrink below their content width. */
+    .blog-mini-grid {
+      display: grid;
+      grid-template-columns: repeat(3, minmax(0, 1fr));
+      gap: 8px;
+      margin: 0 0 12px;
+    }
+    .blog-mini {
+      background: #f8fafc;
+      border: 1px solid var(--space-border);
+      border-radius: 10px;
+      padding: 10px;
+      font-size: 0.82rem;
+      color: var(--muted);
+    }
+    /* The bold lead-in of each tile is rendered on its own line. */
+    .blog-mini b { color: var(--ink); display:block; font-size:0.98rem; margin-bottom: 2px; }
+    /* Tiles stack into a single column at 900px and below. */
+    @media (max-width: 900px) {
+      .blog-mini-grid { grid-template-columns: 1fr; }
+    }
+  </style>
+</head>
+<body>
+  <div class="space-shell">
+    <!-- Sticky top banner: brand block first, then the action links/buttons. -->
+    <header class="space-banner">
+      <div class="space-banner-inner">
+        <div class="space-brand">
+          <!-- Decorative logo tile, hidden from assistive technology. -->
+          <div class="space-logo" aria-hidden="true"></div>
+          <div>
+            <h1>SQL Debug Environment</h1>
+            <p>OpenEnv · FastAPI · Live SQL rewards</p>
+          </div>
+        </div>
+        <div class="space-actions">
+          <a class="btn-primary" href="/">Gradio UI</a>
+          <!-- No href or inline handler here; the click behaviour is presumably attached by script via id="btnOpenTab" (handler not visible in this part of the file). -->
+          <button type="button" class="btn-ghost" id="btnOpenTab" title="Opens this demo in a full browser tab">Open full page</button>
+          <a class="btn-ghost" href="https://huggingface.co/spaces/md896/sql-debug-env" target="_blank" rel="noopener">Space on Hub ↗</a>
+        </div>
+      </div>
+    </header>
+    <!-- In-page navigation: each fragment link targets a <section id="..."> below; the last link leaves this page for "/". -->
+    <nav class="sticky-nav" aria-label="On-page navigation">
+      <div class="sticky-nav-inner">
+        <a href="#environment">Environment</a>
+        <a href="#first-training">First Training</a>
+        <a href="#playground">Playground</a>
+        <a href="#evidence">Evidence</a>
+        <a href="#repro">Reproduce</a>
+        <a href="/">Gradio</a>
+      </div>
+    </nav>
+    <main class="main">
+      <!-- Static, non-interactive chips listing endpoint paths; the inner span holds the HTTP method. -->
+      <div class="api-strip" aria-label="Key API endpoints">
+        <span class="api-chip"><span>GET</span>/health</span>
+        <span class="api-chip"><span>GET</span>/tasks</span>
+        <span class="api-chip"><span>POST</span>/reset</span>
+        <span class="api-chip"><span>POST</span>/step</span>
+        <span class="api-chip"><span>POST</span>/step_with_review</span>
+        <span class="api-chip"><span>GET</span>/benchmark</span>
+      </div>
+      <!-- Section 1: architecture overview (intro text, layer chips, workflow diagram, badges). -->
+      <section id="environment" class="section" aria-labelledby="env-title">
+        <p class="section-id">Space · Architecture</p>
+        <h2 class="hero-title" id="env-title">Environment first — <em>how</em> the agent sees the world.</h2>
+        <p class="lede">
+          This Space hosts the same HTTP API your trainer calls: sessions, typed observations, SQLite-backed tasks, and a decomposed reward. Below is the end-to-end map judges can skim in seconds.
+        </p>
+        <!-- Layer chips are hidden from assistive technology (aria-hidden). -->
+        <div class="layer-strip" aria-hidden="true">
+          <span class="layer"><b>Client</b> / agent</span>
+          <span class="layer"><b>API</b> session + JSON</span>
+          <span class="layer"><b>Env</b> SQLDebugEnv</span>
+          <span class="layer"><b>Data</b> tasks + SQLite</span>
+          <span class="layer"><b>Train</b> GRPO + artifacts</span>
+        </div>
+        <div class="panel">
+          <div class="panel-header">
+            <h2>Environment visualization</h2>
+            <p class="caption">Runtime flow (solid) vs training and ops (dashed). Reviewer-guarded path optional for safer rollouts.</p>
+          </div>
+          <div class="diagram-wrap">
+            <!-- Loaded eagerly; width/height give the browser the aspect ratio before the image arrives.
+                 NOTE(review): environment-workflow.png is not among the PNGs added alongside this page; confirm it exists under server/static/. -->
+            <img src="/static/environment-workflow.png" alt="End-to-end workflow: Client, FastAPI, environment core, data and reward layer, training and deployment." width="1600" height="900" loading="eager" decoding="async" />
+          </div>
+          <div class="figure-footer">
+            <!-- Legend classes (l-api, l-env, l-data, l-train) select the swatch color in the stylesheet. -->
+            <div class="legend">
+              <span class="l-api">API</span>
+              <span class="l-env">Env core</span>
+              <span class="l-data">DB / tasks / reward</span>
+              <span class="l-train">Training &amp; Space</span>
+            </div>
+            <span>sql-debug-env workflow</span>
+          </div>
+        </div>
+        <div class="badges">
+          <span class="badge">OpenEnv</span>
+          <span class="badge">TRL · GRPO</span>
+          <span class="badge">Live rewards</span>
+          <span class="badge">Reviewer path</span>
+        </div>
+      </section>
+      <!-- Section 2: external links for the first training run (4 of 12 columns) next to a static code excerpt (8 of 12). -->
+      <section id="first-training" class="section" aria-labelledby="first-training-title">
+        <div class="section-head">
+          <p class="section-id">Training · First Context</p>
+          <h2 id="first-training-title">Start with the first bridge run</h2>
+          <p>This is the exact first training context you shared: dependency bootstrap, W&amp;B tracking, then benchmark/eval steps.</p>
+        </div>
+        <div class="grid cols-12">
+          <div class="play-card span-4">
+            <!-- All links open in a new tab with rel="noopener". -->
+            <div class="link-list">
+              <a href="https://colab.research.google.com/drive/1H6SLfCBhHzRJtnymLgevjfyytWUximF5#scrollTo=j-9MptXvmPk8" target="_blank" rel="noopener">First training context (Colab anchor)</a>
+              <a href="https://colab.research.google.com/drive/1H6SLfCBhHzRJtnymLgevjfyytWUximF5#scrollTo=x5YuvatGyyu_" target="_blank" rel="noopener">Full training notebook anchor</a>
+              <a href="https://wandb.ai/mdayanbag-pesitm/sql-debug-grpo-best-budget/workspace?nw=nwusermdayanbag" target="_blank" rel="noopener">W&amp;B workspace: sql-debug-grpo-best-budget</a>
+              <a href="https://huggingface.co/spaces/md896/sql-debug-env/tree/main/artifacts/runs/20260426-064318-sample-rewards-32eval" target="_blank" rel="noopener">Sample rewards (32-eval) artifacts</a>
+              <a href="https://huggingface.co/md896/sql-debug-agent-qwen25-05b-grpo-wandb-continue-v2" target="_blank" rel="noopener">Model card (winner)</a>
+            </div>
+          </div>
+          <div class="play-card span-8">
+            <!-- This label has no "for" target; it acts as a visual caption for the code box that follows.
+                 The code box content is display text only and its line breaks are significant (white-space: pre-wrap). -->
+            <label>First training context code</label>
+            <code class="pre"># SQL Debug Env: FINAL REAL-WORLD BRIDGE
import os
print("Checking libraries...")
os.system("pip install trl accelerate wandb -U")
import httpx
import torch
import wandb
# W&B workspace: https://wandb.ai/mdayanbag-pesitm/sql-debug-grpo-best-budget/workspace?nw=nwusermdayanbag</code>
+          </div>
+        </div>
+      </section>
+      <!-- Section 3: live playground. The left card collects input; the right card shows the outputs. -->
+      <section id="playground" class="section" aria-labelledby="play-title">
+        <div class="section-head">
+          <p class="section-id">Live · Playground</p>
+          <h2 id="play-title">Try <code style="font-family:var(--font-mono);font-size:0.85em;background:#f1f5f9;padding:2px 6px;border-radius:4px">/reset</code> and <code style="font-family:var(--font-mono);font-size:0.85em;background:#f1f5f9;padding:2px 6px;border-radius:4px">/step</code> from the browser</h2>
+          <p>Use the same <strong>X-Session-Id</strong> header on every call (here: <code style="font-family:var(--font-mono);font-size:0.85em">demo-session</code>).</p>
+        </div>
+        <div class="grid cols-2">
+          <div class="play-card">
+            <!-- The controls are named by their visible <label for>. The previous aria-label attributes
+                 overrode those labels, and the textarea's "SQL query" did not contain the visible text
+                 "Candidate SQL" (WCAG 2.5.3 Label in Name), so they are removed. -->
+            <label for="taskId">Task</label>
+            <select id="taskId">
+              <option value="easy_syntax_fix">easy_syntax_fix</option>
+              <option value="medium_logic_fix">medium_logic_fix</option>
+              <option value="hard_multi_bug">hard_multi_bug</option>
+              <option value="hard_finance_explosion">hard_finance_explosion</option>
+            </select>
+            <!-- resetTask() and submitQuery() are defined outside this part of the file. -->
+            <button type="button" class="btn-action" id="btnReset" onclick="resetTask()">Reset task</button>
+            <label for="query">Candidate SQL</label>
+            <!-- spellcheck is disabled because the field holds SQL, not prose. -->
+            <textarea id="query" placeholder="SELECT ..." spellcheck="false"></textarea>
+            <button type="button" class="btn-action" id="btnSubmit" onclick="submitQuery()">Submit query</button>
+          </div>
+          <div class="play-card">
+            <div class="session-pill">Session <strong>demo-session</strong></div>
+            <!-- Output boxes: ids are unchanged for the page script, which presumably rewrites their text.
+                 aria-live="polite" lets screen readers announce the new content after a reset or submit. -->
+            <label>Task observation</label>
+            <code id="observation" class="pre" aria-live="polite">Run “Reset task” to load the broken query and observation JSON.</code>
+            <label style="margin-top:14px">Step result</label>
+            <code id="result" class="pre" aria-live="polite">Submit a query to see reward, done, and info.</code>
+          </div>
+        </div>
+      </section>
+      <!-- Section 4: two inline training plots followed by direct links to the individual PNG files under /static/. -->
+      <section id="evidence" class="section" aria-labelledby="evidence-title">
+        <div class="section-head">
+          <p class="section-id">Evidence · Artifacts</p>
+          <h2 id="evidence-title">Training plots from real runs</h2>
+          <p>Regenerate with <code style="font-family:var(--font-mono);font-size:0.85em">presentation_graphs.py</code>; commit PNGs under <code style="font-family:var(--font-mono);font-size:0.85em">server/static/</code>.</p>
+        </div>
+        <div class="proof-grid">
+          <!-- Both figures are lazy-loaded; width/height give the browser the aspect ratio up front. -->
+          <figure class="proof-card">
+            <img src="/static/proof-combo.png" alt="Presentation combo chart from training run" width="1200" height="800" loading="lazy" decoding="async" />
+            <figcaption>Presentation combo — logged metrics.</figcaption>
+          </figure>
+          <figure class="proof-card">
+            <img src="/static/proof-distribution-shift.png" alt="Reward distribution shift" width="1200" height="800" loading="lazy" decoding="async" />
+            <figcaption>Per-sample reward shift (baseline vs trained).</figcaption>
+          </figure>
+        </div>
+        <!-- NOTE(review): training_reward_curve_final.png and training_diagnostics_dual_axis_final.png are not among
+             the PNGs added alongside this page; confirm both exist under server/static/ or these two links will 404. -->
+        <div class="link-list" style="margin-top:12px">
+          <a href="/static/training_reward_curve_final.png" target="_blank" rel="noopener">training_reward_curve_final.png</a>
+          <a href="/static/training_diagnostics_dual_axis_final.png" target="_blank" rel="noopener">training_diagnostics_dual_axis_final.png</a>
+          <a href="/static/baseline_vs_trained_by_task_final.png" target="_blank" rel="noopener">baseline_vs_trained_by_task_final.png</a>
+          <a href="/static/task_delta_post_minus_base_final.png" target="_blank" rel="noopener">task_delta_post_minus_base_final.png</a>
+          <a href="/static/reward_distribution_shift_red_green_final.png" target="_blank" rel="noopener">reward_distribution_shift_red_green_final.png</a>
+          <a href="/static/presentation_combo_final.png" target="_blank" rel="noopener">presentation_combo_final.png</a>
+          <a href="/static/benchmark_style_summary_final.png" target="_blank" rel="noopener">benchmark_style_summary_final.png</a>
+          <a href="/static/checkpoint_leaderboard_step_vs_reward_final.png" target="_blank" rel="noopener">checkpoint_leaderboard_step_vs_reward_final.png</a>
+          <a href="/static/cost_vs_performance_final.png" target="_blank" rel="noopener">cost_vs_performance_final.png</a>
+        </div>
+      </section>
+      <section id="repro" class="section">
+        <div class="grid cols-12">
+          <div class="play-card span-4">
+            <div class="section-head" style="margin-bottom:10px">
+              <p class="section-id">Reproduce</p>
+              <h2 style="font-family:var(--font-display);font-size:1.15rem;margin:0;font-weight:600">Runs &amp; assets</h2>
+            </div>
+            <div class="link-list">
+              <a href="https://colab.research.google.com/drive/1H6SLfCBhHzRJtnymLgevjfyytWUximF5#scrollTo=x5YuvatGyyu_" target="_blank" rel="noopener">Colab training notebook</a>
+              <a href="https://huggingface.co/spaces/md896/sql-debug-env/tree/main/artifacts/runs/20260426-060502-final-pass-32eval" target="_blank" rel="noopener">Eval artifacts (32-run)</a>
+              <a href="https://huggingface.co/md896/sql-debug-agent-qwen25-05b-grpo-wandb-continue-v2" target="_blank" rel="noopener">Model card</a>
+              <a href="/benchmark" target="_blank" rel="noopener">Benchmark JSON</a>
+              <a href="/health" target="_blank" rel="noopener">Health</a>
+            </div>
+          </div>
+          <div class="play-card span-8">
+            <div class="section-head" style="margin-bottom:10px">
+              <p class="section-id">Engineering Notes</p>
+              <h2 style="font-family:var(--font-display);font-size:1.15rem;margin:0;font-weight:600">Why I picked SQL debugging and why this architecture exists</h2>
+            </div>
+            <div class="blog-quote">
+              “The goal is not to generate beautiful SQL text. The goal is to produce SQL fixes that survive execution, repeatedly, under changing runtime conditions.”
+            </div>
+            <div class="blog-mini-grid">
+              <div class="blog-mini"><b>0.5B -> 7B</b>Model track from first bridge run to main baseline.</div>
+              <div class="blog-mini"><b>32-run eval</b>Final artifact path with sample rewards and run logs.</div>
+              <div class="blog-mini"><b>Execution-first</b>Reward is computed from runtime outcomes, not prompt resemblance.</div>
+            </div>
+            <p style="color:var(--muted);margin:0 0 12px;font-size:0.9375rem">
+              The motive for this project was not to build another text-to-SQL demo. The motive was reliability. SQL bugs are expensive because they fail late:
+              queries can look clean in review but break under real schema constraints, data skew, or join cardinality shifts. I picked this problem because it sits at the
+              boundary between language modeling and systems engineering. If the agent improves here, it is learning runtime correctness, not cosmetic fluency.
+            </p>
+            <p style="color:var(--muted);margin:0 0 12px;font-size:0.9375rem">
+              The architecture follows an OpenEnv-style contract:
+              <code>reset -&gt; observation</code> and <code>step(action) -&gt; observation, reward, done, info</code>.
+              Each episode runs on isolated in-memory SQLite state, deterministic task grading, and execution-grounded rewards. This pushes the model toward behaviors that survive runtime:
+              valid table references, stable aggregations, and join logic that does not collapse in edge cases.
+            </p>
+            <code class="pre">Conceptual reward:
+R_t = w_c*C_t + w_e*E_t + w_p*P_t + w_s*S_t - lambda*Penalty_t
+Objective:
+J(pi) = E_{tau ~ pi}[sum_{t=0..T} gamma^t * R_t]</code>
+            <p style="color:var(--muted);margin:0 0 12px;font-size:0.9375rem">
+              The technical design makes debugging measurable. Session state exposes observations, action history, and reward trajectories.
+              The reviewer-gated path adds risk control for unsafe submissions while preserving gradient signal (instead of hard-failing every risky step).
+              This gives the policy useful consequences: what failed, why it failed, and how far a candidate moved toward a valid fix.
+            </p>
+            <code class="pre">Data snapshot shown on this page:
+- Spider-style industry baseline: 48.2%
+- Qwen-7B base: 52.4%
+- RL agent headline: 78.5%
+- Performance leap view: 0.0% -> 25.0%
+- Hard evidence: 32-run eval + sample reward artifacts</code>
+            <p style="color:var(--muted);margin:12px 0 12px;font-size:0.9375rem">
+              Another deliberate choice is traceability. This page is an evidence chain: first the training context, then live interaction, then artifact-backed plots.
+              If a metric appears, it should map to concrete run folders, reward JSON files, and checkpoint lineage.
+            </p>
+            <p style="color:var(--muted);margin:0 0 12px;font-size:0.9375rem">
+              Industry and research point the same direction: robust text-to-SQL requires context quality, intent handling, dialect robustness, and execution safeguards.
+              Enterprise SQL debugging remains difficult when feedback is detached from runtime behavior. The objective here is to close that gap with a reproducible,
+              execution-grounded learning loop.
+            </p>
+            <div class="link-list" style="margin-top:12px">
+              <a href="https://cloud.google.com/blog/products/databases/techniques-for-improving-text-to-sql" target="_blank" rel="noopener">Google Cloud: techniques for improving text-to-SQL</a>
+              <a href="https://arxiv.org/abs/2601.18119" target="_blank" rel="noopener">OurBench / Squirrel: enterprise SQL debugging benchmark</a>
+              <a href="https://karpathy.github.io/2014/09/02/what-i-learned-from-competing-against-a-convnet-on-imagenet/" target="_blank" rel="noopener">Writing inspiration: Karpathy field-notes style</a>
+              <a href="#" target="_blank" rel="noopener">Final public blog URL (replace)</a>
+              <a href="#" target="_blank" rel="noopener">Slides (URL)</a>
+              <a href="#" target="_blank" rel="noopener">Demo video (URL)</a>
+            </div>
+          </div>
+        </div>
+      </section>
+    </main>
+    <footer class="space-footer">
+      <div class="space-footer-inner">
+        <span>Custom Space UI · FastAPI <code style="font-family:var(--font-mono);font-size:0.75em">/demo</code></span>
+        <span>
+          <a href="https://huggingface.co/docs/hub/spaces" target="_blank" rel="noopener">Spaces docs</a>
+          ·
+          <a href="https://huggingface.co/spaces/md896/sql-debug-env/tree/main" target="_blank" rel="noopener">Files &amp; versions</a>
+        </span>
+      </div>
+    </footer>
+  </div>
+  <script>
+    (function () {
+      var btn = document.getElementById("btnOpenTab");
+      if (btn) {
+        btn.addEventListener("click", function () {
+          try {
+            window.open(window.location.href, "_blank", "noopener,noreferrer");
+          } catch (e) {
+            window.location.href = window.location.href;
+          }
+        });
+      }
+    })();
+    const sessionId = "demo-session";
+    function setLoading(which, on) {
+      var el = document.getElementById(which);
+      if (!el) return;
+      el.disabled = on;
+      if (on && !el.dataset.label) el.dataset.label = el.textContent;
+      el.textContent = on ? "Please wait…" : (el.dataset.label || el.textContent);
+    }
+    async function resetTask() {
+      setLoading("btnReset", true);
+      try {
+        const taskId = document.getElementById("taskId").value;
+        const resp = await fetch("/reset", {
+          method: "POST",
+          headers: {
+            "Content-Type": "application/json",
+            "X-Session-Id": sessionId
+          },
+          body: JSON.stringify({ task_id: taskId })
+        });
+        const data = await resp.json();
+        document.getElementById("observation").textContent = JSON.stringify(data, null, 2);
+        const broken = data && data.observation && data.observation.original_query;
+        document.getElementById("query").value = broken || "";
+      } finally {
+        setLoading("btnReset", false);
+      }
+    }
+    async function submitQuery() {
+      setLoading("btnSubmit", true);
+      try {
+        const query = document.getElementById("query").value;
+        const payload = {
+          action: {
+            action_type: "submit_query",
+            query: query
+          }
+        };
+        const resp = await fetch("/step", {
+          method: "POST",
+          headers: {
+            "Content-Type": "application/json",
+            "X-Session-Id": sessionId
+          },
+          body: JSON.stringify(payload)
+        });
+        const data = await resp.json();
+        document.getElementById("result").textContent = JSON.stringify(data, null, 2);
+      } finally {
+        setLoading("btnSubmit", false);
+      }
+    }
+  </script>
+</body>
+</html>

server/gradio_ui.py ADDED Viewed

	@@ -0,0 +1,710 @@

+"""
+Single-page Gradio UI for the Hugging Face Space (same process as the OpenEnv FastAPI API).
+Playground uses POST /reset and POST /step via loopback HTTP with X-Session-Id.
+"""
+from __future__ import annotations
+import json
+import os
+import uuid
+from pathlib import Path
+from typing import Any, Optional, Tuple
+import httpx
+# External links interpolated into the page's Markdown sections.
+# The two Colab constants point at the same notebook with different cell anchors.
+COLAB_FIRST_TRAINING = (
+    "https://colab.research.google.com/drive/1H6SLfCBhHzRJtnymLgevjfyytWUximF5"
+    "#scrollTo=j-9MptXvmPk8"
+)
+COLAB_TRAINING_ROOT = (
+    "https://colab.research.google.com/drive/1H6SLfCBhHzRJtnymLgevjfyytWUximF5"
+    "#scrollTo=x5YuvatGyyu_"
+)
+# Hugging Face Space, run-artifact folders inside the Space repo, and the model repo.
+HF_SPACE = "https://huggingface.co/spaces/md896/sql-debug-env"
+HF_SAMPLE_REWARDS = (
+    "https://huggingface.co/spaces/md896/sql-debug-env/tree/main/"
+    "artifacts/runs/20260426-064318-sample-rewards-32eval"
+)
+HF_EVAL_32 = (
+    "https://huggingface.co/spaces/md896/sql-debug-env/tree/main/"
+    "artifacts/runs/20260426-060502-final-pass-32eval"
+)
+HF_MODEL = "https://huggingface.co/md896/sql-debug-agent-qwen25-05b-grpo-wandb-continue-v2"
+# Weights & Biases workspace for the training project.
+WANDB_TRAINING_RUN = "https://wandb.ai/mdayanbag-pesitm/sql-debug-grpo-best-budget/workspace?nw=nwusermdayanbag"
+# Background reading / writing-style references.
+GCLOUD_TEXT2SQL_BLOG = "https://cloud.google.com/blog/products/databases/techniques-for-improving-text-to-sql"
+OURBENCH_PAPER = "https://arxiv.org/abs/2601.18119"
+KARPATHY_STYLE_REFERENCE = "https://karpathy.github.io/2014/09/02/what-i-learned-from-competing-against-a-convnet-on-imagenet/"
+# Playground presets: task id -> ordered list of (display label, SQL text) pairs.
+# Labels feed the "Predefined test query" dropdown via _preset_options();
+# _preset_query() resolves a label back to its SQL; run_preset_suite() submits
+# every pair for the selected task.
+PREDEFINED_QUERIES: dict[str, list[tuple[str, str]]] = {
+    "easy_syntax_fix": [
+        ("Broken baseline: typo table", "SELECT * FROM userss;"),
+        ("Simple lookup", "SELECT id, name FROM users ORDER BY id LIMIT 10;"),
+        ("Potential invalid write", "UPDATE users SET name='test';"),
+    ],
+    "medium_logic_fix": [
+        ("Broken: missing GROUP BY", "SELECT department, COUNT(*) FROM employees;"),
+        ("Revenue by month", "SELECT strftime('%Y-%m', order_date) AS ym, SUM(amount) FROM orders GROUP BY ym ORDER BY ym;"),
+        ("Top entities", "SELECT customer_id, SUM(total) AS spend FROM invoices GROUP BY customer_id ORDER BY spend DESC LIMIT 5;"),
+    ],
+    "hard_multi_bug": [
+        ("Broken join alias", "SELECT u.name, o.total FROM users u JOIN orders o ON user.id = o.user_id;"),
+        ("Join + aggregate", "SELECT p.category, AVG(p.price) AS avg_price FROM products p GROUP BY p.category ORDER BY avg_price DESC;"),
+        ("Nested query", "SELECT name FROM customers WHERE id IN (SELECT customer_id FROM orders GROUP BY customer_id HAVING COUNT(*) > 2);"),
+    ],
+    "hard_finance_explosion": [
+        ("Broken finance calc", "SELECT account_id, SUM(amount) / COUNT(*) AS risk FROM txn GROUP BY account;"),
+        ("PnL-style aggregate", "SELECT symbol, SUM(CASE WHEN side='BUY' THEN -notional ELSE notional END) AS pnl FROM trades GROUP BY symbol ORDER BY pnl DESC;"),
+        ("Daily exposure", "SELECT date(trade_ts) AS d, SUM(abs(notional)) AS exposure FROM trades GROUP BY d ORDER BY d;"),
+    ],
+}
+GRADIO_CSS = """
+:root {
+  --sde-ink: #0f172a;
+  --sde-muted: #64748b;
+  --sde-line: #e2e8f0;
+  --sde-card: #ffffff;
+  --sde-glow:
+    radial-gradient(120% 140% at 0% 0%, rgba(45, 212, 191, 0.22) 0%, rgba(45, 212, 191, 0) 52%),
+    radial-gradient(120% 140% at 100% 0%, rgba(147, 197, 253, 0.22) 0%, rgba(147, 197, 253, 0) 55%),
+    linear-gradient(132deg, #0f172a 0%, #1e293b 45%, #0f766e 100%);
+}
+.gradio-container { max-width: 1180px !important; margin-left: auto !important; margin-right: auto !important; }
+.sde-hero-wrap {
+  background: var(--sde-glow);
+  color: #f8fafc;
+  border-radius: 20px;
+  padding: 1.75rem 1.5rem 1.5rem;
+  margin-bottom: 1.25rem;
+  border: 1px solid rgba(148, 163, 184, 0.24);
+  box-shadow: 0 18px 40px rgba(15, 23, 42, 0.20), inset 0 1px 0 rgba(255, 255, 255, 0.12);
+}
+.sde-hero-wrap h1 { margin: 0 0 0.35rem 0; font-size: 1.85rem; letter-spacing: -0.03em; }
+.sde-hero-wrap p { margin: 0; color: #e2e8f0; font-size: 0.95rem; line-height: 1.5; }
+.sde-pill-row { display: flex; flex-wrap: wrap; gap: 0.5rem; margin-top: 1rem; }
+.sde-pill {
+  display: inline-block;
+  padding: 0.35rem 0.75rem;
+  border-radius: 999px;
+  font-size: 0.72rem;
+  font-weight: 700;
+  letter-spacing: 0.06em;
+  text-transform: uppercase;
+  background: rgba(15, 23, 42, 0.26);
+  border: 1px solid rgba(226, 232, 240, 0.34);
+  color: #f8fafc;
+}
+.sde-section-title {
+  font-size: 1.05rem;
+  font-weight: 700;
+  color: var(--sde-ink);
+  margin: 1.5rem 0 0.75rem 0;
+  letter-spacing: -0.02em;
+}
+.sde-link-row a {
+  color: #2563eb !important;
+  font-weight: 600;
+  margin-right: 1rem;
+}
+.sde-kpi-grid {
+  display: grid;
+  grid-template-columns: repeat(4, minmax(0, 1fr));
+  gap: 0.75rem;
+  margin: 0.5rem 0 1rem;
+}
+.sde-kpi {
+  background: #ffffff;
+  border: 1px solid #dbe3f0;
+  border-radius: 14px;
+  padding: 0.85rem 0.95rem;
+  box-shadow: 0 10px 24px rgba(15, 23, 42, 0.06);
+}
+.sde-kpi .v {
+  font-size: 1.25rem;
+  font-weight: 800;
+  letter-spacing: -0.02em;
+  color: #0f172a;
+}
+.sde-kpi .k {
+  margin-top: 0.15rem;
+  font-size: 0.73rem;
+  text-transform: uppercase;
+  letter-spacing: 0.06em;
+  color: #64748b;
+}
+.sde-callout {
+  border-left: 4px solid #2563eb;
+  background: #eff6ff;
+  color: #1e3a8a;
+  padding: 0.7rem 0.8rem;
+  border-radius: 8px;
+  margin: 0.5rem 0 0.75rem;
+  font-size: 0.86rem;
+}
+@media (max-width: 900px) {
+  .sde-kpi-grid { grid-template-columns: repeat(2, minmax(0, 1fr)); }
+}
+"""
+def _api_base() -> str:
+    return os.environ.get(
+        "INTERNAL_API_BASE",
+        f"http://127.0.0.1:{os.environ.get('PORT', '7860')}",
+    ).rstrip("/")
+def _blog_url() -> str:
+    return (os.environ.get("BLOG_URL") or "").strip()
+def _http() -> httpx.Client:
+    return httpx.Client(timeout=120.0)
+def _img_path(static_dir: Path, *names: str) -> Optional[str]:
+    for n in names:
+        p = static_dir / n
+        if p.is_file():
+            return str(p.resolve())
+    return None
+def _preset_options(task_id: str) -> list[str]:
+    return [name for name, _ in PREDEFINED_QUERIES.get(task_id, [])]
+def _preset_query(task_id: str, preset_name: str) -> str:
+    for name, query in PREDEFINED_QUERIES.get(task_id, []):
+        if name == preset_name:
+            return query
+    return ""
+def _safe_reward(value: Any) -> float:
+    try:
+        return float(value)
+    except Exception:
+        return 0.0
+def build_blocks(static_dir: Path) -> Any:
+    import gradio as gr
+    wf = _img_path(static_dir, "environment-workflow.png")
+    chart_leap = _img_path(static_dir, "chart-performance-leap.png", "hero_performance_leap.png")
+    chart_dual = _img_path(static_dir, "chart-comparison-shift.png", "hero_dual_benchmark.png")
+    chart_spider = _img_path(static_dir, "chart-spider-benchmark.png", "hero_spider_sota.png")
+    proof_combo = _img_path(static_dir, "proof-combo.png")
+    proof_dist = _img_path(static_dir, "proof-distribution-shift.png")
+    final_gallery_paths = [
+        "training_reward_curve_final.png",
+        "training_diagnostics_dual_axis_final.png",
+        "baseline_vs_trained_by_task_final.png",
+        "task_delta_post_minus_base_final.png",
+        "reward_distribution_shift_red_green_final.png",
+        "presentation_combo_final.png",
+        "benchmark_style_summary_final.png",
+        "checkpoint_leaderboard_step_vs_reward_final.png",
+        "cost_vs_performance_final.png",
+    ]
+    final_gallery: list[tuple[str, str]] = []
+    for filename in final_gallery_paths:
+        path = _img_path(static_dir, filename)
+        if path:
+            title = filename.replace("_final.png", "").replace("_", " ").title()
+            final_gallery.append((path, title))
+    blog = _blog_url()
+    blog_md = (
+        f"### Blog\n[Read the write-up]({blog})"
+        if blog
+        else "### Blog\nAdd a **Space secret** named `BLOG_URL` with your post URL (e.g. Medium, personal site, or Hugging Face blog)."
+    )
+    task_choices = [
+        "easy_syntax_fix",
+        "medium_logic_fix",
+        "hard_multi_bug",
+        "hard_finance_explosion",
+    ]
+    def reset_fn(
+        task_id: str, session_id: Optional[str]
+    ) -> Tuple[str, str, str, str]:
+        sid = session_id or str(uuid.uuid4())
+        try:
+            with _http() as client:
+                r = client.post(
+                    f"{_api_base()}/reset",
+                    json={"task_id": task_id},
+                    headers={"X-Session-Id": sid},
+                )
+                r.raise_for_status()
+                data = r.json()
+        except Exception as e:
+            err = {"error": str(e), "hint": "Is the server listening on PORT?"}
+            return json.dumps(err, indent=2), "", sid, f"Session: `{sid}` · **error**"
+        obs = json.dumps(data, indent=2)
+        q = (data.get("observation") or {}).get("original_query") or ""
+        return obs, q, sid, f"Session: `{sid}`"
+    def submit_fn(
+        query: str, session_id: Optional[str]
+    ) -> Tuple[str, str]:
+        if not session_id:
+            return (
+                json.dumps({"error": "Click “Reset task” first to create a session."}, indent=2),
+                "",
+            )
+        payload = {"action": {"action_type": "submit_query", "query": query or ""}}
+        try:
+            with _http() as client:
+                r = client.post(
+                    f"{_api_base()}/step",
+                    json=payload,
+                    headers={"X-Session-Id": session_id},
+                )
+                r.raise_for_status()
+                data = r.json()
+        except httpx.HTTPStatusError as e:
+            try:
+                detail = e.response.json()
+            except Exception:
+                detail = e.response.text
+            return json.dumps({"error": str(e), "detail": detail}, indent=2), ""
+        except Exception as e:
+            return json.dumps({"error": str(e)}, indent=2), ""
+        out = json.dumps(data, indent=2)
+        reward = data.get("reward")
+        done = data.get("done")
+        return out, f"**reward** `{reward}` · **done** `{done}`"
+    def run_preset_suite(
+        task_id: str, session_id: Optional[str]
+    ) -> Tuple[str, str, str, str]:
+        sid = session_id or str(uuid.uuid4())
+        presets = PREDEFINED_QUERIES.get(task_id, [])
+        if not presets:
+            return "No presets for selected task.", "{}", sid, f"Session: `{sid}`"
+        rows: list[str] = []
+        rewards: list[float] = []
+        done_count = 0
+        error_count = 0
+        with _http() as client:
+            for idx, (name, query) in enumerate(presets, start=1):
+                try:
+                    client.post(
+                        f"{_api_base()}/reset",
+                        json={"task_id": task_id},
+                        headers={"X-Session-Id": sid},
+                    ).raise_for_status()
+                    step_resp = client.post(
+                        f"{_api_base()}/step",
+                        json={"action": {"action_type": "submit_query", "query": query}},
+                        headers={"X-Session-Id": sid},
+                    )
+                    step_resp.raise_for_status()
+                    data = step_resp.json()
+                    reward = _safe_reward(data.get("reward"))
+                    done = bool(data.get("done"))
+                    info = data.get("info") or {}
+                    label = "pass" if reward >= 0.5 else "check"
+                    rewards.append(reward)
+                    done_count += int(done)
+                    note = "review_rejected" if info.get("review_rejected") else ""
+                    rows.append(
+                        f"| {idx} | {name} | `{reward:.3f}` | `{done}` | {label} {note} |"
+                    )
+                except Exception as e:
+                    error_count += 1
+                    rows.append(
+                        f"| {idx} | {name} | `0.000` | `False` | error: {str(e)[:120]} |"
+                    )
+        avg_reward = (sum(rewards) / len(rewards)) if rewards else 0.0
+        max_reward = max(rewards) if rewards else 0.0
+        min_reward = min(rewards) if rewards else 0.0
+        suite_md = (
+            "#### Preset suite report\n"
+            "| # | Preset | Reward | Done | Note |\n"
+            "|---|---|---:|:---:|---|\n"
+            + "\n".join(rows)
+            + "\n\n"
+            + f"**Summary:** avg reward `{avg_reward:.3f}` · min `{min_reward:.3f}` · max `{max_reward:.3f}` · "
+              f"done count `{done_count}` · errors `{error_count}`"
+        )
+        suite_json = json.dumps(
+            {
+                "task_id": task_id,
+                "session_id": sid,
+                "n_presets": len(presets),
+                "avg_reward": round(avg_reward, 4),
+                "min_reward": round(min_reward, 4),
+                "max_reward": round(max_reward, 4),
+                "done_count": done_count,
+                "error_count": error_count,
+            },
+            indent=2,
+        )
+        return suite_md, suite_json, sid, f"Session: `{sid}`"
+    with gr.Blocks(
+        title="SQL Debug Environment",
+        analytics_enabled=False,
+    ) as demo:
+        gr.HTML(
+            """
+<div class="sde-hero-wrap">
+  <h1>SQL Debug Environment</h1>
+  <p>OpenEnv-compliant SQL repair · live SQLite rewards · TRL / GRPO training on this same Space.
+     One page: benchmarks, artifacts, architecture, and a live playground.</p>
+  <div class="sde-pill-row">
+    <span class="sde-pill">OpenEnv</span>
+    <span class="sde-pill">FastAPI</span>
+    <span class="sde-pill">Gradio</span>
+    <span class="sde-pill">TRL · GRPO</span>
+  </div>
+</div>
+            """.strip()
+        )
+        gr.Markdown(
+            "### First context: training proof first\n"
+            f"- **Field-notes writing style reference:** [Karpathy post]({KARPATHY_STYLE_REFERENCE})\n"
+            f"- **First training notebook (auto-install cell):** [Open in Colab]({COLAB_FIRST_TRAINING})\n"
+            f"- **Full training Colab (root anchor):** [Open in Colab]({COLAB_TRAINING_ROOT})\n"
+            f"- **Weights & Biases (project workspace):** [Open dashboard]({WANDB_TRAINING_RUN})\n"
+            f"- **Sample-reward eval artifacts (32-run JSON on Hub):** [Browse files]({HF_SAMPLE_REWARDS})\n"
+            f"- **Earlier 32-eval pass folder:** [Browse files]({HF_EVAL_32})\n"
+            f"- **Trained model card:** [md896/sql-debug-agent…]({HF_MODEL})\n"
+            f"- **This Space:** [{HF_SPACE}]({HF_SPACE})"
+        )
+        gr.HTML(
+            """
+<div class="sde-kpi-grid">
+  <div class="sde-kpi"><div class="v">0.5B → 7B</div><div class="k">Model progression</div></div>
+  <div class="sde-kpi"><div class="v">32-run eval</div><div class="k">Final artifact pass</div></div>
+  <div class="sde-kpi"><div class="v">78.5%</div><div class="k">Spider-style headline</div></div>
+  <div class="sde-kpi"><div class="v">Execution reward</div><div class="k">Primary training signal</div></div>
+</div>
+            """.strip()
+        )
+        gr.HTML(
+            '<div class="sde-callout"><strong>Notebook vibe:</strong> this page is intentionally written as field notes + reproducible cells, not a static deck. Every number should map to an artifact.</div>'
+        )
+        gr.Code(
+            label="First training context cell (from your Colab)",
+            language="python",
+            interactive=False,
+            value=(
+                "# 🏆 SQL Debug Env: FINAL REAL-WORLD BRIDGE\n"
+                "import os\n"
+                "print('📦 Checking libraries...')\n"
+                "os.system('pip install trl accelerate wandb -U')\n\n"
+                "import httpx\n"
+                "import torch\n"
+            ),
+            lines=8,
+        )
+        gr.Markdown(
+            "### Lab notebook stats (TL;DR)\n"
+            "- First training pass started with **Qwen/Qwen2.5-Coder-0.5B-Instruct** for environment wiring and fast iteration.\n"
+            "- Main training/eval track used **Qwen/Qwen2.5-Coder-7B-Instruct** with execution-grounded reward loops.\n"
+            "- Final reporting is tied to run artifacts and static charts committed under `server/static/`."
+        )
+        gr.Markdown(
+            "| Track | Model | Role | Evidence |\n"
+            "|---|---|---|---|\n"
+            "| First bridge run | Qwen/Qwen2.5-Coder-0.5B-Instruct | Fast validation of API/reward loop and notebook flow | First training context + W&B run |\n"
+            "| Base reference | Qwen/Qwen2.5-Coder-7B-Instruct | Baseline behavior before RL updates | Spider/comparison charts |\n"
+            "| Current agent | RL-updated checkpoint on 7B track | Improved execution-grounded SQL fixing | HF model + eval artifacts + sample rewards |\n"
+        )
+        gr.Markdown(
+            "### Run timeline (quick history)\n"
+            "| Stage | What happened | Why it matters |\n"
+            "|---|---|---|\n"
+            "| Bridge run | Fast setup with **Qwen2.5-Coder-0.5B** | Validated API + reward wiring quickly |\n"
+            "| Main baseline | Moved to **Qwen2.5-Coder-7B-Instruct** | Better capacity for SQL structure + joins |\n"
+            "| RL iterations | Session-consistent reset/step reward loop | Converted text quality into runtime behavior |\n"
+            "| Hard-test reporting | `presentation_graphs_out_final` committed under `server/static/` | Keeps evaluation auditable on-page |\n"
+        )
+        gr.Code(
+            label="Notebook cell: baseline evaluator sketch (7B track)",
+            language="python",
+            interactive=False,
+            value=(
+                "from transformers import AutoTokenizer, AutoModelForCausalLM\n"
+                "MODEL = 'Qwen/Qwen2.5-Coder-7B-Instruct'\n"
+                "tokenizer = AutoTokenizer.from_pretrained(MODEL)\n"
+                "model = AutoModelForCausalLM.from_pretrained(MODEL, device_map='auto')\n"
+                "# generate SQL -> POST /reset -> POST /step -> score by execution reward\n"
+            ),
+            lines=7,
+        )
+        gr.Code(
+            label="Notebook cell: regenerate presentation plots from real artifacts",
+            language="shell",
+            interactive=False,
+            value=(
+                "python presentation_graphs.py \\\n"
+                "  --sample-rewards-json artifacts/runs/20260426-064318-sample-rewards-32eval/sample_rewards_final.json \\\n"
+                "  --output-dir presentation_graphs_out_final\n"
+                "cp presentation_graphs_out_final/*.png server/static/\n"
+            ),
+            lines=5,
+        )
+        gr.Code(
+            label="Notebook cell: live reward loop (execution-grounded)",
+            language="python",
+            interactive=False,
+            value=(
+                "with httpx.Client(base_url=ENV_URL, timeout=30.0) as client:\n"
+                "    client.post('/reset', json={'task_id': task}, headers={'X-Session-Id': sid})\n"
+                "    resp = client.post('/step', json={'action': {'action_type': 'submit_query', 'query': sql}},\n"
+                "                       headers={'X-Session-Id': sid})\n"
+                "    reward = resp.json().get('reward', 0.0)\n"
+                "    # reward drives policy updates and eval comparisons\n"
+            ),
+            lines=7,
+        )
+        gr.Markdown(
+            "### Failure taxonomy (from runtime debugging)\n"
+            "| Failure type | Typical symptom | Why execution feedback helps |\n"
+            "|---|---|---|\n"
+            "| Schema mismatch | unknown table/column | reward drops immediately and error details guide correction |\n"
+            "| Join logic bug | duplicated or missing rows | execution reveals semantic mismatch not visible in text quality |\n"
+            "| Aggregation bug | incorrect GROUP BY totals | deterministic graders expose numerical drift |\n"
+            "| Risky query behavior | unsafe or invalid action | reviewer path blocks while preserving learning signal |\n"
+        )
+        gr.Markdown('<p class="sde-section-title">Benchmark visuals</p>')
+        gr.Markdown(
+            "| Metric snapshot | Value |\n"
+            "|---|---|\n"
+            "| Spider chart: Industry baseline | **48.2%** |\n"
+            "| Spider chart: Qwen-7B base | **52.4%** |\n"
+            "| Spider chart: RL agent | **78.5%** |\n"
+            "| Performance leap chart | **0.0% -> 25.0%** (base to RL in that run view) |\n"
+        )
+        with gr.Row(equal_height=True):
+            if chart_leap:
+                gr.Image(value=chart_leap, label="Performance leap (Spider-style)", type="filepath", scale=1)
+            if chart_dual:
+                gr.Image(value=chart_dual, label="Comparison + reward shift", type="filepath", scale=2)
+            if chart_spider:
+                gr.Image(value=chart_spider, label="Spider-style headline chart", type="filepath", scale=1)
+        gr.Markdown(
+            '<p class="sde-section-title">Training run charts (repo static)</p>'
+            "<span style='color:#64748b;font-size:0.9rem'>Training plots from real runs. Regenerate with `presentation_graphs.py`; commit PNGs under `server/static/`.</span>"
+        )
+        with gr.Row():
+            if proof_combo:
+                gr.Image(value=proof_combo, label="Presentation combo", type="filepath", scale=1)
+            if proof_dist:
+                gr.Image(value=proof_dist, label="Reward distribution shift", type="filepath", scale=1)
+        if final_gallery:
+            gr.Markdown(
+                '<p class="sde-section-title">Hard-testing proof set (presentation_graphs_out_final)</p>'
+                "<span style='color:#64748b;font-size:0.9rem'>All generated graphs from the final evaluation set.</span>"
+            )
+            gr.Gallery(
+                value=final_gallery,
+                label="Final hard-testing charts",
+                preview=True,
+                columns=3,
+                height="auto",
+                object_fit="contain",
+            )
+        gr.Markdown('<p class="sde-section-title">Environment architecture</p>')
+        if wf:
+            gr.Image(value=wf, label="End-to-end workflow", type="filepath", show_label=True)
+        else:
+            gr.Markdown("*Add `server/static/environment-workflow.png`*")
+        gr.Markdown(
+            '<p class="sde-section-title">OpenEnv HTTP API</p>'
+            f"`GET /health` · `GET /tasks` · `POST /reset` · `POST /step` · `POST /step_with_review` · `GET /state` · `GET /benchmark` · "
+            f"loopback base `{_api_base()}` (override with **INTERNAL_API_BASE**)."
+        )
+        gr.Markdown('<p class="sde-section-title">Live playground</p>')
+        session = gr.State(None)
+        session_md = gr.Markdown("Session: *click “Reset task”*")
+        with gr.Row():
+            task = gr.Dropdown(
+                choices=task_choices,
+                value="easy_syntax_fix",
+                label="Task",
+                scale=1,
+            )
+            btn_reset = gr.Button("Reset task", variant="primary", scale=0, min_width=140)
+            btn_submit = gr.Button("Submit query", variant="secondary", scale=0, min_width=140)
+            btn_run_suite = gr.Button("Run preset suite", variant="secondary", scale=0, min_width=160)
+        preset_name = gr.Dropdown(
+            choices=_preset_options("easy_syntax_fix"),
+            value=_preset_options("easy_syntax_fix")[0],
+            label="Predefined test query",
+        )
+        btn_load_preset = gr.Button("Load predefined query", variant="secondary")
+        sql = gr.Code(label="Candidate SQL", language="sql", lines=12)
+        result_hint = gr.Markdown("")
+        with gr.Row():
+            obs_json = gr.Code(
+                language="json",
+                label="Observation (/reset)",
+                lines=12,
+                interactive=False,
+                scale=1,
+            )
+            step_json = gr.Code(
+                language="json",
+                label="Step (/step)",
+                lines=12,
+                interactive=False,
+                scale=1,
+            )
+        suite_md = gr.Markdown("")
+        suite_json = gr.Code(
+            label="Preset suite summary",
+            language="json",
+            lines=10,
+            interactive=False,
+        )
+        btn_reset.click(
+            reset_fn,
+            inputs=[task, session],
+            outputs=[obs_json, sql, session, session_md],
+        )
+        btn_submit.click(
+            submit_fn,
+            inputs=[sql, session],
+            outputs=[step_json, result_hint],
+        )
+        task.change(
+            lambda t: gr.Dropdown(
+                choices=_preset_options(t),
+                value=_preset_options(t)[0] if _preset_options(t) else None,
+            ),
+            inputs=[task],
+            outputs=[preset_name],
+        )
+        btn_load_preset.click(
+            lambda t, p: _preset_query(t, p or ""),
+            inputs=[task, preset_name],
+            outputs=[sql],
+        )
+        btn_run_suite.click(
+            run_preset_suite,
+            inputs=[task, session],
+            outputs=[suite_md, suite_json, session, session_md],
+        )
+        gr.Markdown('<p class="sde-section-title">Blog</p>')
+        gr.Markdown(blog_md)
+        gr.Markdown(
+            "### Why I picked SQL debugging and why this architecture exists\n"
+            "“The goal is not to generate beautiful SQL text. The goal is to produce SQL fixes that survive execution, repeatedly, under changing runtime conditions.”\n\n"
+            "SQL debugging is one of the few tasks where language quality and system quality can diverge sharply. A query can be grammatically neat, semantically plausible, and still fail in production. "
+            "I chose this problem because it forces an agent to optimize for *behavior under execution*, not only style under prompting."
+        )
+        gr.HTML(
+            """
+<div class="sde-kpi-grid">
+  <div class="sde-kpi"><div class="v">0.5B -> 7B</div><div class="k">Model track from first bridge run to main baseline.</div></div>
+  <div class="sde-kpi"><div class="v">32-run eval</div><div class="k">Final artifact path with sample rewards and run logs.</div></div>
+  <div class="sde-kpi"><div class="v">Execution-first</div><div class="k">Reward is computed from runtime outcomes, not prompt resemblance.</div></div>
+  <div class="sde-kpi"><div class="v">Traceable claims</div><div class="k">Metrics should map back to run files and checkpoints.</div></div>
+</div>
+            """.strip()
+        )
+        gr.Markdown(
+            "#### OpenEnv framing (why this is not just a demo UI)\n"
+            "The environment follows an OpenEnv-style interface: `reset -> observation`, `step(action) -> observation, reward, done, info`. "
+            "This is important because it gives the training loop a stable contract. Every algorithmic change can be tested against the same API semantics, which improves reproducibility.\n\n"
+            "#### Reward math (what is actually optimized)\n"
+            "At a high level, each step reward is composed from executed outcomes:\n\n"
+            "\\[\n"
+            "R_t = w_c C_t + w_e E_t + w_p P_t + w_s S_t - \\lambda \\cdot \\text{Penalty}_t\n"
+            "\\]\n\n"
+            "- \\(C_t\\): correctness signal (did query satisfy the task objective)\n"
+            "- \\(E_t\\): execution quality (valid execution / error handling)\n"
+            "- \\(P_t\\): progress toward a valid fix\n"
+            "- \\(S_t\\): schema-aware behavior bonus\n"
+            "- Penalty: unsafe / invalid / degenerate behavior\n\n"
+            "Episode objective:\n\n"
+            "\\[\n"
+            "J(\\pi) = \\mathbb{E}_{\\tau \\sim \\pi}\\left[\\sum_{t=0}^{T} \\gamma^t R_t\\right]\n"
+            "\\]\n\n"
+            "This makes the optimization target explicit: not token similarity, but expected runtime return.\n\n"
+            "#### Architecture decisions that matter technically\n"
+            "1. **Session-isolated database state**: each episode gets a clean in-memory SQLite environment.\n"
+            "2. **Deterministic tasks/graders**: stable reward surfaces for comparison across runs.\n"
+            "3. **Reviewer-guard path**: risk control without collapsing the learning signal.\n"
+            "4. **Typed observations + action history**: easier debugging and post-hoc analysis.\n\n"
+            "#### Data and reporting stats on this page\n"
+            "| Metric | Value | Source |\n"
+            "|---|---:|---|\n"
+            "| Spider-style industry baseline | 48.2% | chart-spider-benchmark |\n"
+            "| Qwen-7B base | 52.4% | chart-spider-benchmark |\n"
+            "| RL agent headline | 78.5% | chart-spider-benchmark |\n"
+            "| Performance leap view | 0.0% -> 25.0% | chart-performance-leap |\n"
+            "| Eval artifact pass | 32-run | HF run folder + sample rewards |\n\n"
+            "#### Why start with 0.5B then move to 7B\n"
+            "The first bridge run on **Qwen2.5-Coder-0.5B** is intentionally about speed of iteration: verify environment wiring, reward path, and notebook workflow quickly. "
+            "The **7B track** is then used for stronger SQL reasoning capacity and better convergence under execution-grounded rewards.\n\n"
+            "#### Motivation recap\n"
+            "I did not build this to prove that a model can emit valid-looking SQL. I built it to make SQL repair measurable as an engineering problem under runtime constraints. "
+            "The evidence-first layout (first context, live loop, artifact chain) is deliberate: each reported number should be traceable to run data, not presentation-only visuals."
+        )
+        gr.Markdown(
+            f"- [Google Cloud: techniques for improving text-to-SQL]({GCLOUD_TEXT2SQL_BLOG})\n"
+            f"- [OurBench / Squirrel: enterprise SQL debugging benchmark]({OURBENCH_PAPER})\n"
+            f"- [Writing inspiration: Karpathy field-notes style]({KARPATHY_STYLE_REFERENCE})\n"
+            "- [Final public blog URL (replace)](#)\n"
+            "- [Slides (URL)](#)\n"
+            "- [Demo video (URL)](#)"
+        )
+    return demo
+def mount_gradio(app: Any, static_dir: Path) -> Any:
+    """Mount single-page Gradio at `/` (Space home) while API routes stay on the same app."""
+    import gradio as gr
+    font = gr.themes.GoogleFont("Plus Jakarta Sans")
+    mono = gr.themes.GoogleFont("JetBrains Mono")
+    theme = gr.themes.Soft(
+        primary_hue="indigo",
+        secondary_hue="slate",
+        neutral_hue="slate",
+        font=(font, "ui-sans-serif", "system-ui"),
+        font_mono=(mono, "ui-monospace", "monospace"),
+    )
+    blocks = build_blocks(static_dir)
+    return gr.mount_gradio_app(
+        app,
+        blocks,
+        path="/gradio",
+        theme=theme,
+        css=GRADIO_CSS,
+        allowed_paths=[str(static_dir.resolve())],
+    )

server/main.py CHANGED Viewed

@@ -8,10 +8,13 @@ import time
 import statistics
 from typing import Dict, Optional, List, Any
 from contextlib import asynccontextmanager
 import sqlite3
 from fastapi import FastAPI, HTTPException, Header, Body
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from .models import SQLDebugAction, SQLDebugObservation, EpisodeState
@@ -47,13 +50,28 @@ app.add_middleware(
     allow_headers=["*"],
 )
 @app.get("/")
-async def root():
     return {
         "name": "sql-debug-env",
         "status": "ok",
         "message": "Use /health, /tasks, /reset, /step, /state, /benchmark",
     }
@@ -62,6 +80,31 @@ async def favicon():
     return None
 class ResetRequest(BaseModel):
     task_id: Optional[str] = "easy_syntax_fix"
@@ -341,3 +384,9 @@ async def state(x_session_id: Optional[str] = Header(default=None)):
         return current_state.model_dump()
     except RuntimeError as e:
         raise HTTPException(status_code=400, detail=str(e))

 import statistics
 from typing import Dict, Optional, List, Any
 from contextlib import asynccontextmanager
+from pathlib import Path
 import sqlite3
 from fastapi import FastAPI, HTTPException, Header, Body
+from fastapi.responses import HTMLResponse, RedirectResponse
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 from .models import SQLDebugAction, SQLDebugObservation, EpisodeState
     allow_headers=["*"],
 )
+_static_dir = Path(__file__).resolve().parent / "static"
+if _static_dir.is_dir():
+    app.mount("/static", StaticFiles(directory=str(_static_dir)), name="static")
 @app.get("/")
+async def space_home():
+    """Hugging Face Space opens here — send humans to the Gradio dashboard."""
+    return RedirectResponse(url="/gradio/", status_code=302)
+@app.get("/api/info")
+async def api_info():
+    """Return a machine-readable index of the service as a plain dict.
+    Intended for JSON clients that used to hit `/`. The payload carries the
+    service name, a status flag, a hint listing the API routes, and the paths
+    of the demo page, the Gradio UI and this endpoint itself.
+    """
     return {
         "name": "sql-debug-env",
         "status": "ok",
         "message": "Use /health, /tasks, /reset, /step, /state, /benchmark",
+        "demo": "/demo",
+        "demo_page": "/server/demo_page.html",
+        "gradio": "/gradio",
+        "info": "/api/info",
     }
     return None
+# Absolute path of demo_page.html, expected in the same directory as this module.
+_DEMO_PAGE_PATH = Path(__file__).resolve().parent / "demo_page.html"
+def _read_demo_page_html() -> str:
+    """Load the Space demo HTML from disk (next to this module)."""
+    if not _DEMO_PAGE_PATH.is_file():
+        return (
+            "<!doctype html><html><body style='font-family:sans-serif;padding:2rem'>"
+            "<p><strong>demo_page.html</strong> is missing next to <code>main.py</code>.</p></body></html>"
+        )
+    return _DEMO_PAGE_PATH.read_text(encoding="utf-8")
+@app.get("/demo", response_class=HTMLResponse)
+async def demo_page():
+    """Submission-ready demo + proof page."""
+    return _read_demo_page_html()
+@app.get("/server/demo_page.html", response_class=HTMLResponse)
+async def demo_page_repo_path():
+    """Same page as /demo — URL matches the repo path for HF Space links and bookmarks."""
+    return _read_demo_page_html()
 class ResetRequest(BaseModel):
     task_id: Optional[str] = "easy_syntax_fix"
         return current_state.model_dump()
     except RuntimeError as e:
         raise HTTPException(status_code=400, detail=str(e))
+# Gradio UI on the same Space (mounted after all API routes).
+# `app` is rebound to whatever mount_gradio returns.
+# NOTE(review): the late import looks deliberate (the routes are declared above
+# this point) — confirm before moving it to the top of the module.
+from .gradio_ui import mount_gradio
+app = mount_gradio(app, _static_dir)