<!--
sql-debug-env / server /demo_page.html
md896's picture
Use .git GitHub URL in HTML and Gradio links.
72d394b
-->
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1, viewport-fit=cover" />
<meta name="color-scheme" content="light" />
<meta name="theme-color" content="#f6f7fb" />
<meta name="description" content="SQL Debug OpenEnv: architecture, live /reset and /step playground, and training evidence. Hugging Face Space." />
<meta property="og:title" content="SQL Debug Environment — Space Demo" />
<meta property="og:description" content="OpenEnv-compliant SQL debugging environment with live rewards, GRPO training hooks, and reproducible artifacts." />
<meta property="og:type" content="website" />
<title>SQL Debug Environment · Hugging Face Space</title>
<link rel="preconnect" href="https://fonts.googleapis.com" />
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
<link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,400;0,9..40,500;0,9..40,600;0,9..40,700;1,9..40,500&family=Fraunces:ital,opsz,wght@0,9..144,500;0,9..144,600;0,9..144,700;1,9..144,500&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet" />
<style>
/* Design tokens shared by every rule in this stylesheet. */
:root {
  /* Page surfaces and hairlines */
  --space-bg: #f0f2f6;
  --space-bg-elevated: #fafbfe;
  --space-border: #e2e6ef;
  --space-border-strong: #cdd5e5;
  --card: #ffffff;

  /* Text colors, darkest to lightest */
  --ink: #111827;
  --ink-soft: #374151;
  --muted: #6b7280;
  --muted-light: #9ca3af;

  /* Brand and accent colors with their soft background tints */
  --hf-amber: #f59e0b;
  --hf-amber-soft: #fff7ed;
  --accent: #2563eb;
  --accent-soft: #eff6ff;

  /* Dark frame used behind the architecture diagram */
  --diagram-bg: #0c1222;
  --diagram-border: #1e293b;

  /* Elevation: resting and hover shadows for cards and panels */
  --card-shadow: 0 1px 2px rgba(16, 24, 40, 0.04), 0 8px 28px rgba(16, 24, 40, 0.06);
  --card-shadow-hover: 0 1px 2px rgba(16, 24, 40, 0.06), 0 12px 36px rgba(16, 24, 40, 0.08);

  /* Corner radii */
  --radius: 14px;
  --radius-lg: 20px;

  /* Font stacks: body, display headings, monospace */
  --font: "DM Sans", system-ui, -apple-system, sans-serif;
  --font-display: "Fraunces", Georgia, serif;
  --font-mono: "JetBrains Mono", ui-monospace, monospace;

  /* Safe-area insets (0px where the environment variable is unavailable) */
  --safe-top: env(safe-area-inset-top, 0px);
  --safe-bottom: env(safe-area-inset-bottom, 0px);
}
* { box-sizing: border-box; }
/*
 * Root scrolling behaviour for in-page anchor links.
 * scroll-padding-top offsets anchor targets from the top of the viewport so
 * they are not covered by the sticky banner and sticky nav.
 */
html {
  scroll-padding-top: 112px;
}
/* Animate anchor jumps only for users who have not asked for reduced motion. */
@media (prefers-reduced-motion: no-preference) {
  html {
    scroll-behavior: smooth;
  }
}
body {
margin: 0;
font-family: var(--font);
color: var(--ink);
background: var(--space-bg);
min-height: 100vh;
min-height: 100dvh;
line-height: 1.55;
-webkit-font-smoothing: antialiased;
}
a { color: var(--accent); }
a:focus-visible, button:focus-visible, select:focus-visible, textarea:focus-visible {
outline: 2px solid var(--accent);
outline-offset: 2px;
}
.space-shell {
min-height: 100vh;
min-height: 100dvh;
display: flex;
flex-direction: column;
}
.space-banner {
position: sticky;
top: 0;
z-index: 40;
padding: calc(10px + var(--safe-top)) 16px 10px;
background: linear-gradient(180deg, rgba(255,255,255,0.96) 0%, rgba(250,251,254,0.94) 100%);
backdrop-filter: blur(12px);
-webkit-backdrop-filter: blur(12px);
border-bottom: 1px solid var(--space-border);
box-shadow: 0 4px 24px rgba(15, 23, 42, 0.04);
}
.space-banner-inner {
max-width: 1120px;
margin: 0 auto;
display: flex;
flex-wrap: wrap;
align-items: center;
justify-content: space-between;
gap: 12px 20px;
}
.space-brand {
display: flex;
align-items: center;
gap: 12px;
flex: 1 1 auto;
min-width: 0;
}
.space-logo {
width: 38px;
height: 38px;
border-radius: 10px;
background: linear-gradient(135deg, #fbbf24, #f59e0b);
box-shadow: 0 2px 8px rgba(245, 158, 11, 0.35);
flex-shrink: 0;
}
.space-brand h1 {
margin: 0;
font-family: var(--font-display);
font-size: 1.05rem;
font-weight: 600;
letter-spacing: -0.02em;
color: var(--ink);
line-height: 1.2;
}
.space-brand p {
margin: 2px 0 0;
font-size: 0.75rem;
color: var(--muted);
font-weight: 500;
}
.space-actions {
display: flex;
flex-wrap: wrap;
align-items: center;
gap: 8px;
}
/* Pill buttons in the banner: declarations common to both variants. */
.btn-ghost,
.btn-primary {
  display: inline-flex;
  align-items: center;
  justify-content: center;
  gap: 6px;
  font-size: 0.8125rem;
  font-family: inherit;
  border-radius: 999px;
  text-decoration: none;
  cursor: pointer;
}

/* Neutral outlined variant. */
.btn-ghost {
  padding: 8px 14px;
  font-weight: 600;
  color: var(--ink-soft);
  background: var(--card);
  border: 1px solid var(--space-border-strong);
  transition: border-color 0.15s, box-shadow 0.15s, background 0.15s;
}
.btn-ghost:hover {
  border-color: var(--muted-light);
  box-shadow: var(--card-shadow);
}

/* Amber call-to-action variant. */
.btn-primary {
  padding: 8px 16px;
  font-weight: 700;
  color: #1c1917;
  background: linear-gradient(180deg, #fde68a, #fbbf24);
  border: 1px solid #d97706;
  box-shadow: 0 1px 0 rgba(255,255,255,0.5) inset;
  transition: filter 0.15s, transform 0.1s;
}
.btn-primary:hover {
  filter: brightness(1.03);
}
.btn-primary:active {
  transform: scale(0.98);
}
/*
 * On-page navigation bar that sticks below the banner.
 * z-index 30 keeps it underneath .space-banner (z-index 40).
 */
.sticky-nav {
  position: sticky;
  /* NOTE(review): 58px is presumably the single-row height of .space-banner;
     the banner uses flex-wrap, so confirm the offset when it wraps to two rows. */
  top: calc(58px + var(--safe-top));
  z-index: 30;
  margin: 0 auto;
  max-width: 1120px;
  padding: 0 16px 8px;
}
.sticky-nav-inner {
display: flex;
flex-wrap: wrap;
gap: 6px;
padding: 6px;
background: var(--card);
border: 1px solid var(--space-border);
border-radius: 999px;
box-shadow: var(--card-shadow);
width: fit-content;
max-width: 100%;
}
.sticky-nav a {
padding: 8px 14px;
font-size: 0.78rem;
font-weight: 600;
color: var(--muted);
text-decoration: none;
border-radius: 999px;
transition: background 0.15s, color 0.15s;
white-space: nowrap;
}
.sticky-nav a:hover {
color: var(--ink);
background: var(--space-bg);
}
.main {
flex: 1;
max-width: 1120px;
margin: 0 auto;
padding: 8px 16px calc(32px + var(--safe-bottom));
width: 100%;
}
.api-strip {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-bottom: 20px;
}
.api-chip {
font-family: var(--font-mono);
font-size: 0.68rem;
font-weight: 500;
padding: 5px 10px;
border-radius: 8px;
background: var(--card);
border: 1px solid var(--space-border);
color: var(--ink-soft);
}
.api-chip span { color: var(--muted); margin-right: 6px; }
.section {
margin-bottom: 28px;
}
.section-id {
font-size: 0.65rem;
font-weight: 700;
letter-spacing: 0.18em;
text-transform: uppercase;
color: var(--hf-amber);
margin-bottom: 8px;
}
.hero-title {
font-family: var(--font-display);
font-weight: 600;
font-size: clamp(1.75rem, 4.2vw, 2.5rem);
line-height: 1.12;
margin: 0 0 12px;
letter-spacing: -0.02em;
color: var(--ink);
}
.hero-title em {
font-style: italic;
color: var(--accent);
}
.lede {
max-width: 54ch;
color: var(--muted);
font-size: 1rem;
margin: 0 0 18px;
}
.layer-strip {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-bottom: 20px;
}
.layer {
font-size: 0.68rem;
font-weight: 700;
letter-spacing: 0.05em;
text-transform: uppercase;
padding: 6px 11px;
border-radius: 8px;
border: 1px solid var(--space-border);
background: var(--space-bg-elevated);
color: var(--muted);
}
.layer b { color: var(--ink); }
.panel {
background: var(--card);
border: 1px solid var(--space-border);
border-radius: var(--radius-lg);
padding: 20px;
box-shadow: var(--card-shadow);
margin-bottom: 20px;
transition: box-shadow 0.2s;
}
.panel:hover { box-shadow: var(--card-shadow-hover); }
.panel-header {
display: flex;
flex-wrap: wrap;
align-items: flex-start;
justify-content: space-between;
gap: 12px;
margin-bottom: 14px;
}
.panel-header h2 {
margin: 0;
font-size: 1.1rem;
font-weight: 700;
color: var(--ink);
}
.panel-header .caption {
margin: 0;
font-size: 0.8125rem;
color: var(--muted);
max-width: 38ch;
line-height: 1.45;
}
.diagram-wrap {
border-radius: var(--radius);
overflow: auto;
-webkit-overflow-scrolling: touch;
background: var(--diagram-bg);
border: 1px solid var(--diagram-border);
box-shadow: inset 0 1px 0 rgba(255,255,255,0.06);
max-height: min(92vh, 1400px);
}
.diagram-wrap img {
display: block;
width: 100%;
max-width: 100%;
height: auto;
max-height: none;
object-fit: contain;
object-position: center top;
cursor: zoom-in;
}
img.sde-zoomable {
cursor: zoom-in;
transition: box-shadow 0.15s ease, transform 0.12s ease;
}
img.sde-zoomable:hover {
box-shadow: 0 0 0 2px rgba(37, 99, 235, 0.35);
}
.figure-footer {
display: flex;
flex-wrap: wrap;
justify-content: space-between;
gap: 10px;
padding-top: 14px;
margin-top: 4px;
font-size: 0.75rem;
color: var(--muted);
}
.legend {
display: flex;
flex-wrap: wrap;
gap: 12px;
}
.legend span::before {
content: "";
display: inline-block;
width: 7px;
height: 7px;
border-radius: 2px;
margin-right: 5px;
vertical-align: middle;
}
.legend .l-api::before { background: #22c55e; }
.legend .l-env::before { background: #a78bfa; }
.legend .l-data::before { background: #fb923c; }
.legend .l-train::before { background: #2dd4bf; }
.badges {
display: flex;
flex-wrap: wrap;
gap: 8px;
}
.badge {
font-size: 0.65rem;
font-weight: 700;
letter-spacing: 0.05em;
text-transform: uppercase;
padding: 5px 10px;
border-radius: 999px;
border: 1px solid var(--space-border);
color: var(--muted);
background: var(--space-bg-elevated);
}
.section-head {
margin-bottom: 14px;
}
.section-head h2 {
margin: 0 0 6px;
font-family: var(--font-display);
font-size: 1.35rem;
font-weight: 600;
color: var(--ink);
}
.section-head p {
margin: 0;
color: var(--muted);
font-size: 0.9375rem;
}
/* Generic layout grid: one column by default. */
.grid {
  display: grid;
  grid-template-columns: 1fr;
  gap: 16px;
}

/* From 860px up, opt-in modifiers switch to 2 or 12 tracks; .span-* set item width. */
@media (min-width: 860px) {
  .grid.cols-2 {
    grid-template-columns: 1fr 1fr;
  }
  .grid.cols-12 {
    grid-template-columns: repeat(12, 1fr);
  }
  .span-4 {
    grid-column: span 4;
  }
  .span-8 {
    grid-column: span 8;
  }
}
.play-card {
background: var(--card);
border: 1px solid var(--space-border);
border-radius: var(--radius-lg);
padding: 20px;
box-shadow: var(--card-shadow);
}
label {
display: block;
font-size: 0.7rem;
font-weight: 700;
letter-spacing: 0.07em;
text-transform: uppercase;
color: var(--muted);
margin-top: 14px;
margin-bottom: 6px;
}
label:first-of-type { margin-top: 0; }
select, textarea {
width: 100%;
font-family: inherit;
font-size: 0.9375rem;
border-radius: 10px;
border: 1px solid var(--space-border-strong);
background: var(--space-bg-elevated);
color: var(--ink);
padding: 12px 14px;
transition: border-color 0.15s, box-shadow 0.15s;
}
/*
 * Focus state for form fields: accent border plus a soft ring.
 * The outline is made transparent rather than removed. It stays invisible in
 * normal rendering, but forced-colors / high-contrast modes repaint it, so
 * keyboard focus is still shown where box-shadow and border colors are dropped.
 */
select:focus, textarea:focus {
  outline: 2px solid transparent;
  outline-offset: 2px;
  border-color: var(--accent);
  box-shadow: 0 0 0 3px var(--accent-soft);
}
textarea {
min-height: 140px;
resize: vertical;
font-family: var(--font-mono);
font-size: 0.8125rem;
line-height: 1.5;
}
/* Full-width gradient action button (used by the playground's reset and submit buttons). */
.btn-action {
  margin-top: 12px;
  width: 100%;
  min-height: 46px;
  font-family: inherit;
  font-size: 0.9375rem;
  font-weight: 700;
  cursor: pointer;
  border-radius: 10px;
  border: none;
  color: #fff;
  background: linear-gradient(135deg, #2563eb, #4f46e5);
  box-shadow: 0 4px 14px rgba(37, 99, 235, 0.35);
  /* filter is listed so the hover brightness change eases in like the
     opacity (disabled) and transform (active) changes do. */
  transition: opacity 0.15s, filter 0.15s, transform 0.1s;
}
/* Hover and press feedback apply only while the button is enabled. */
.btn-action:hover:not(:disabled) { filter: brightness(1.05); }
.btn-action:active:not(:disabled) { transform: scale(0.99); }
.btn-action:disabled {
  opacity: 0.55;
  cursor: not-allowed;
}
.session-pill {
display: inline-flex;
align-items: center;
gap: 8px;
font-size: 0.75rem;
color: var(--muted);
margin-bottom: 10px;
padding: 6px 12px;
background: var(--accent-soft);
border-radius: 999px;
border: 1px solid #bfdbfe;
}
.session-pill strong {
color: var(--accent);
font-family: var(--font-mono);
font-weight: 500;
font-size: 0.72rem;
}
code.pre {
display: block;
white-space: pre-wrap;
font-family: var(--font-mono);
font-size: 0.72rem;
line-height: 1.5;
background: #f8fafc;
border: 1px solid var(--space-border);
border-radius: 10px;
padding: 12px 14px;
color: #1e293b;
min-height: 72px;
max-height: 260px;
overflow: auto;
}
.proof-grid {
display: grid;
gap: 16px;
grid-template-columns: 1fr;
}
@media (min-width: 720px) {
.proof-grid { grid-template-columns: 1fr 1fr; }
}
.proof-card {
border-radius: var(--radius);
overflow: hidden;
border: 1px solid var(--space-border);
background: var(--card);
box-shadow: var(--card-shadow);
}
.proof-card figcaption {
padding: 10px 14px;
font-size: 0.8125rem;
color: var(--muted);
border-top: 1px solid var(--space-border);
background: var(--space-bg-elevated);
}
.proof-card img {
display: block;
width: 100%;
height: auto;
}
.metric-table {
width: 100%;
border-collapse: collapse;
font-size: 0.9375rem;
}
.metric-table th,
.metric-table td {
text-align: left;
padding: 10px 14px;
border-bottom: 1px solid var(--space-border);
vertical-align: top;
}
.metric-table thead th {
font-weight: 700;
color: var(--ink);
background: var(--space-bg-elevated);
font-size: 0.75rem;
letter-spacing: 0.06em;
text-transform: uppercase;
}
.metric-table tbody td:first-child {
color: var(--ink);
max-width: 28ch;
}
.metric-table tbody td:last-child {
font-weight: 700;
font-variant-numeric: tabular-nums;
color: var(--accent);
}
.metric-table tbody tr:last-child td {
border-bottom: none;
}
.benchmark-chart-grid {
display: grid;
gap: 16px;
grid-template-columns: 1fr;
margin-top: 18px;
}
@media (min-width: 900px) {
.benchmark-chart-grid {
grid-template-columns: repeat(3, minmax(0, 1fr));
}
}
/* Full-screen image viewer (click any .sde-zoomable) */
.img-lightbox {
position: fixed;
inset: 0;
z-index: 200;
display: flex;
align-items: center;
justify-content: center;
padding: calc(8px + env(safe-area-inset-top, 0px)) calc(8px + env(safe-area-inset-right, 0px)) calc(8px + env(safe-area-inset-bottom, 0px)) calc(8px + env(safe-area-inset-left, 0px));
box-sizing: border-box;
}
.img-lightbox[hidden] {
display: none !important;
}
.img-lightbox-backdrop {
position: absolute;
inset: 0;
border: none;
padding: 0;
margin: 0;
width: 100%;
height: 100%;
cursor: zoom-out;
background: rgba(15, 23, 42, 0.9);
backdrop-filter: blur(8px);
-webkit-backdrop-filter: blur(8px);
}
/* Framed container of the full-screen image viewer; stacks above the backdrop. */
.img-lightbox-panel {
  position: relative;
  z-index: 1;
  width: min(98vw, 1920px);
  /* vh first as the fallback, dvh second so browsers that support dynamic
     viewport units keep the panel inside the visible area when mobile browser
     chrome is showing (same vh/dvh pairing as body and .space-shell). */
  max-height: min(96vh, 1200px);
  max-height: min(96dvh, 1200px);
  display: flex;
  flex-direction: column;
  background: #0b1220;
  border-radius: var(--radius-lg);
  border: 1px solid rgba(148, 163, 184, 0.35);
  box-shadow: 0 28px 90px rgba(0, 0, 0, 0.55);
  overflow: hidden;
}
.img-lightbox-close {
position: absolute;
top: 8px;
right: 10px;
z-index: 3;
width: 40px;
height: 40px;
border: none;
border-radius: 10px;
font-size: 1.5rem;
line-height: 1;
cursor: pointer;
color: #f8fafc;
background: rgba(30, 41, 59, 0.95);
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.25);
}
.img-lightbox-close:hover {
background: rgba(51, 65, 85, 0.98);
}
.img-lightbox-toolbar {
position: absolute;
top: 8px;
left: 10px;
z-index: 3;
display: flex;
flex-wrap: wrap;
gap: 6px;
}
.img-lightbox-toolbar button {
min-width: 40px;
height: 36px;
padding: 0 10px;
font-size: 0.9rem;
font-weight: 700;
font-family: inherit;
border-radius: 8px;
border: 1px solid rgba(148, 163, 184, 0.4);
cursor: pointer;
color: #e2e8f0;
background: rgba(30, 41, 59, 0.95);
}
.img-lightbox-toolbar button:hover {
background: rgba(51, 65, 85, 0.98);
}
.img-lightbox-hint {
display: block;
margin-top: 8px;
font-size: 0.72rem;
color: rgba(148, 163, 184, 0.95);
line-height: 1.35;
}
.img-lightbox-scroll {
flex: 1;
min-height: 120px;
overflow: auto;
-webkit-overflow-scrolling: touch;
padding: 52px 14px 44px;
text-align: center;
}
.img-lightbox-stage {
display: inline-block;
margin: 0 auto;
}
.img-lightbox-scroll img {
display: block;
max-width: none;
max-height: none;
margin: 0 auto;
vertical-align: middle;
/* width/height set in JS so overflow scroll tracks zoom */
}
.img-lightbox-caption {
margin: 0;
padding: 10px 14px 12px;
font-size: 0.8125rem;
color: #cbd5e1;
background: rgba(15, 23, 42, 0.6);
border-top: 1px solid rgba(148, 163, 184, 0.25);
text-align: center;
}
.link-list a {
color: var(--accent);
text-decoration: none;
font-weight: 600;
display: block;
padding: 10px 0;
border-bottom: 1px solid var(--space-border);
font-size: 0.9rem;
}
.link-list a:last-child { border-bottom: 0; }
.link-list a:hover { text-decoration: underline; }
.space-footer {
margin-top: auto;
padding: 20px 16px calc(16px + var(--safe-bottom));
border-top: 1px solid var(--space-border);
background: linear-gradient(180deg, var(--space-bg-elevated), var(--space-bg));
}
.space-footer-inner {
max-width: 1120px;
margin: 0 auto;
display: flex;
flex-wrap: wrap;
align-items: center;
justify-content: space-between;
gap: 12px;
font-size: 0.8125rem;
color: var(--muted);
}
.space-footer a { color: var(--muted); font-weight: 600; }
.space-footer a:hover { color: var(--ink); }
.blog-quote {
border-left: 4px solid #2563eb;
background: #eff6ff;
color: #1e3a8a;
padding: 10px 12px;
border-radius: 8px;
font-size: 0.9rem;
margin: 0 0 12px;
}
.blog-mini-grid {
display: grid;
grid-template-columns: repeat(3, minmax(0, 1fr));
gap: 8px;
margin: 0 0 12px;
}
.blog-mini {
background: #f8fafc;
border: 1px solid var(--space-border);
border-radius: 10px;
padding: 10px;
font-size: 0.82rem;
color: var(--muted);
}
.blog-mini b { color: var(--ink); display:block; font-size:0.98rem; margin-bottom: 2px; }
@media (max-width: 900px) {
.blog-mini-grid { grid-template-columns: 1fr; }
}
.lede-stack {
max-width: 62ch;
margin-bottom: 18px;
}
.lede-stack .lede {
max-width: none;
}
.stat-callout {
margin: 0 0 16px;
padding: 14px 16px 16px;
border-radius: var(--radius);
border: 1px solid #c7d2fe;
background: linear-gradient(135deg, #eef2ff 0%, #f8fafc 55%, #ecfeff 100%);
box-shadow: 0 6px 22px rgba(37, 99, 235, 0.08);
font-size: 0.98rem;
line-height: 1.58;
color: var(--ink-soft);
}
.stat-callout strong {
color: var(--ink);
font-weight: 700;
}
.blog-pull-wide {
font-family: var(--font-display);
font-size: 1.02rem;
line-height: 1.45;
color: var(--ink);
margin: 18px 0 14px;
padding: 12px 0 12px 16px;
border-left: 4px solid var(--hf-amber);
background: linear-gradient(90deg, var(--hf-amber-soft), transparent);
border-radius: 0 10px 10px 0;
}
.blog-subhead {
font-size: 0.72rem;
font-weight: 800;
letter-spacing: 0.12em;
text-transform: uppercase;
color: var(--muted);
margin: 20px 0 8px;
}
.blog-list {
margin: 0 0 14px 1.1rem;
padding: 0;
color: var(--muted);
font-size: 0.9375rem;
line-height: 1.55;
}
.blog-list li { margin-bottom: 8px; }
.blog-footnote {
font-size: 0.78rem;
color: var(--muted-light);
line-height: 1.45;
margin: 10px 0 0;
padding-top: 10px;
border-top: 1px dashed var(--space-border);
}
</style>
</head>
<body>
<div class="space-shell">
<header class="space-banner">
<div class="space-banner-inner">
<div class="space-brand">
<div class="space-logo" aria-hidden="true"></div>
<div>
<h1>SQL Debug Environment</h1>
<p>OpenEnv · FastAPI · Live SQL rewards</p>
</div>
</div>
<div class="space-actions">
<a class="btn-primary" href="/gradio/">Gradio UI</a>
<a class="btn-ghost" href="https://github.com/mdayan8/sql-debug-env.git" target="_blank" rel="noopener">GitHub</a>
<button type="button" class="btn-ghost" id="btnOpenTab" title="Opens this demo in a full browser tab">Open full page</button>
<a class="btn-ghost" href="https://huggingface.co/spaces/md896/sql-debug-env" target="_blank" rel="noopener">Space on Hub ↗</a>
</div>
</div>
</header>
<nav class="sticky-nav" aria-label="On-page navigation">
<div class="sticky-nav-inner">
<a href="#environment">Environment</a>
<a href="#first-training">First Training</a>
<a href="#playground">Playground</a>
<a href="#benchmark-visuals">Benchmark</a>
<a href="#evidence">Evidence</a>
<a href="#repro">Reproduce</a>
<a href="/gradio/">Gradio</a>
</div>
</nav>
<main class="main">
<!-- Read-only strip of endpoint chips. role="list"/"listitem" give the
     aria-label a role to attach to; a role-less div does not reliably expose
     its label. ARIA roles are used instead of ul/li so the existing
     .api-strip/.api-chip styles keep working unchanged. -->
<div class="api-strip" role="list" aria-label="Key API endpoints">
  <span class="api-chip" role="listitem"><span>GET</span>/health</span>
  <span class="api-chip" role="listitem"><span>GET</span>/tasks</span>
  <span class="api-chip" role="listitem"><span>POST</span>/reset</span>
  <span class="api-chip" role="listitem"><span>POST</span>/step</span>
  <span class="api-chip" role="listitem"><span>POST</span>/step_with_review</span>
  <span class="api-chip" role="listitem"><span>GET</span>/benchmark</span>
</div>
<section id="environment" class="section" aria-labelledby="env-title">
<p class="section-id">Space · Architecture</p>
<h2 class="hero-title" id="env-title">Environment first — <em>how</em> the agent sees the world.</h2>
<div class="lede-stack">
<p class="stat-callout">
<strong>Today, nearly 30% of a data team’s time is spent fixing SQL and pipeline logic</strong>—not building net-new insights, not shipping product features,
but <em>debugging queries that already looked reasonable in a notebook or PR comment</em>. That tax shows up as rework, stale dashboards, and fragile “one-off”
analyses that nobody trusts after the third incident.
</p>
<p class="lede">
<strong>Even with the most advanced AI models, the problem is not “solved.”</strong>
On standard text-to-SQL benchmarks like Spider, headline numbers often sit in the <strong>high 80s to low 90s (%)</strong>—an impressive story for a slide deck.
In real enterprise environments—drifting schemas, implicit business rules, join explosions, and permissioned views—that headline rarely survives contact with production.
Teams routinely report effective success rates closer to the <strong>10–30%</strong> band unless the system closes the loop with <em>execution-grounded feedback</em>
(run, observe error or result, attribute reward to what changed).
</p>
<p class="lede" style="margin-bottom:0">
This Space hosts the same HTTP API your trainer calls: <strong>sessions</strong>, <strong>typed observations</strong>, <strong>SQLite-backed tasks</strong>, and a
<strong>decomposed reward</strong>. Below is the end-to-end workflow map at a glance; Engineering Notes connect the problem to the OpenEnv contract and the artifacts on this page.
</p>
</div>
<div class="layer-strip" aria-hidden="true">
<span class="layer"><b>Client</b> / agent</span>
<span class="layer"><b>API</b> session + JSON</span>
<span class="layer"><b>Env</b> SQLDebugEnv</span>
<span class="layer"><b>Data</b> tasks + SQLite</span>
<span class="layer"><b>Train</b> GRPO + artifacts</span>
</div>
<div class="panel">
<div class="panel-header">
<h2>Environment visualization</h2>
<p class="caption">Runtime flow (solid) vs training and ops (dashed). Reviewer-guarded path optional for safer rollouts.</p>
</div>
<div class="diagram-wrap">
<img class="sde-zoomable" src="/static/diagram-end-to-end-workflow.png" alt="End-to-end workflow: Client, FastAPI, environment core, data and reward layer, training and deployment." width="1600" height="900" loading="eager" decoding="async" title="Click to open full-size viewer (zoom and pan)" />
</div>
<div class="figure-footer">
<div class="legend">
<span class="l-api">API</span>
<span class="l-env">Env core</span>
<span class="l-data">DB / tasks / reward</span>
<span class="l-train">Training &amp; Space</span>
</div>
<span>sql-debug-env workflow</span>
</div>
</div>
<div class="badges">
<span class="badge">OpenEnv</span>
<span class="badge">TRL · GRPO</span>
<span class="badge">Live rewards</span>
<span class="badge">Reviewer path</span>
</div>
</section>
<section id="first-training" class="section" aria-labelledby="first-training-title">
<div class="section-head">
<p class="section-id">Training · First Context</p>
<h2 id="first-training-title">Start with the first bridge run</h2>
<p>This is the exact first training context you shared: dependency bootstrap, W&amp;B tracking, then benchmark/eval steps.</p>
</div>
<div class="grid cols-12">
<div class="play-card span-4">
<div class="link-list">
<a href="https://colab.research.google.com/drive/1H6SLfCBhHzRJtnymLgevjfyytWUximF5#scrollTo=j-9MptXvmPk8" target="_blank" rel="noopener">First training context (Colab anchor)</a>
<a href="https://colab.research.google.com/drive/1H6SLfCBhHzRJtnymLgevjfyytWUximF5#scrollTo=x5YuvatGyyu_" target="_blank" rel="noopener">Full training notebook anchor</a>
<a href="https://wandb.ai/mdayanbag-pesitm/sql-debug-grpo-best-budget/workspace?nw=nwusermdayanbag" target="_blank" rel="noopener">W&amp;B workspace: sql-debug-grpo-best-budget</a>
<a href="https://huggingface.co/spaces/md896/sql-debug-env/tree/main/artifacts/runs/20260426-064318-sample-rewards-32eval" target="_blank" rel="noopener">Sample rewards (32-eval) artifacts</a>
<a href="https://huggingface.co/md896/sql-debug-agent-qwen25-05b-grpo-wandb-continue-v2" target="_blank" rel="noopener">Model card (winner)</a>
</div>
</div>
<div class="play-card span-8">
<label>First training context code</label>
<code class="pre"># SQL Debug Env: FINAL REAL-WORLD BRIDGE
import os
print("Checking libraries...")
os.system("pip install trl accelerate wandb -U")
import httpx
import torch
import wandb
# W&amp;B workspace: https://wandb.ai/mdayanbag-pesitm/sql-debug-grpo-best-budget/workspace?nw=nwusermdayanbag</code>
</div>
</div>
</section>
<section id="playground" class="section" aria-labelledby="play-title">
<div class="section-head">
<p class="section-id">Live · Playground</p>
<h2 id="play-title">Try <code style="font-family:var(--font-mono);font-size:0.85em;background:#f1f5f9;padding:2px 6px;border-radius:4px">/reset</code> and <code style="font-family:var(--font-mono);font-size:0.85em;background:#f1f5f9;padding:2px 6px;border-radius:4px">/step</code> from the browser</h2>
<p>Use the same <strong>X-Session-Id</strong> header on every call (here: <code style="font-family:var(--font-mono);font-size:0.85em">demo-session</code>).</p>
</div>
<div class="grid cols-2">
<!-- Playground inputs: choose a task, reset it, then submit a candidate query.
     Each control is named by its visible <label for>; the former aria-label
     attributes are removed because they overrode that name ("SQL query" did
     not contain the visible text "Candidate SQL"). Element ids and onclick
     handlers are unchanged. -->
<div class="play-card">
  <label for="taskId">Task</label>
  <select id="taskId">
    <option value="easy_syntax_fix">easy_syntax_fix</option>
    <option value="medium_logic_fix">medium_logic_fix</option>
    <option value="hard_multi_bug">hard_multi_bug</option>
    <option value="hard_finance_explosion">hard_finance_explosion</option>
  </select>
  <button type="button" class="btn-action" id="btnReset" onclick="resetTask()">Reset task</button>
  <label for="query">Candidate SQL</label>
  <!-- spellcheck is off so SQL identifiers are not underlined as misspellings -->
  <textarea id="query" placeholder="SELECT ..." spellcheck="false"></textarea>
  <button type="button" class="btn-action" id="btnSubmit" onclick="submitQuery()">Submit query</button>
</div>
<!-- Playground outputs: shows the session id and two preformatted output boxes.
     NOTE(review): #observation and #result are presumably overwritten by the
     page script after the /reset and /step calls; that script is outside this
     view, so confirm before renaming either id. -->
<div class="play-card">
  <div class="session-pill">Session <strong>demo-session</strong></div>
  <label>Task observation</label>
  <!-- Initial placeholder text, shown until a task has been reset -->
  <code id="observation" class="pre">Run “Reset task” to load the broken query and observation JSON.</code>
  <label style="margin-top:14px">Step result</label>
  <!-- Initial placeholder text, shown until a query has been submitted -->
  <code id="result" class="pre">Submit a query to see reward, done, and info.</code>
</div>
</div>
</section>
<section id="benchmark-visuals" class="section" aria-labelledby="benchmark-visuals-title">
<div class="section-head">
<p class="section-id">Evidence · Charts</p>
<h2 id="benchmark-visuals-title">Benchmark visuals</h2>
<p>Metric snapshot aligned with committed charts under <code style="font-family:var(--font-mono);font-size:0.85em;background:#f1f5f9;padding:2px 6px;border-radius:4px">server/static/</code> (same figures as the Gradio page).</p>
</div>
<div class="panel">
<!-- Benchmark snapshot: one row per headline figure, label on the left and value on the right. -->
<table class="metric-table" aria-label="Benchmark metric snapshot">
  <thead>
    <tr><th scope="col">Metric snapshot</th><th scope="col">Value</th></tr>
  </thead>
  <tbody>
    <tr><td>Spider chart: Industry baseline</td><td>48.2%</td></tr>
    <tr><td>Spider chart: Qwen-7B base</td><td>52.4%</td></tr>
    <tr><td>Spider chart: RL agent</td><td>78.5%</td></tr>
    <tr><td>Performance leap chart</td><td>0.0% -&gt; 25.0% (base to RL in that run view)</td></tr>
  </tbody>
</table>
</div>
<div class="benchmark-chart-grid">
<figure class="proof-card">
<img class="sde-zoomable" src="/static/chart-performance-leap.png" alt="Performance leap: baseline versus RL on a Spider-style headline view" width="900" height="520" loading="lazy" decoding="async" title="Click to open full-size viewer (zoom and pan)" />
<figcaption>Performance leap (Spider-style)</figcaption>
</figure>
<figure class="proof-card">
<img class="sde-zoomable" src="/static/chart-comparison-shift.png" alt="Comparison of models and reward distribution shift" width="900" height="520" loading="lazy" decoding="async" title="Click to open full-size viewer (zoom and pan)" />
<figcaption>Comparison + reward shift</figcaption>
</figure>
<figure class="proof-card">
<img class="sde-zoomable" src="/static/chart-spider-benchmark.png" alt="Spider-style benchmark headline chart across industry baseline, Qwen-7B base, and RL agent" width="900" height="520" loading="lazy" decoding="async" title="Click to open full-size viewer (zoom and pan)" />
<figcaption>Spider-style headline chart</figcaption>
</figure>
</div>
</section>
<section id="evidence" class="section" aria-labelledby="evidence-title">
<div class="section-head">
<p class="section-id">Evidence · Artifacts</p>
<h2 id="evidence-title">Training plots from real runs</h2>
<p>Regenerate with <code style="font-family:var(--font-mono);font-size:0.85em">presentation_graphs.py</code>; commit PNGs under <code style="font-family:var(--font-mono);font-size:0.85em">server/static/</code>.</p>
</div>
<div class="proof-grid">
<figure class="proof-card">
<img class="sde-zoomable" src="/static/proof-combo.png" alt="Presentation combo chart from training run" width="1200" height="800" loading="lazy" decoding="async" title="Click to open full-size viewer (zoom and pan)" />
<figcaption>Presentation combo — logged metrics.</figcaption>
</figure>
<figure class="proof-card">
<img class="sde-zoomable" src="/static/proof-distribution-shift.png" alt="Reward distribution shift" width="1200" height="800" loading="lazy" decoding="async" title="Click to open full-size viewer (zoom and pan)" />
<figcaption>Per-sample reward shift (baseline vs trained).</figcaption>
</figure>
</div>
<div class="link-list" style="margin-top:12px">
<a href="/static/training_reward_curve_final.png" target="_blank" rel="noopener">training_reward_curve_final.png</a>
<a href="/static/training_diagnostics_dual_axis_final.png" target="_blank" rel="noopener">training_diagnostics_dual_axis_final.png</a>
<a href="/static/baseline_vs_trained_by_task_final.png" target="_blank" rel="noopener">baseline_vs_trained_by_task_final.png</a>
<a href="/static/task_delta_post_minus_base_final.png" target="_blank" rel="noopener">task_delta_post_minus_base_final.png</a>
<a href="/static/reward_distribution_shift_red_green_final.png" target="_blank" rel="noopener">reward_distribution_shift_red_green_final.png</a>
<a href="/static/presentation_combo_final.png" target="_blank" rel="noopener">presentation_combo_final.png</a>
<a href="/static/benchmark_style_summary_final.png" target="_blank" rel="noopener">benchmark_style_summary_final.png</a>
<a href="/static/checkpoint_leaderboard_step_vs_reward_final.png" target="_blank" rel="noopener">checkpoint_leaderboard_step_vs_reward_final.png</a>
<a href="/static/cost_vs_performance_final.png" target="_blank" rel="noopener">cost_vs_performance_final.png</a>
</div>
</section>
<section id="repro" class="section">
<div class="grid cols-12">
<div class="play-card span-4">
<div class="section-head" style="margin-bottom:10px">
<p class="section-id">Reproduce</p>
<h2 style="font-family:var(--font-display);font-size:1.15rem;margin:0;font-weight:600">Runs &amp; assets</h2>
</div>
<div class="link-list">
<a href="https://colab.research.google.com/drive/1H6SLfCBhHzRJtnymLgevjfyytWUximF5#scrollTo=x5YuvatGyyu_" target="_blank" rel="noopener">Colab training notebook</a>
<a href="https://huggingface.co/spaces/md896/sql-debug-env/tree/main/artifacts/runs/20260426-060502-final-pass-32eval" target="_blank" rel="noopener">Eval artifacts (32-run)</a>
<a href="https://huggingface.co/md896/sql-debug-agent-qwen25-05b-grpo-wandb-continue-v2" target="_blank" rel="noopener">Model card</a>
<a href="/benchmark" target="_blank" rel="noopener">Benchmark JSON</a>
<a href="/health" target="_blank" rel="noopener">Health</a>
</div>
</div>
<div class="play-card span-8">
<div class="section-head" style="margin-bottom:10px">
<p class="section-id">Engineering Notes</p>
<h2 style="font-family:var(--font-display);font-size:1.15rem;margin:0;font-weight:600">Why I picked SQL debugging and why this architecture exists</h2>
</div>
<div class="blog-quote">
“The goal is not to generate beautiful SQL text. The goal is to produce SQL fixes that survive execution, repeatedly, under changing runtime conditions.”
</div>
<div class="blog-mini-grid">
<div class="blog-mini"><b>0.5B → 7B</b>Bridge run for wiring, then a stronger base model for SQL structure and joins.</div>
<div class="blog-mini"><b>32-run eval</b>Artifact-backed pass with sample rewards and run logs you can diff, not vibes.</div>
<div class="blog-mini"><b>Execution-first</b>Reward comes from running SQL against graded tasks—not from how persuasive the completion sounds.</div>
</div>
<div class="blog-mini-grid" style="margin-top:10px">
<div class="blog-mini"><b>Spider vs prod</b>Leaderboards reward clean splits; warehouses reward joins that do not explode under skew.</div>
<div class="blog-mini"><b>GRPO loop</b>Group-relative updates turn execution outcomes into a stable training signal across sessions.</div>
<div class="blog-mini"><b>Reviewer path</b>Optional guardrail so risky SQL is blocked without erasing every learning opportunity.</div>
</div>
<p class="blog-pull-wide">
If you only remember one tension from this page, remember this: <strong>high leaderboard accuracy is not the same thing as high production reliability.</strong>
</p>
<p style="color:var(--muted);margin:0 0 12px;font-size:0.9375rem">
The motive for this project was not to build another text-to-SQL demo. It was to shrink the gap between “model looks smart in a demo” and “model helps engineers ship.”
SQL bugs are expensive because they fail late: a query can pass review, pass linting, and still break under real schema constraints, stale statistics, or join cardinality shifts.
I picked this problem because it sits at the boundary between language modeling and systems engineering—if the agent improves here, it is learning runtime correctness, not cosmetic fluency.
</p>
<p class="blog-subhead">What leaderboards hide</p>
<p style="color:var(--muted);margin:0 0 12px;font-size:0.9375rem">
Spider-style suites are useful scientific instruments: they keep comparisons honest and reproducible. They are also intentionally cleaner than most corporate warehouses.
That is why you can simultaneously believe two facts that sound contradictory: models can score in the <strong>high 80s–90s (%)</strong> on canonical benchmarks while practitioners still describe
<strong>10–30%</strong> “works first time in our environment” outcomes unless they invest in evaluation harnesses, guardrails, and iterative repair loops grounded in execution.
</p>
<ul class="blog-list">
<li><strong>Latency of truth.</strong> Text-only feedback arrives early; execution feedback arrives when the query meets the database. The latter is slower but decisive.</li>
<li><strong>Credit assignment.</strong> Without runtime signal, you reward plausible prose. With it, you reward schema-correct joins, stable aggregates, and safe rewrites.</li>
<li><strong>Operational drift.</strong> Production schemas evolve; a static snapshot benchmark cannot represent every enterprise edge case—so the training surface must be repeatable even when the world is messy.</li>
</ul>
<p class="blog-subhead">Why the OpenEnv-shaped API exists</p>
<p style="color:var(--muted);margin:0 0 12px;font-size:0.9375rem">
The architecture follows an OpenEnv-style contract:
<code>reset → observation</code> and <code>step(action) → observation, reward, done, info</code>.
Each episode uses isolated in-memory SQLite state, deterministic task grading, and execution-grounded rewards. That contract is what lets you compare runs, swap algorithms,
and keep the same measurement tape: valid table references, stable aggregations, and join logic that does not collapse in edge cases.
</p>
<code class="pre">Conceptual reward:
R_t = w_c*C_t + w_e*E_t + w_p*P_t + w_s*S_t - lambda*Penalty_t
Objective:
J(pi) = E_{tau ~ pi}[sum_{t=0..T} gamma^t * R_t]</code>
<p style="color:var(--muted);margin:0 0 12px;font-size:0.9375rem">
The technical design makes debugging measurable. Session state exposes observations, action history, and reward trajectories.
The reviewer-gated path adds risk control for unsafe submissions while preserving gradient signal (instead of hard-failing every risky step).
That gives the policy consequences it can learn from: what failed, why it failed, and how far a candidate moved toward a valid fix.
</p>
<code class="pre">Data snapshot shown on this page:
- Spider-style industry baseline: 48.2%
- Qwen-7B base: 52.4%
- RL agent headline: 78.5%
- Performance leap view: 0.0% -> 25.0%
- Hard evidence: 32-run eval + sample reward artifacts</code>
<p style="color:var(--muted);margin:12px 0 12px;font-size:0.9375rem">
Traceability is a product decision, not a footnote. This page is an evidence chain: first training context, then live interaction, then artifact-backed plots.
If a metric appears, it should map to concrete run folders, reward JSON files, and checkpoint lineage—so a reviewer can reconstruct the claim without trusting a single screenshot.
</p>
<p class="blog-subhead">How to read what ships here</p>
<ul class="blog-list">
<li><strong>Environment diagram</strong> — the contract between client, API, env core, data layer, and training artifacts.</li>
<li><strong>Playground</strong> — the same <code>/reset</code> and <code>/step</code> loop your trainer uses, in-browser, with explicit session headers.</li>
<li><strong>Benchmark visuals + evidence PNGs</strong> — static exports committed under <code>server/static/</code>; regenerate from real run JSON when you change the story.</li>
</ul>
<p style="color:var(--muted);margin:0 0 12px;font-size:0.9375rem">
Industry and research converge on the same diagnosis: robust text-to-SQL needs context quality, intent handling, dialect robustness, and execution safeguards.
Enterprise SQL debugging stays painful when feedback is detached from runtime behavior. The objective of this Space is to close that gap with a reproducible,
execution-grounded learning loop you can fork, stress-test, and defend in a review.
</p>
<p class="blog-footnote">
Percent ranges (≈30% of time spent on debugging work; ≈10–30% production success vs high-80s/90s benchmark headlines) summarize common practitioner reporting and public benchmark narratives;
your organization’s distributions will differ—treat them as motivation for measurement, not as universal constants.
</p>
<div class="link-list" style="margin-top:12px">
<a href="https://github.com/mdayan8/sql-debug-env.git" target="_blank" rel="noopener">GitHub — mdayan8/sql-debug-env</a>
<a href="https://cloud.google.com/blog/products/databases/techniques-for-improving-text-to-sql" target="_blank" rel="noopener">Google Cloud: techniques for improving text-to-SQL</a>
<a href="https://arxiv.org/abs/2601.18119" target="_blank" rel="noopener">OurBench / Squirrel: enterprise SQL debugging benchmark</a>
</div>
</div>
</div>
</section>
</main>
<footer class="space-footer">
<div class="space-footer-inner">
<span>Custom Space UI · FastAPI <code style="font-family:var(--font-mono);font-size:0.75em">/demo</code></span>
<span>
<a href="https://github.com/mdayan8/sql-debug-env.git" target="_blank" rel="noopener">GitHub</a>
·
<a href="https://huggingface.co/docs/hub/spaces" target="_blank" rel="noopener">Spaces docs</a>
·
<a href="https://huggingface.co/spaces/md896/sql-debug-env/tree/main" target="_blank" rel="noopener">Files &amp; versions</a>
</span>
</div>
</footer>
</div>
<div id="imgLightbox" class="img-lightbox" hidden role="dialog" aria-modal="true" aria-label="Full-size image viewer">
<button type="button" class="img-lightbox-backdrop" id="imgLightboxBackdrop" aria-label="Close viewer"></button>
<div class="img-lightbox-panel">
<button type="button" class="img-lightbox-close" id="imgLightboxClose" aria-label="Close">×</button>
<div class="img-lightbox-toolbar">
<button type="button" id="lbZoomOut" title="Zoom out" aria-label="Zoom out">−</button>
<button type="button" id="lbZoomReset" title="Reset zoom" aria-label="Reset zoom">100%</button>
<button type="button" id="lbZoomIn" title="Zoom in" aria-label="Zoom in">+</button>
</div>
<div class="img-lightbox-scroll" id="imgLightboxScroll">
<div class="img-lightbox-stage" id="imgLightboxStage">
<img id="imgLightboxImg" src="" alt="" decoding="async" />
</div>
</div>
<p class="img-lightbox-caption">
<span id="imgLightboxCaption"></span>
<span class="img-lightbox-hint">Scroll to pan · +/− or Ctrl+scroll to zoom · dark area or Esc to close</span>
</p>
</div>
</div>
<script>
// Wire the "open in new tab" button, when the page has one.
(function () {
  var openTabButton = document.getElementById("btnOpenTab");
  if (!openTabButton) {
    return;
  }

  // Opens the current URL in a new browsing context; if window.open throws,
  // falls back to assigning the current URL to this tab's location.
  function openCurrentPageInNewTab() {
    try {
      window.open(window.location.href, "_blank", "noopener,noreferrer");
    } catch (openError) {
      window.location.href = window.location.href;
    }
  }

  openTabButton.addEventListener("click", openCurrentPageInNewTab);
})();
// Session identifier sent as the X-Session-Id header on the /reset and /step requests below.
// NOTE(review): this is a fixed literal, so every page load sends the same value. If the
// server keys episode state by this header, concurrent visitors would share one episode — confirm.
const sessionId = "demo-session";
/**
 * Toggle the busy state of a button.
 *
 * @param {string} which - id of the element to update; unknown ids are ignored.
 * @param {boolean} on - truthy to disable the element and show the wait text,
 *   falsy to re-enable it and restore the label remembered in data-label.
 */
function setLoading(which, on) {
  var button = document.getElementById(which);
  if (!button) return;

  button.disabled = on;

  if (on) {
    // Remember the caption only once, so a repeated "on" call does not
    // overwrite it with the wait text.
    if (!button.dataset.label) {
      button.dataset.label = button.textContent;
    }
    button.textContent = "Please wait…";
  } else {
    button.textContent = button.dataset.label || button.textContent;
  }
}
/**
 * Start a new episode from the playground.
 *
 * POSTs the selected task id to /reset (with the X-Session-Id header), prints the
 * JSON response into #observation and, on a successful response, loads
 * observation.original_query into the #query editor.
 *
 * Failures (network error, non-JSON body) are written into #observation and logged
 * instead of being left as an unhandled promise rejection, so the user can see
 * that the request did not go through. The returned promise therefore resolves
 * even when the request fails.
 *
 * @returns {Promise<void>}
 */
async function resetTask() {
  setLoading("btnReset", true);
  const observationEl = document.getElementById("observation");
  try {
    const taskId = document.getElementById("taskId").value;
    const resp = await fetch("/reset", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "X-Session-Id": sessionId
      },
      body: JSON.stringify({ task_id: taskId })
    });

    // Read as text first: an error page that is not JSON would otherwise make
    // resp.json() throw and hide the real HTTP status from the user.
    const raw = await resp.text();
    let data;
    try {
      data = JSON.parse(raw);
    } catch (parseError) {
      throw new Error("HTTP " + resp.status + " — response was not JSON: " + raw.slice(0, 300));
    }

    observationEl.textContent = JSON.stringify(data, null, 2);

    // Only replace the editor contents when the reset actually succeeded, so a
    // failed request does not wipe the query the user was working on.
    if (resp.ok) {
      const broken = data && data.observation && data.observation.original_query;
      document.getElementById("query").value = broken || "";
    }
  } catch (err) {
    console.error("resetTask failed", err);
    if (observationEl) {
      observationEl.textContent = "Request failed: " + (err && err.message ? err.message : String(err));
    }
  } finally {
    setLoading("btnReset", false);
  }
}
/**
 * Submit the query currently in the #query editor as a "submit_query" action.
 *
 * POSTs { action: { action_type, query } } to /step (with the X-Session-Id header)
 * and prints the JSON response into #result.
 *
 * Failures (network error, non-JSON body) are written into #result and logged
 * instead of being left as an unhandled promise rejection. The returned promise
 * therefore resolves even when the request fails.
 *
 * @returns {Promise<void>}
 */
async function submitQuery() {
  setLoading("btnSubmit", true);
  const resultEl = document.getElementById("result");
  try {
    const query = document.getElementById("query").value;
    const payload = {
      action: {
        action_type: "submit_query",
        query: query
      }
    };
    const resp = await fetch("/step", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "X-Session-Id": sessionId
      },
      body: JSON.stringify(payload)
    });

    // Read as text first: an error page that is not JSON would otherwise make
    // resp.json() throw and hide the real HTTP status from the user.
    const raw = await resp.text();
    let data;
    try {
      data = JSON.parse(raw);
    } catch (parseError) {
      throw new Error("HTTP " + resp.status + " — response was not JSON: " + raw.slice(0, 300));
    }

    resultEl.textContent = JSON.stringify(data, null, 2);
  } catch (err) {
    console.error("submitQuery failed", err);
    if (resultEl) {
      resultEl.textContent = "Request failed: " + (err && err.message ? err.message : String(err));
    }
  } finally {
    setLoading("btnSubmit", false);
  }
}
/**
 * Image lightbox.
 *
 * Clicking an <img class="sde-zoomable"> inside <main> opens it in the #imgLightbox
 * dialog. The dialog offers zoom buttons, Ctrl/Meta + wheel zoom (clamped to
 * 0.25x–4x), and closes via the close button, the backdrop, or Escape.
 *
 * Because the dialog is marked aria-modal, this also keeps Tab focus inside the
 * dialog while it is open and returns focus to the previously focused element
 * when it closes. Controls other than the core elements are wired only when present,
 * so a missing button cannot abort the rest of the setup.
 */
(function imageLightbox() {
  var main = document.querySelector("main");
  var lb = document.getElementById("imgLightbox");
  var stage = document.getElementById("imgLightboxStage");
  var lbImg = document.getElementById("imgLightboxImg");
  var cap = document.getElementById("imgLightboxCaption");
  var scrollEl = document.getElementById("imgLightboxScroll");
  var closeBtn = document.getElementById("imgLightboxClose");
  var backdrop = document.getElementById("imgLightboxBackdrop");
  var zIn = document.getElementById("lbZoomIn");
  var zOut = document.getElementById("lbZoomOut");
  var zReset = document.getElementById("lbZoomReset");
  if (!main || !lb || !stage || !lbImg || !scrollEl) return;

  var scale = 1;
  // Element that had focus when the dialog opened; focus goes back to it on close.
  var returnFocusTo = null;

  // addEventListener that tolerates a missing (null) element.
  function listen(el, type, handler) {
    if (el) el.addEventListener(type, handler);
  }

  // Size the image to its natural dimensions multiplied by the current scale.
  function applyZoomedSize() {
    var nw = lbImg.naturalWidth;
    var nh = lbImg.naturalHeight;
    if (!nw || !nh) return; // not loaded yet; the load handler applies it later
    lbImg.style.width = nw * scale + "px";
    lbImg.style.height = nh * scale + "px";
  }

  function clearZoomedSize() {
    lbImg.style.width = "";
    lbImg.style.height = "";
  }

  // Clamp the requested scale to [0.25, 4] and resize the image.
  function setScale(next) {
    scale = Math.min(4, Math.max(0.25, next));
    applyZoomedSize();
  }

  // Scroll the viewport so the middle of the image is visible.
  function centerScroll() {
    var el = scrollEl;
    el.scrollLeft = Math.max(0, (el.scrollWidth - el.clientWidth) / 2);
    el.scrollTop = Math.max(0, (el.scrollHeight - el.clientHeight) / 2);
  }

  // Open the dialog showing the same image as the clicked thumbnail.
  function openFrom(thumb) {
    if (lb.hidden) {
      returnFocusTo = document.activeElement;
    }
    clearZoomedSize();
    lbImg.removeAttribute("src");
    scale = 1;
    lbImg.onload = function () {
      lbImg.onload = null;
      applyZoomedSize();
      requestAnimationFrame(function () {
        centerScroll();
        if (closeBtn) closeBtn.focus();
      });
    };
    lbImg.src = thumb.currentSrc || thumb.src;
    lbImg.alt = thumb.getAttribute("alt") || "";

    // Caption: prefer the enclosing <figure>'s <figcaption>, else the (truncated) alt text.
    var fig = thumb.closest("figure");
    var fc = fig && fig.querySelector("figcaption");
    var capText = fc ? fc.textContent.replace(/\s+/g, " ").trim() : "";
    if (!capText) {
      capText = (thumb.getAttribute("alt") || "Image").trim();
      if (capText.length > 140) capText = capText.slice(0, 137) + "…";
    }
    if (cap) cap.textContent = capText;

    lb.hidden = false;
    lb.setAttribute("aria-hidden", "false");
    document.body.style.overflow = "hidden"; // stop the page behind from scrolling

    // An already-complete image may not fire a fresh load event; run the handler directly.
    if (lbImg.complete && lbImg.naturalWidth) {
      lbImg.onload();
    }
  }

  // Close the dialog, reset zoom state, and hand focus back to where it was.
  function closeLb() {
    lb.hidden = true;
    lb.setAttribute("aria-hidden", "true");
    lbImg.onload = null;
    lbImg.removeAttribute("src");
    clearZoomedSize();
    scale = 1;
    document.body.style.overflow = "";
    if (returnFocusTo && typeof returnFocusTo.focus === "function") {
      returnFocusTo.focus();
    }
    returnFocusTo = null;
  }

  // Keep Tab / Shift+Tab cycling through the dialog's buttons while it is open.
  function trapTab(ev) {
    var focusables = lb.querySelectorAll("button:not([disabled])");
    if (!focusables.length) return;
    var first = focusables[0];
    var last = focusables[focusables.length - 1];
    var active = document.activeElement;
    if (!lb.contains(active)) {
      ev.preventDefault();
      first.focus();
    } else if (ev.shiftKey && active === first) {
      ev.preventDefault();
      last.focus();
    } else if (!ev.shiftKey && active === last) {
      ev.preventDefault();
      first.focus();
    }
  }

  // Delegated click: any zoomable image inside <main> opens the lightbox.
  main.addEventListener("click", function (ev) {
    var t = ev.target;
    if (t && t.tagName === "IMG" && t.classList.contains("sde-zoomable")) {
      ev.preventDefault();
      openFrom(t);
    }
  });

  listen(closeBtn, "click", closeLb);
  listen(backdrop, "click", closeLb);
  listen(zIn, "click", function () { setScale(scale * 1.25); });
  listen(zOut, "click", function () { setScale(scale / 1.25); });
  listen(zReset, "click", function () { setScale(1); centerScroll(); });

  // Ctrl/Meta + wheel zooms; a plain wheel is left alone so the image can be panned.
  lb.addEventListener("wheel", function (ev) {
    if (lb.hidden) return;
    if (!ev.ctrlKey && !ev.metaKey) return;
    ev.preventDefault();
    setScale(scale * (ev.deltaY < 0 ? 1.1 : 0.9));
  }, { passive: false });

  document.addEventListener("keydown", function (ev) {
    if (lb.hidden) return;
    if (ev.key === "Escape") {
      ev.preventDefault();
      closeLb();
    } else if (ev.key === "Tab") {
      trapTab(ev);
    }
  });
})();
</script>
</body>
</html>