Upload 13 files
Browse files - server/app.py +51 -24
server/app.py
CHANGED
|
@@ -223,25 +223,45 @@ def _dashboard_html() -> str:
|
|
| 223 |
<h2>Training evidence</h2>
|
| 224 |
<p class='sub'>
|
| 225 |
Committed artifacts from the reference training run
|
| 226 |
-
(Qwen2.5-1.5B-Instruct, 8 episodes/task, 3 epochs)
|
|
|
|
| 227 |
</p>
|
| 228 |
<div class='plots'>
|
| 229 |
<figure>
|
| 230 |
-
<
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
| 234 |
</figure>
|
| 235 |
<figure>
|
| 236 |
-
<
|
| 237 |
-
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
| 239 |
</figure>
|
| 240 |
<figure>
|
| 241 |
-
<
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
</figure>
|
| 246 |
</div>
|
| 247 |
<p class='sub' style='margin-top:0.75rem'>
|
|
@@ -250,9 +270,7 @@ def _dashboard_html() -> str:
|
|
| 250 |
·
|
| 251 |
<a href='/artifacts/training_log.json'>training_log.json</a>
|
| 252 |
·
|
| 253 |
-
<a href='/artifacts/
|
| 254 |
-
·
|
| 255 |
-
<a href='/artifacts/summary_metrics_qwen0p5b.json'>0.5B metrics</a>
|
| 256 |
</p>
|
| 257 |
""".format(hard=_fmt(headline_delta))
|
| 258 |
else:
|
|
@@ -358,19 +376,19 @@ def _dashboard_html() -> str:
|
|
| 358 |
background: radial-gradient(1000px 600px at 10% -10%, #1e293b, var(--bg));
|
| 359 |
color: var(--text); padding: 2rem; margin: 0; min-height: 100vh;
|
| 360 |
}}
|
| 361 |
-
header {{ display:flex; align-items:center; justify-content:space-between; max-width:
|
| 362 |
.brand {{ display:flex; align-items:center; gap:0.75rem; }}
|
| 363 |
.logo {{ width:44px; height:44px; border-radius:10px; background:linear-gradient(135deg,var(--primary),var(--accent)); }}
|
| 364 |
h1 {{ font-size:1.6rem; margin:0; }}
|
| 365 |
-
h2 {{ font-size:1.
|
| 366 |
.sub {{ color: var(--muted); }}
|
| 367 |
-
.grid {{ display:grid; grid-template-columns: repeat(auto-fit,minmax(240px,1fr)); gap:1rem; max-width:
|
| 368 |
.grid-3 {{ grid-template-columns: repeat(auto-fit,minmax(280px,1fr)); }}
|
| 369 |
.card {{ background: var(--card); border: 1px solid #1f2a44; padding: 1.25rem; border-radius: 14px; }}
|
| 370 |
.card h3 {{ margin:0 0 0.5rem; font-size:1rem; color:#f1f5f9; }}
|
| 371 |
.pill {{ display:inline-block; padding:2px 8px; margin:2px; border-radius:999px; background:#1e293b; border:1px solid #334155; color:#cbd5e1; font-size:0.78rem; }}
|
| 372 |
.pill.cta {{ background:linear-gradient(135deg,var(--primary),var(--accent)); color:#0b1225; border-color:transparent; font-weight:600; }}
|
| 373 |
-
.container {{ max-width:
|
| 374 |
code {{ background:#0b1225; border:1px solid #1f2a44; padding:2px 6px; border-radius:6px; color:#67e8f9; font-family:'JetBrains Mono', monospace; }}
|
| 375 |
pre {{ background:#0b1225; border:1px solid #1f2a44; padding: 1rem; border-radius: 10px; color:#cbd5e1; overflow-x:auto; font-size:0.85rem; }}
|
| 376 |
a {{ color: var(--accent); text-decoration: none; }}
|
|
@@ -379,11 +397,20 @@ def _dashboard_html() -> str:
|
|
| 379 |
.kpi .num {{ font-size:1.6rem; font-weight:700; color:#f8fafc; }}
|
| 380 |
.kpi .lbl {{ color: var(--muted); font-size:0.8rem; }}
|
| 381 |
.kpi .num.good {{ color: var(--good); }}
|
| 382 |
-
footer {{ max-width:
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
.plots
|
| 386 |
-
.plots
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
.table-wrap {{ overflow-x:auto; }}
|
| 388 |
table {{ width:100%; border-collapse: collapse; margin-top:0.5rem; font-size:0.9rem; }}
|
| 389 |
th, td {{ padding:0.5rem 0.75rem; text-align:left; border-bottom:1px solid #1f2a44; }}
|
|
|
|
| 223 |
<h2>Training evidence</h2>
|
| 224 |
<p class='sub'>
|
| 225 |
Committed artifacts from the reference training run
|
| 226 |
+
(Qwen2.5-1.5B-Instruct, 8 episodes/task, 3 epochs) plus the
|
| 227 |
+
Qwen2.5-0.5B-Instruct ablation. Click any plot to open it full-size.
|
| 228 |
</p>
|
| 229 |
<div class='plots'>
|
| 230 |
<figure>
|
| 231 |
+
<a href='/artifacts/reward_curve.png' target='_blank' rel='noopener'>
|
| 232 |
+
<img src='/artifacts/reward_curve.png' alt='Reward curve by policy (1.5B)' loading='lazy' />
|
| 233 |
+
</a>
|
| 234 |
+
<figcaption><strong>1.5B reward curve.</strong> Mean episodic reward per task tier
|
| 235 |
+
across Random / Heuristic / Base-LLM / SFT-LLM. SFT matches the heuristic
|
| 236 |
+
demonstrator across every tier and outperforms the untuned base by
|
| 237 |
+
<strong>+{hard}</strong> on hard incidents.</figcaption>
|
| 238 |
</figure>
|
| 239 |
<figure>
|
| 240 |
+
<a href='/artifacts/training_curve.png' target='_blank' rel='noopener'>
|
| 241 |
+
<img src='/artifacts/training_curve.png' alt='SFT training loss and token accuracy (1.5B)' loading='lazy' />
|
| 242 |
+
</a>
|
| 243 |
+
<figcaption><strong>1.5B training curve.</strong> Supervised loss collapses from
|
| 244 |
+
<code>~2.84 → ~0.02</code> and next-token accuracy climbs from
|
| 245 |
+
<code>~0.49 → ~0.99</code> over three epochs on 680 rollout tokens.</figcaption>
|
| 246 |
</figure>
|
| 247 |
<figure>
|
| 248 |
+
<a href='/artifacts/reward_components.png' target='_blank' rel='noopener'>
|
| 249 |
+
<img src='/artifacts/reward_components.png' alt='Reward component decomposition (1.5B)' loading='lazy' />
|
| 250 |
+
</a>
|
| 251 |
+
<figcaption><strong>1.5B reward-component breakdown.</strong> SFT reproduces the
|
| 252 |
+
heuristic's positive components (<code>clue_bonus</code>,
|
| 253 |
+
<code>mitigation_correct</code>, <code>closure_correct</code>,
|
| 254 |
+
<code>speed_bonus</code>) while the base model stalls on
|
| 255 |
+
<code>step_cost</code> and SLA penalties.</figcaption>
|
| 256 |
+
</figure>
|
| 257 |
+
<figure>
|
| 258 |
+
<a href='/artifacts/reward_curve_qwen0p5b.png' target='_blank' rel='noopener'>
|
| 259 |
+
<img src='/artifacts/reward_curve_qwen0p5b.png' alt='Reward curve by policy (0.5B ablation)' loading='lazy' />
|
| 260 |
+
</a>
|
| 261 |
+
<figcaption><strong>0.5B ablation reward curve.</strong> Same pipeline, smaller
|
| 262 |
+
backbone. SFT improves by only <strong>+0.43 / +0.14 / +0.00</strong> over base —
|
| 263 |
+
the 0.5B model is too small to absorb the multi-step, role-gated policy.
|
| 264 |
+
Scale is the story.</figcaption>
|
| 265 |
</figure>
|
| 266 |
</div>
|
| 267 |
<p class='sub' style='margin-top:0.75rem'>
|
|
|
|
| 270 |
·
|
| 271 |
<a href='/artifacts/training_log.json'>training_log.json</a>
|
| 272 |
·
|
| 273 |
+
<a href='/artifacts/summary_metrics_qwen0p5b.json'>summary_metrics_qwen0p5b.json</a>
|
|
|
|
|
|
|
| 274 |
</p>
|
| 275 |
""".format(hard=_fmt(headline_delta))
|
| 276 |
else:
|
|
|
|
| 376 |
background: radial-gradient(1000px 600px at 10% -10%, #1e293b, var(--bg));
|
| 377 |
color: var(--text); padding: 2rem; margin: 0; min-height: 100vh;
|
| 378 |
}}
|
| 379 |
+
header {{ display:flex; align-items:center; justify-content:space-between; max-width:1200px; margin:0 auto 1.5rem; flex-wrap:wrap; gap:1rem; }}
|
| 380 |
.brand {{ display:flex; align-items:center; gap:0.75rem; }}
|
| 381 |
.logo {{ width:44px; height:44px; border-radius:10px; background:linear-gradient(135deg,var(--primary),var(--accent)); }}
|
| 382 |
h1 {{ font-size:1.6rem; margin:0; }}
|
| 383 |
+
h2 {{ font-size:1.25rem; margin:1.8rem 0 0.6rem; color:#cbd5e1; }}
|
| 384 |
.sub {{ color: var(--muted); }}
|
| 385 |
+
.grid {{ display:grid; grid-template-columns: repeat(auto-fit,minmax(240px,1fr)); gap:1rem; max-width:1200px; margin:0 auto; }}
|
| 386 |
.grid-3 {{ grid-template-columns: repeat(auto-fit,minmax(280px,1fr)); }}
|
| 387 |
.card {{ background: var(--card); border: 1px solid #1f2a44; padding: 1.25rem; border-radius: 14px; }}
|
| 388 |
.card h3 {{ margin:0 0 0.5rem; font-size:1rem; color:#f1f5f9; }}
|
| 389 |
.pill {{ display:inline-block; padding:2px 8px; margin:2px; border-radius:999px; background:#1e293b; border:1px solid #334155; color:#cbd5e1; font-size:0.78rem; }}
|
| 390 |
.pill.cta {{ background:linear-gradient(135deg,var(--primary),var(--accent)); color:#0b1225; border-color:transparent; font-weight:600; }}
|
| 391 |
+
.container {{ max-width: 1200px; margin: 0 auto; }}
|
| 392 |
code {{ background:#0b1225; border:1px solid #1f2a44; padding:2px 6px; border-radius:6px; color:#67e8f9; font-family:'JetBrains Mono', monospace; }}
|
| 393 |
pre {{ background:#0b1225; border:1px solid #1f2a44; padding: 1rem; border-radius: 10px; color:#cbd5e1; overflow-x:auto; font-size:0.85rem; }}
|
| 394 |
a {{ color: var(--accent); text-decoration: none; }}
|
|
|
|
| 397 |
.kpi .num {{ font-size:1.6rem; font-weight:700; color:#f8fafc; }}
|
| 398 |
.kpi .lbl {{ color: var(--muted); font-size:0.8rem; }}
|
| 399 |
.kpi .num.good {{ color: var(--good); }}
|
| 400 |
+
footer {{ max-width:1200px; margin:2rem auto 0; color:var(--muted); font-size:0.85rem; }}
|
| 401 |
+
/* Training-evidence plots: one plot per row, full content width,
|
| 402 |
+
so dense charts (reward curves, stacked bars) stay readable. */
|
| 403 |
+
.plots {{ display:flex; flex-direction:column; gap:1.5rem; max-width:1200px; margin:0 auto; }}
|
| 404 |
+
.plots figure {{ background: var(--card); border:1px solid #1f2a44; border-radius: 14px; padding: 1.25rem; margin:0; }}
|
| 405 |
+
.plots figure a {{ display:block; }}
|
| 406 |
+
.plots img {{
|
| 407 |
+
width:100%; height:auto; display:block;
|
| 408 |
+
max-width:1100px; margin:0 auto;
|
| 409 |
+
border-radius:10px; background:#0b1225;
|
| 410 |
+
transition: transform 0.2s ease;
|
| 411 |
+
}}
|
| 412 |
+
.plots img:hover {{ transform: scale(1.01); }}
|
| 413 |
+
.plots figcaption {{ color: var(--muted); font-size:0.9rem; margin-top:0.75rem; line-height:1.55; text-align:center; max-width:1000px; margin-left:auto; margin-right:auto; }}
|
| 414 |
.table-wrap {{ overflow-x:auto; }}
|
| 415 |
table {{ width:100%; border-collapse: collapse; margin-top:0.5rem; font-size:0.9rem; }}
|
| 416 |
th, td {{ padding:0.5rem 0.75rem; text-align:left; border-bottom:1px solid #1f2a44; }}
|