NorthernTribe-Research committed on
Commit
4647b37
verified
1 Parent(s): 668cc30

Add live tactical training visualization panel and UI telemetry stream.

Browse files
Files changed (2) hide show
  1. README.md +1 -0
  2. app.py +351 -5
README.md CHANGED
@@ -42,6 +42,7 @@ If no token is available, public dataset training still works and push is automa
42
  - `Run Evaluation After Training`: toggles post-train eval in runtime config.
43
  - `Enforce Quality Gate`: enables/disables promotion gate checks.
44
  - `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
 
45
  - `Preflight Only (No Training)`: validates pipeline with `--dry-run`.
46
  - `Force Dataset Redownload`: bypasses cached parquet files.
47
  - `Abort Active Run`: cancels active subprocess tree.
 
42
  - `Run Evaluation After Training`: toggles post-train eval in runtime config.
43
  - `Enforce Quality Gate`: enables/disables promotion gate checks.
44
  - `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
45
+ - `Live Tactical Telemetry`: real-time stage progression, runtime posture, loss sparkline, and gate/push state.
46
  - `Preflight Only (No Training)`: validates pipeline with `--dry-run`.
47
  - `Force Dataset Redownload`: bypasses cached parquet files.
48
  - `Abort Active Run`: cancels active subprocess tree.
app.py CHANGED
@@ -4,6 +4,7 @@
4
  from __future__ import annotations
5
 
6
  import datetime as dt
 
7
  import inspect
8
  import json
9
  import os
@@ -37,6 +38,8 @@ CREDENTIAL_FILE_CANDIDATES = [
37
  ]
38
 
39
  REPO_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,95}/[A-Za-z0-9][A-Za-z0-9._-]{0,95}$")
 
 
40
 
41
  RUN_STATE_LOCK = threading.Lock()
42
  RUN_IN_PROGRESS = False
@@ -180,6 +183,108 @@ TACTICAL_CSS = """
180
  letter-spacing: 0.12em;
181
  text-transform: uppercase;
182
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  """
184
 
185
  TACTICAL_HEADER_HTML = """
@@ -268,6 +373,207 @@ def summary_text(summary: Dict[str, Any]) -> str:
268
  return json.dumps(summary, ensure_ascii=True, indent=2)
269
 
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  def _token_from_credentials_file(path: Path) -> Optional[str]:
272
  try:
273
  data = json.loads(path.read_text(encoding="utf-8"))
@@ -560,15 +866,15 @@ def make_copyable_textbox(
560
  return gr.Textbox(**textbox_kwargs)
561
 
562
 
563
- def clear_outputs() -> Tuple[str, str, str]:
564
- return "", "Idle", ""
565
 
566
 
567
  def cancel_pipeline() -> str:
568
  return request_cancel()
569
 
570
 
571
- def run_pipeline(
572
  dataset_repo_id: str,
573
  model_repo_id: str,
574
  base_model_id: str,
@@ -920,6 +1226,45 @@ def run_pipeline(
920
  finish_run()
921
 
922
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
923
  with gr.Blocks(title="Math Conjecture Trainer Space") as demo:
924
  gr.HTML(TACTICAL_HEADER_HTML)
925
  gr.Markdown(PROJECT_DESCRIPTION)
@@ -983,6 +1328,7 @@ with gr.Blocks(title="Math Conjecture Trainer Space") as demo:
983
  stop_button = gr.Button("Abort Active Run", variant="stop")
984
  clear_button = gr.Button("Reset Console")
985
 
 
986
  status = gr.Textbox(label="Run Status", value="Idle", interactive=False)
987
  logs = make_copyable_textbox(label="Telemetry Log", lines=24, max_lines=30, interactive=False)
988
  run_summary = make_copyable_textbox(
@@ -1011,10 +1357,10 @@ with gr.Blocks(title="Math Conjecture Trainer Space") as demo:
1011
  force_redownload,
1012
  preflight_only,
1013
  ],
1014
- outputs=[logs, status, run_summary],
1015
  )
1016
  stop_button.click(fn=cancel_pipeline, inputs=None, outputs=[status], queue=False)
1017
- clear_button.click(fn=clear_outputs, inputs=None, outputs=[logs, status, run_summary], queue=False)
1018
 
1019
 
1020
  if __name__ == "__main__":
 
4
  from __future__ import annotations
5
 
6
  import datetime as dt
7
+ import html
8
  import inspect
9
  import json
10
  import os
 
38
  ]
39
 
40
  REPO_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,95}/[A-Za-z0-9][A-Za-z0-9._-]{0,95}$")
41
# Matches stage markers such as "[stage 2]" and captures the stage number.
STAGE_LOG_RE = re.compile(r"\[stage\s+(\d+)\]")
# Matches "loss"/"train_loss" followed by ":" or "=" and a number, capturing the
# number. The key may be bare (loss=0.42) or quoted as in dict/JSON output
# ({'loss': 0.42}, "train_loss": 0.42); the optional closing quote after the key
# is what lets the quoted forms match. The leading delimiter keeps keys such as
# "eval_loss" from matching on their "loss" suffix.
LOSS_LOG_RE = re.compile(
    r"(?:^|[\s{,'\"])(?:loss|train_loss)['\"]?\s*[:=]\s*([-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)"
)
43
 
44
  RUN_STATE_LOCK = threading.Lock()
45
  RUN_IN_PROGRESS = False
 
183
  letter-spacing: 0.12em;
184
  text-transform: uppercase;
185
  }
186
+
187
+ .ops-visual {
188
+ border: 1px solid var(--ops-border);
189
+ background: linear-gradient(180deg, #101010 0%, #0b0b0b 100%);
190
+ padding: 12px;
191
+ }
192
+
193
+ .ops-visual-head {
194
+ display: flex;
195
+ justify-content: space-between;
196
+ align-items: center;
197
+ margin-bottom: 10px;
198
+ gap: 10px;
199
+ }
200
+
201
+ .ops-visual-title {
202
+ font-family: "Rajdhani", "IBM Plex Mono", monospace;
203
+ font-weight: 700;
204
+ letter-spacing: 0.14em;
205
+ text-transform: uppercase;
206
+ color: #f1f1f1;
207
+ }
208
+
209
+ .ops-visual-sub {
210
+ color: #9f9f9f;
211
+ font-size: 0.78rem;
212
+ letter-spacing: 0.08em;
213
+ text-transform: uppercase;
214
+ }
215
+
216
+ .ops-grid {
217
+ display: grid;
218
+ grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
219
+ gap: 10px;
220
+ }
221
+
222
+ .ops-card {
223
+ border: 1px solid #323232;
224
+ background: linear-gradient(180deg, #161616 0%, #101010 100%);
225
+ padding: 9px;
226
+ min-height: 72px;
227
+ }
228
+
229
+ .ops-k {
230
+ color: #9a9a9a;
231
+ font-size: 0.68rem;
232
+ letter-spacing: 0.11em;
233
+ text-transform: uppercase;
234
+ }
235
+
236
+ .ops-v {
237
+ color: #f0f0f0;
238
+ font-family: "Rajdhani", "IBM Plex Mono", monospace;
239
+ font-size: 1.05rem;
240
+ margin-top: 5px;
241
+ letter-spacing: 0.05em;
242
+ }
243
+
244
+ .ops-v-small {
245
+ color: #d1d1d1;
246
+ font-size: 0.83rem;
247
+ margin-top: 4px;
248
+ }
249
+
250
+ .ops-meter {
251
+ margin-top: 8px;
252
+ width: 100%;
253
+ height: 8px;
254
+ border: 1px solid #383838;
255
+ background: #111111;
256
+ position: relative;
257
+ overflow: hidden;
258
+ }
259
+
260
+ .ops-meter-fill {
261
+ position: absolute;
262
+ left: 0;
263
+ top: 0;
264
+ bottom: 0;
265
+ background: linear-gradient(90deg, #bdbdbd 0%, #f0f0f0 100%);
266
+ }
267
+
268
+ .ops-spark {
269
+ margin-top: 8px;
270
+ border: 1px solid #343434;
271
+ background: #0e0e0e;
272
+ padding: 3px;
273
+ }
274
+
275
+ .ops-spark svg {
276
+ width: 100%;
277
+ height: 74px;
278
+ display: block;
279
+ }
280
+
281
+ .ops-foot {
282
+ margin-top: 10px;
283
+ color: #8f8f8f;
284
+ font-size: 0.74rem;
285
+ letter-spacing: 0.08em;
286
+ text-transform: uppercase;
287
+ }
288
  """
289
 
290
  TACTICAL_HEADER_HTML = """
 
373
  return json.dumps(summary, ensure_ascii=True, indent=2)
374
 
375
 
376
+ def _as_dict(value: Any) -> Dict[str, Any]:
377
+ return value if isinstance(value, dict) else {}
378
+
379
+
380
+ def _parse_summary_json(text: str) -> Dict[str, Any]:
381
+ if not text:
382
+ return {}
383
+ try:
384
+ parsed = json.loads(text)
385
+ except json.JSONDecodeError:
386
+ return {}
387
+ return parsed if isinstance(parsed, dict) else {}
388
+
389
+
390
+ def _fmt_pct(value: Any) -> str:
391
+ try:
392
+ return f"{float(value) * 100:.1f}%"
393
+ except (TypeError, ValueError):
394
+ return "--"
395
+
396
+
397
+ def _fmt_float(value: Any, digits: int = 3) -> str:
398
+ try:
399
+ return f"{float(value):.{digits}f}"
400
+ except (TypeError, ValueError):
401
+ return "--"
402
+
403
+
404
+ def _extract_loss_values(log_text: str, limit: int = 48) -> List[float]:
405
+ losses: List[float] = []
406
+ for line in log_text.splitlines():
407
+ lower = line.lower()
408
+ if "eval_loss" in lower:
409
+ continue
410
+ match = LOSS_LOG_RE.search(lower)
411
+ if match is None:
412
+ continue
413
+ try:
414
+ value = float(match.group(1))
415
+ except (TypeError, ValueError):
416
+ continue
417
+ if not (value >= 0.0):
418
+ continue
419
+ losses.append(value)
420
+ if len(losses) > limit:
421
+ losses = losses[-limit:]
422
+ return losses
423
+
424
+
425
+ def _build_loss_sparkline(losses: List[float]) -> str:
426
+ if not losses:
427
+ return "<div class='ops-v-small'>No live loss points yet.</div>"
428
+ width = 520
429
+ height = 74
430
+ pad = 5
431
+ min_v = min(losses)
432
+ max_v = max(losses)
433
+ span = max(max_v - min_v, 1e-9)
434
+
435
+ points: List[str] = []
436
+ for idx, value in enumerate(losses):
437
+ x = pad + (idx * (width - 2 * pad) / max(1, len(losses) - 1))
438
+ y = pad + ((max_v - value) * (height - 2 * pad) / span)
439
+ points.append(f"{x:.2f},{y:.2f}")
440
+ polyline = " ".join(points)
441
+ latest = losses[-1]
442
+ return (
443
+ f"<div class='ops-v-small'>Latest train loss: <strong>{_fmt_float(latest, 4)}</strong></div>"
444
+ "<div class='ops-spark'>"
445
+ f"<svg viewBox='0 0 {width} {height}' preserveAspectRatio='none'>"
446
+ f"<polyline points='{polyline}' fill='none' stroke='#f0f0f0' stroke-width='2' />"
447
+ "</svg>"
448
+ "</div>"
449
+ )
450
+
451
+
452
def _infer_stage_snapshot(summary: Dict[str, Any], log_text: str) -> Dict[str, Any]:
    """Estimate stage progress from the run summary and the tail of the log.

    The completed count comes from ``training_summary.stages_ran`` (capped at
    the stage count). The active stage is the most recent ``STAGE_LOG_RE``
    match within the last 350 log lines, or ``None`` when there is none.

    Returns:
        A dict with ``start_stage``, ``stage_count``, ``completed``,
        ``active_stage`` and ``progress`` (a fraction clamped to [0, 1]).
    """
    first_stage = max(1, _safe_int(summary.get("start_stage"), 1))
    total_stages = max(1, _safe_int(summary.get("max_stages"), TEMPLATE_STAGE_COUNT))

    ran = _as_dict(summary.get("training_summary")).get("stages_ran")
    finished = min(total_stages, len(ran)) if isinstance(ran, list) else 0

    # Walk the log tail newest-first and stop at the first stage marker.
    tail = log_text.splitlines()[-350:]
    hits = (STAGE_LOG_RE.search(entry) for entry in reversed(tail))
    newest = next((hit for hit in hits if hit), None)
    current = _safe_int(newest.group(1), 0) if newest else None

    if finished >= total_stages:
        fraction = 1.0
    else:
        fraction = finished / total_stages
        if current and current >= first_stage:
            # An in-flight stage counts as 0.35 of a stage toward progress.
            in_flight = (current - first_stage) + 0.35
            fraction = max(fraction, min(1.0, in_flight / total_stages))

    return {
        "start_stage": first_stage,
        "stage_count": total_stages,
        "completed": finished,
        "active_stage": current,
        "progress": max(0.0, min(1.0, fraction)),
    }
484
+
485
+
486
def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str) -> str:
    """Render the "Live Tactical Telemetry" panel as an HTML fragment.

    Args:
        summary: Run summary; the ``runtime``, ``quality_gate``, ``evaluation``,
            ``push`` and ``run_label`` entries are read when present. Anything
            that is not a dict is treated as empty.
        status_text: Status line shown under the run label ("Idle" if empty).
        log_text: Log text used for the stage hint and the loss sparkline;
            ``None`` is treated as an empty log.

    Returns:
        The panel markup with text values HTML-escaped.
    """
    # Normalise inputs so a partial update cannot crash the helpers below.
    log_text = log_text or ""
    safe_summary = _as_dict(summary)
    runtime = _as_dict(safe_summary.get("runtime"))
    quality_gate = _as_dict(safe_summary.get("quality_gate"))
    evaluation = _as_dict(safe_summary.get("evaluation"))
    push_report = _as_dict(safe_summary.get("push"))

    run_label = html.escape(str(safe_summary.get("run_label") or "not-started"))
    status_value = html.escape(str(status_text or "Idle"))
    runtime_mode = "GPU READY" if runtime.get("cuda_available") else "CPU / PRECHECK"
    runtime_mode = html.escape(runtime_mode)
    device_count = _safe_int(runtime.get("cuda_device_count"), 0)

    gate_enabled = bool(quality_gate.get("enabled"))
    gate_passed = quality_gate.get("passed")
    if not gate_enabled:
        gate_text = "Disabled"
    elif gate_passed is True:
        gate_text = "Passed"
    elif gate_passed is False:
        gate_text = "Failed"
    else:
        # Gate enabled but no boolean verdict recorded yet.
        gate_text = "Pending"

    stage_meta = _infer_stage_snapshot(safe_summary, log_text)
    progress_pct = int(stage_meta["progress"] * 100)
    active_stage = stage_meta.get("active_stage")
    stage_hint = f"active stage {active_stage}" if active_stage else "awaiting stage telemetry"
    stage_hint = html.escape(stage_hint)

    losses = _extract_loss_values(log_text)
    sparkline_html = _build_loss_sparkline(losses)

    pass_k = _fmt_pct(evaluation.get("pass_at_k"))
    pass_1 = _fmt_pct(evaluation.get("pass_at_1"))
    exact_k = _fmt_pct(evaluation.get("exact_at_k"))

    push_state = "Pending"
    if push_report:
        requested = bool(push_report.get("requested"))
        performed = bool(push_report.get("performed"))
        if not requested:
            push_state = "Not requested"
        elif performed:
            push_state = "Published"
        else:
            push_state = "Blocked"

    # The footer separators are written as the &middot; entity so they render
    # as middle dots regardless of how the source file is encoded or decoded.
    return f"""
<div class="ops-visual">
  <div class="ops-visual-head">
    <div class="ops-visual-title">Live Tactical Telemetry</div>
    <div class="ops-visual-sub">Monochrome Ops Feed</div>
  </div>
  <div class="ops-grid">
    <div class="ops-card">
      <div class="ops-k">Run</div>
      <div class="ops-v">{run_label}</div>
      <div class="ops-v-small">{status_value}</div>
    </div>
    <div class="ops-card">
      <div class="ops-k">Runtime</div>
      <div class="ops-v">{runtime_mode}</div>
      <div class="ops-v-small">cuda devices: {device_count}</div>
    </div>
    <div class="ops-card">
      <div class="ops-k">Stage Progress</div>
      <div class="ops-v">{stage_meta['completed']} / {stage_meta['stage_count']}</div>
      <div class="ops-v-small">{stage_hint}</div>
      <div class="ops-meter"><div class="ops-meter-fill" style="width:{progress_pct}%"></div></div>
    </div>
    <div class="ops-card">
      <div class="ops-k">Quality Gate</div>
      <div class="ops-v">{html.escape(gate_text)}</div>
      <div class="ops-v-small">push: {html.escape(push_state)}</div>
    </div>
    <div class="ops-card">
      <div class="ops-k">Eval pass@k</div>
      <div class="ops-v">{pass_k}</div>
      <div class="ops-v-small">pass@1 {pass_1} | exact@k {exact_k}</div>
    </div>
    <div class="ops-card">
      <div class="ops-k">Loss Stream</div>
      {sparkline_html}
    </div>
  </div>
  <div class="ops-foot">dull tactical theme &middot; black / grey / white &middot; anduril/palantir-inspired operations console</div>
</div>
""".strip()
575
+
576
+
577
  def _token_from_credentials_file(path: Path) -> Optional[str]:
578
  try:
579
  data = json.loads(path.read_text(encoding="utf-8"))
 
866
  return gr.Textbox(**textbox_kwargs)
867
 
868
 
869
def clear_outputs() -> Tuple[str, str, str, str]:
    """Return reset values: empty log, "Idle" status, empty summary, idle telemetry panel."""
    blank = ""
    idle_status = "Idle"
    idle_panel = render_ops_visual({}, idle_status, blank)
    return blank, idle_status, blank, idle_panel
871
 
872
 
873
def cancel_pipeline() -> str:
    """Call ``request_cancel()`` and return its result for the status output."""
    outcome = request_cancel()
    return outcome
875
 
876
 
877
+ def run_pipeline_core(
878
  dataset_repo_id: str,
879
  model_repo_id: str,
880
  base_model_id: str,
 
1226
  finish_run()
1227
 
1228
 
1229
def run_pipeline(
    dataset_repo_id: str,
    model_repo_id: str,
    base_model_id: str,
    start_stage: int,
    max_stages: int,
    run_eval: bool,
    eval_k: int,
    eval_samples: int,
    enforce_quality_gate: bool,
    gate_min_pass_at_1: float,
    gate_min_pass_at_k: float,
    gate_min_rows: int,
    push_to_hub: bool,
    force_redownload: bool,
    preflight_only: bool,
) -> Generator[Tuple[str, str, str, str], None, None]:
    """Stream pipeline updates with a rendered telemetry panel appended.

    Every argument is forwarded by keyword to ``run_pipeline_core``. For each
    ``(logs, status, summary_json)`` triple it yields, this generator yields
    the same three values followed by the HTML from ``render_ops_visual``,
    built from the parsed summary, the status, and the logs.
    """
    core_kwargs = {
        "dataset_repo_id": dataset_repo_id,
        "model_repo_id": model_repo_id,
        "base_model_id": base_model_id,
        "start_stage": start_stage,
        "max_stages": max_stages,
        "run_eval": run_eval,
        "eval_k": eval_k,
        "eval_samples": eval_samples,
        "enforce_quality_gate": enforce_quality_gate,
        "gate_min_pass_at_1": gate_min_pass_at_1,
        "gate_min_pass_at_k": gate_min_pass_at_k,
        "gate_min_rows": gate_min_rows,
        "push_to_hub": push_to_hub,
        "force_redownload": force_redownload,
        "preflight_only": preflight_only,
    }
    for log_chunk, run_status, summary_blob in run_pipeline_core(**core_kwargs):
        parsed_summary = _parse_summary_json(summary_blob)
        panel_html = render_ops_visual(parsed_summary, run_status, log_chunk)
        yield log_chunk, run_status, summary_blob, panel_html
1266
+
1267
+
1268
  with gr.Blocks(title="Math Conjecture Trainer Space") as demo:
1269
  gr.HTML(TACTICAL_HEADER_HTML)
1270
  gr.Markdown(PROJECT_DESCRIPTION)
 
1328
  stop_button = gr.Button("Abort Active Run", variant="stop")
1329
  clear_button = gr.Button("Reset Console")
1330
 
1331
+ ops_visual = gr.HTML(value=render_ops_visual({}, "Idle", ""))
1332
  status = gr.Textbox(label="Run Status", value="Idle", interactive=False)
1333
  logs = make_copyable_textbox(label="Telemetry Log", lines=24, max_lines=30, interactive=False)
1334
  run_summary = make_copyable_textbox(
 
1357
  force_redownload,
1358
  preflight_only,
1359
  ],
1360
+ outputs=[logs, status, run_summary, ops_visual],
1361
  )
1362
  stop_button.click(fn=cancel_pipeline, inputs=None, outputs=[status], queue=False)
1363
+ clear_button.click(fn=clear_outputs, inputs=None, outputs=[logs, status, run_summary, ops_visual], queue=False)
1364
 
1365
 
1366
  if __name__ == "__main__":