Spaces:

NorthernTribe-Research
/

math_trainer

Running

App Files Files Community

NorthernTribe-Research committed about 16 hours ago

Commit

4647b37

verified ·

1 Parent(s): 668cc30

Add live tactical training visualization panel and UI telemetry stream.

Browse files

Files changed (2) hide show

README.md +1 -0
app.py +351 -5

README.md CHANGED Viewed

@@ -42,6 +42,7 @@ If no token is available, public dataset training still works and push is automa
 - `Run Evaluation After Training`: toggles post-train eval in runtime config.
 - `Enforce Quality Gate`: enables/disables promotion gate checks.
 - `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
 - `Preflight Only (No Training)`: validates pipeline with `--dry-run`.
 - `Force Dataset Redownload`: bypasses cached parquet files.
 - `Abort Active Run`: cancels active subprocess tree.

 - `Run Evaluation After Training`: toggles post-train eval in runtime config.
 - `Enforce Quality Gate`: enables/disables promotion gate checks.
 - `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
+- `Live Tactical Telemetry`: real-time stage progression, runtime posture, loss sparkline, and gate/push state.
 - `Preflight Only (No Training)`: validates pipeline with `--dry-run`.
 - `Force Dataset Redownload`: bypasses cached parquet files.
 - `Abort Active Run`: cancels active subprocess tree.

app.py CHANGED Viewed

@@ -4,6 +4,7 @@
 from __future__ import annotations
 import datetime as dt
 import inspect
 import json
 import os
@@ -37,6 +38,8 @@ CREDENTIAL_FILE_CANDIDATES = [
 ]
 REPO_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,95}/[A-Za-z0-9][A-Za-z0-9._-]{0,95}$")
 RUN_STATE_LOCK = threading.Lock()
 RUN_IN_PROGRESS = False
@@ -180,6 +183,108 @@ TACTICAL_CSS = """
   letter-spacing: 0.12em;
   text-transform: uppercase;
 }
 """
 TACTICAL_HEADER_HTML = """
@@ -268,6 +373,207 @@ def summary_text(summary: Dict[str, Any]) -> str:
     return json.dumps(summary, ensure_ascii=True, indent=2)
 def _token_from_credentials_file(path: Path) -> Optional[str]:
     try:
         data = json.loads(path.read_text(encoding="utf-8"))
@@ -560,15 +866,15 @@ def make_copyable_textbox(
     return gr.Textbox(**textbox_kwargs)
-def clear_outputs() -> Tuple[str, str, str]:
-    return "", "Idle", ""
 def cancel_pipeline() -> str:
     return request_cancel()
-def run_pipeline(
     dataset_repo_id: str,
     model_repo_id: str,
     base_model_id: str,
@@ -920,6 +1226,45 @@ def run_pipeline(
         finish_run()
 with gr.Blocks(title="Math Conjecture Trainer Space") as demo:
     gr.HTML(TACTICAL_HEADER_HTML)
     gr.Markdown(PROJECT_DESCRIPTION)
@@ -983,6 +1328,7 @@ with gr.Blocks(title="Math Conjecture Trainer Space") as demo:
         stop_button = gr.Button("Abort Active Run", variant="stop")
         clear_button = gr.Button("Reset Console")
     status = gr.Textbox(label="Run Status", value="Idle", interactive=False)
     logs = make_copyable_textbox(label="Telemetry Log", lines=24, max_lines=30, interactive=False)
     run_summary = make_copyable_textbox(
@@ -1011,10 +1357,10 @@ with gr.Blocks(title="Math Conjecture Trainer Space") as demo:
             force_redownload,
             preflight_only,
         ],
-        outputs=[logs, status, run_summary],
     )
     stop_button.click(fn=cancel_pipeline, inputs=None, outputs=[status], queue=False)
-    clear_button.click(fn=clear_outputs, inputs=None, outputs=[logs, status, run_summary], queue=False)
 if __name__ == "__main__":

 from __future__ import annotations
 import datetime as dt
+import html
 import inspect
 import json
 import os
 ]
 REPO_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,95}/[A-Za-z0-9][A-Za-z0-9._-]{0,95}$")
# Matches a stage banner such as "[stage 2]" and captures the stage number.
STAGE_LOG_RE = re.compile(r"\[stage\s+(\d+)\]")
# Matches a training-loss key/value pair and captures the numeric value.
# The key may be bare (``loss=0.5``) or quoted as in a printed dict / JSON
# (``'loss': 0.5``, ``"train_loss": 0.5``); the optional closing quote after
# the key is what lets the quoted forms match. The key must be preceded by
# start-of-line, whitespace, ``{``, ``,`` or a quote, so ``eval_loss`` is not
# picked up. Intended to be applied to an already lower-cased line.
LOSS_LOG_RE = re.compile(
    r"(?:^|[\s{,'\"])(?:loss|train_loss)['\"]?\s*[:=]\s*([-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)"
)
 RUN_STATE_LOCK = threading.Lock()
 RUN_IN_PROGRESS = False
   letter-spacing: 0.12em;
   text-transform: uppercase;
 }
+.ops-visual {
+  border: 1px solid var(--ops-border);
+  background: linear-gradient(180deg, #101010 0%, #0b0b0b 100%);
+  padding: 12px;
+}
+.ops-visual-head {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  margin-bottom: 10px;
+  gap: 10px;
+}
+.ops-visual-title {
+  font-family: "Rajdhani", "IBM Plex Mono", monospace;
+  font-weight: 700;
+  letter-spacing: 0.14em;
+  text-transform: uppercase;
+  color: #f1f1f1;
+}
+.ops-visual-sub {
+  color: #9f9f9f;
+  font-size: 0.78rem;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+}
+.ops-grid {
+  display: grid;
+  grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
+  gap: 10px;
+}
+.ops-card {
+  border: 1px solid #323232;
+  background: linear-gradient(180deg, #161616 0%, #101010 100%);
+  padding: 9px;
+  min-height: 72px;
+}
+.ops-k {
+  color: #9a9a9a;
+  font-size: 0.68rem;
+  letter-spacing: 0.11em;
+  text-transform: uppercase;
+}
+.ops-v {
+  color: #f0f0f0;
+  font-family: "Rajdhani", "IBM Plex Mono", monospace;
+  font-size: 1.05rem;
+  margin-top: 5px;
+  letter-spacing: 0.05em;
+}
+.ops-v-small {
+  color: #d1d1d1;
+  font-size: 0.83rem;
+  margin-top: 4px;
+}
+.ops-meter {
+  margin-top: 8px;
+  width: 100%;
+  height: 8px;
+  border: 1px solid #383838;
+  background: #111111;
+  position: relative;
+  overflow: hidden;
+}
+.ops-meter-fill {
+  position: absolute;
+  left: 0;
+  top: 0;
+  bottom: 0;
+  background: linear-gradient(90deg, #bdbdbd 0%, #f0f0f0 100%);
+}
+.ops-spark {
+  margin-top: 8px;
+  border: 1px solid #343434;
+  background: #0e0e0e;
+  padding: 3px;
+}
+.ops-spark svg {
+  width: 100%;
+  height: 74px;
+  display: block;
+}
+.ops-foot {
+  margin-top: 10px;
+  color: #8f8f8f;
+  font-size: 0.74rem;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+}
 """
 TACTICAL_HEADER_HTML = """
     return json.dumps(summary, ensure_ascii=True, indent=2)
+def _as_dict(value: Any) -> Dict[str, Any]:
+    return value if isinstance(value, dict) else {}
+def _parse_summary_json(text: str) -> Dict[str, Any]:
+    if not text:
+        return {}
+    try:
+        parsed = json.loads(text)
+    except json.JSONDecodeError:
+        return {}
+    return parsed if isinstance(parsed, dict) else {}
+def _fmt_pct(value: Any) -> str:
+    try:
+        return f"{float(value) * 100:.1f}%"
+    except (TypeError, ValueError):
+        return "--"
+def _fmt_float(value: Any, digits: int = 3) -> str:
+    try:
+        return f"{float(value):.{digits}f}"
+    except (TypeError, ValueError):
+        return "--"
+def _extract_loss_values(log_text: str, limit: int = 48) -> List[float]:
+    losses: List[float] = []
+    for line in log_text.splitlines():
+        lower = line.lower()
+        if "eval_loss" in lower:
+            continue
+        match = LOSS_LOG_RE.search(lower)
+        if match is None:
+            continue
+        try:
+            value = float(match.group(1))
+        except (TypeError, ValueError):
+            continue
+        if not (value >= 0.0):
+            continue
+        losses.append(value)
+    if len(losses) > limit:
+        losses = losses[-limit:]
+    return losses
+def _build_loss_sparkline(losses: List[float]) -> str:
+    if not losses:
+        return "<div class='ops-v-small'>No live loss points yet.</div>"
+    width = 520
+    height = 74
+    pad = 5
+    min_v = min(losses)
+    max_v = max(losses)
+    span = max(max_v - min_v, 1e-9)
+    points: List[str] = []
+    for idx, value in enumerate(losses):
+        x = pad + (idx * (width - 2 * pad) / max(1, len(losses) - 1))
+        y = pad + ((max_v - value) * (height - 2 * pad) / span)
+        points.append(f"{x:.2f},{y:.2f}")
+    polyline = " ".join(points)
+    latest = losses[-1]
+    return (
+        f"<div class='ops-v-small'>Latest train loss: <strong>{_fmt_float(latest, 4)}</strong></div>"
+        "<div class='ops-spark'>"
+        f"<svg viewBox='0 0 {width} {height}' preserveAspectRatio='none'>"
+        f"<polyline points='{polyline}' fill='none' stroke='#f0f0f0' stroke-width='2' />"
+        "</svg>"
+        "</div>"
+    )
def _infer_stage_snapshot(summary: Dict[str, Any], log_text: str) -> Dict[str, Any]:
    """Estimate stage progress from the run summary and the log tail.

    The completed-stage count comes from ``training_summary.stages_ran`` in
    *summary* (capped at the stage count). The active stage is the number in
    the newest ``STAGE_LOG_RE`` match within the last 350 log lines, or
    ``None`` when there is no match.

    Returns a dict with ``start_stage``, ``stage_count``, ``completed``,
    ``active_stage`` and ``progress`` (a float clamped to 0.0-1.0).
    """
    start_stage = max(1, _safe_int(summary.get("start_stage"), 1))
    stage_count = max(1, _safe_int(summary.get("max_stages"), TEMPLATE_STAGE_COUNT))

    training_summary = summary.get("training_summary")
    stages_ran = training_summary.get("stages_ran") if isinstance(training_summary, dict) else None
    completed = min(stage_count, len(stages_ran)) if isinstance(stages_ran, list) else 0

    # Walk the log tail newest-first and stop at the first stage banner.
    tail = log_text.splitlines()[-350:]
    hits = (STAGE_LOG_RE.search(entry) for entry in reversed(tail))
    newest_hit = next((hit for hit in hits if hit), None)
    active_stage = _safe_int(newest_hit.group(1), 0) if newest_hit else None

    if completed >= stage_count:
        progress = 1.0
    else:
        progress = completed / stage_count
        if active_stage and active_stage >= start_stage:
            # Credit the in-flight stage with a partial (0.35) share so the
            # meter moves before the stage is reported as finished.
            in_flight = (active_stage - start_stage) + 0.35
            progress = max(progress, min(1.0, in_flight / stage_count))

    snapshot: Dict[str, Any] = {}
    snapshot["start_stage"] = start_stage
    snapshot["stage_count"] = stage_count
    snapshot["completed"] = completed
    snapshot["active_stage"] = active_stage
    snapshot["progress"] = max(0.0, min(1.0, progress))
    return snapshot
def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str) -> str:
    """Build the HTML for the "Live Tactical Telemetry" panel.

    Args:
        summary: Run summary dict; non-dict values are treated as empty. The
            ``runtime``, ``quality_gate``, ``evaluation`` and ``push``
            sections and the ``run_label`` key are read from it.
        status_text: Current run status; falls back to "Idle" when empty.
        log_text: Accumulated log text, used for stage and loss telemetry.

    Returns:
        An HTML fragment with six cards: run, runtime, stage progress,
        quality gate, evaluation metrics and the loss sparkline.
    """
    data = _as_dict(summary)
    runtime_info = _as_dict(data.get("runtime"))
    gate_info = _as_dict(data.get("quality_gate"))
    eval_info = _as_dict(data.get("evaluation"))
    push_info = _as_dict(data.get("push"))

    # Run card.
    run_label = html.escape(str(data.get("run_label") or "not-started"))
    status_value = html.escape(status_text or "Idle")

    # Runtime card.
    if runtime_info.get("cuda_available"):
        runtime_mode = html.escape("GPU READY")
    else:
        runtime_mode = html.escape("CPU / PRECHECK")
    device_count = _safe_int(runtime_info.get("cuda_device_count"), 0)

    # Quality gate card: only an explicit True/False counts as a verdict.
    if not bool(gate_info.get("enabled")):
        gate_text = "Disabled"
    else:
        verdict = gate_info.get("passed")
        if verdict is True:
            gate_text = "Passed"
        elif verdict is False:
            gate_text = "Failed"
        else:
            gate_text = "Pending"

    # Stage progress card.
    stage_meta = _infer_stage_snapshot(data, log_text)
    progress_pct = int(stage_meta["progress"] * 100)
    active_stage = stage_meta.get("active_stage")
    if active_stage:
        stage_hint = html.escape(f"active stage {active_stage}")
    else:
        stage_hint = html.escape("awaiting stage telemetry")
    completed_stages = stage_meta["completed"]
    total_stages = stage_meta["stage_count"]

    # Loss stream card.
    sparkline_html = _build_loss_sparkline(_extract_loss_values(log_text))

    # Evaluation card.
    pass_k = _fmt_pct(eval_info.get("pass_at_k"))
    pass_1 = _fmt_pct(eval_info.get("pass_at_1"))
    exact_k = _fmt_pct(eval_info.get("exact_at_k"))

    # Push state stays "Pending" until a push report is present.
    if not push_info:
        push_state = "Pending"
    elif not bool(push_info.get("requested")):
        push_state = "Not requested"
    elif bool(push_info.get("performed")):
        push_state = "Published"
    else:
        push_state = "Blocked"

    gate_html = html.escape(gate_text)
    push_html = html.escape(push_state)

    panel = f"""
<div class="ops-visual">
  <div class="ops-visual-head">
    <div class="ops-visual-title">Live Tactical Telemetry</div>
    <div class="ops-visual-sub">Monochrome Ops Feed</div>
  </div>
  <div class="ops-grid">
    <div class="ops-card">
      <div class="ops-k">Run</div>
      <div class="ops-v">{run_label}</div>
      <div class="ops-v-small">{status_value}</div>
    </div>
    <div class="ops-card">
      <div class="ops-k">Runtime</div>
      <div class="ops-v">{runtime_mode}</div>
      <div class="ops-v-small">cuda devices: {device_count}</div>
    </div>
    <div class="ops-card">
      <div class="ops-k">Stage Progress</div>
      <div class="ops-v">{completed_stages} / {total_stages}</div>
      <div class="ops-v-small">{stage_hint}</div>
      <div class="ops-meter"><div class="ops-meter-fill" style="width:{progress_pct}%"></div></div>
    </div>
    <div class="ops-card">
      <div class="ops-k">Quality Gate</div>
      <div class="ops-v">{gate_html}</div>
      <div class="ops-v-small">push: {push_html}</div>
    </div>
    <div class="ops-card">
      <div class="ops-k">Eval pass@k</div>
      <div class="ops-v">{pass_k}</div>
      <div class="ops-v-small">pass@1 {pass_1} | exact@k {exact_k}</div>
    </div>
    <div class="ops-card">
      <div class="ops-k">Loss Stream</div>
      {sparkline_html}
    </div>
  </div>
  <div class="ops-foot">dull tactical theme · black / grey / white · anduril/palantir-inspired operations console</div>
</div>
"""
    return panel.strip()
 def _token_from_credentials_file(path: Path) -> Optional[str]:
     try:
         data = json.loads(path.read_text(encoding="utf-8"))
     return gr.Textbox(**textbox_kwargs)
def clear_outputs() -> Tuple[str, str, str, str]:
    """Return reset values for the logs, status, summary and telemetry outputs.

    Logs and summary become empty strings, the status becomes "Idle", and the
    telemetry panel is re-rendered from an empty summary and empty log.
    """
    idle_status = "Idle"
    idle_panel = render_ops_visual({}, idle_status, "")
    return "", idle_status, "", idle_panel
 def cancel_pipeline() -> str:
     """Delegate to ``request_cancel()`` and return its result.

     Wired to the "Abort Active Run" button; the returned string is shown in
     the "Run Status" textbox.
     """
     return request_cancel()
+def run_pipeline_core(
     dataset_repo_id: str,
     model_repo_id: str,
     base_model_id: str,
         finish_run()
def run_pipeline(
    dataset_repo_id: str,
    model_repo_id: str,
    base_model_id: str,
    start_stage: int,
    max_stages: int,
    run_eval: bool,
    eval_k: int,
    eval_samples: int,
    enforce_quality_gate: bool,
    gate_min_pass_at_1: float,
    gate_min_pass_at_k: float,
    gate_min_rows: int,
    push_to_hub: bool,
    force_redownload: bool,
    preflight_only: bool,
) -> Generator[Tuple[str, str, str, str], None, None]:
    """Stream pipeline updates together with a rendered telemetry panel.

    All arguments are forwarded unchanged, by keyword, to
    ``run_pipeline_core``. For every ``(logs, status, summary_json)`` update
    it produces, this generator yields the same three values plus the HTML
    from ``render_ops_visual`` built from the parsed summary, status and logs.

    Yields:
        ``(logs_text, status_text, summary_json, ops_visual_html)`` tuples.
    """
    pipeline = run_pipeline_core(
        dataset_repo_id=dataset_repo_id,
        model_repo_id=model_repo_id,
        base_model_id=base_model_id,
        start_stage=start_stage,
        max_stages=max_stages,
        run_eval=run_eval,
        eval_k=eval_k,
        eval_samples=eval_samples,
        enforce_quality_gate=enforce_quality_gate,
        gate_min_pass_at_1=gate_min_pass_at_1,
        gate_min_pass_at_k=gate_min_pass_at_k,
        gate_min_rows=gate_min_rows,
        push_to_hub=push_to_hub,
        force_redownload=force_redownload,
        preflight_only=preflight_only,
    )
    try:
        for logs_text, status_text, summary_json in pipeline:
            summary = _parse_summary_json(summary_json)
            yield logs_text, status_text, summary_json, render_ops_visual(summary, status_text, logs_text)
    finally:
        # If this wrapper is closed early or rendering fails, close the inner
        # generator right away so its own cleanup runs deterministically
        # instead of waiting for garbage collection.
        close = getattr(pipeline, "close", None)
        if callable(close):
            close()
 with gr.Blocks(title="Math Conjecture Trainer Space") as demo:
     gr.HTML(TACTICAL_HEADER_HTML)
     gr.Markdown(PROJECT_DESCRIPTION)
         stop_button = gr.Button("Abort Active Run", variant="stop")
         clear_button = gr.Button("Reset Console")
+    ops_visual = gr.HTML(value=render_ops_visual({}, "Idle", ""))
     status = gr.Textbox(label="Run Status", value="Idle", interactive=False)
     logs = make_copyable_textbox(label="Telemetry Log", lines=24, max_lines=30, interactive=False)
     run_summary = make_copyable_textbox(
             force_redownload,
             preflight_only,
         ],
+        outputs=[logs, status, run_summary, ops_visual],
     )
     stop_button.click(fn=cancel_pipeline, inputs=None, outputs=[status], queue=False)
+    clear_button.click(fn=clear_outputs, inputs=None, outputs=[logs, status, run_summary, ops_visual], queue=False)
 if __name__ == "__main__":