Harden live terminal logging and production runtime reliability (heartbeat + download retries).
Browse files
README.md
CHANGED
|
@@ -43,6 +43,7 @@ If no token is available, public dataset training still works and push is automa
|
|
| 43 |
- `Enforce Quality Gate`: enables/disables promotion gate checks.
|
| 44 |
- `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
|
| 45 |
- `Live Tactical Telemetry`: real-time stage progression, runtime posture, loss sparkline, and gate/push state.
|
|
|
|
| 46 |
- `Preflight Only (No Training)`: validates pipeline with `--dry-run`.
|
| 47 |
- `Force Dataset Redownload`: bypasses cached parquet files.
|
| 48 |
- `Abort Active Run`: cancels active subprocess tree.
|
|
|
|
| 43 |
- `Enforce Quality Gate`: enables/disables promotion gate checks.
|
| 44 |
- `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
|
| 45 |
- `Live Tactical Telemetry`: real-time stage progression, runtime posture, loss sparkline, and gate/push state.
|
| 46 |
+
- `Runtime Terminal Log (Live)`: line-by-line subprocess stream with heartbeat events during silent phases.
|
| 47 |
- `Preflight Only (No Training)`: validates pipeline with `--dry-run`.
|
| 48 |
- `Force Dataset Redownload`: bypasses cached parquet files.
|
| 49 |
- `Abort Active Run`: cancels active subprocess tree.
|
app.py
CHANGED
|
@@ -15,6 +15,7 @@ import signal
|
|
| 15 |
import subprocess
|
| 16 |
import sys
|
| 17 |
import threading
|
|
|
|
| 18 |
from pathlib import Path
|
| 19 |
from typing import Any, Dict, Generator, List, Optional, Tuple
|
| 20 |
|
|
@@ -721,15 +722,30 @@ def download_dataset(
|
|
| 721 |
out_files[split] = str(out_path)
|
| 722 |
continue
|
| 723 |
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 733 |
out_files[split] = str(out_path)
|
| 734 |
return out_files["train"], out_files["validation"], out_files["test"]
|
| 735 |
|
|
@@ -799,6 +815,8 @@ def stream_subprocess(
|
|
| 799 |
set_active_process(proc)
|
| 800 |
cancelled = False
|
| 801 |
ret = 1
|
|
|
|
|
|
|
| 802 |
assert proc.stdout is not None
|
| 803 |
try:
|
| 804 |
while True:
|
|
@@ -819,7 +837,14 @@ def stream_subprocess(
|
|
| 819 |
line = line.rstrip()
|
| 820 |
if line:
|
| 821 |
append_log(log_lines, line)
|
|
|
|
| 822 |
yield "\n".join(log_lines), f"{status_prefix}: running"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 823 |
|
| 824 |
if proc.poll() is not None:
|
| 825 |
break
|
|
@@ -1330,7 +1355,7 @@ with gr.Blocks(title="Math Conjecture Trainer Space") as demo:
|
|
| 1330 |
|
| 1331 |
ops_visual = gr.HTML(value=render_ops_visual({}, "Idle", ""))
|
| 1332 |
status = gr.Textbox(label="Run Status", value="Idle", interactive=False)
|
| 1333 |
-
logs = make_copyable_textbox(label="
|
| 1334 |
run_summary = make_copyable_textbox(
|
| 1335 |
label="Mission Summary (JSON)",
|
| 1336 |
lines=16,
|
|
|
|
| 15 |
import subprocess
|
| 16 |
import sys
|
| 17 |
import threading
|
| 18 |
+
import time
|
| 19 |
from pathlib import Path
|
| 20 |
from typing import Any, Dict, Generator, List, Optional, Tuple
|
| 21 |
|
|
|
|
| 722 |
out_files[split] = str(out_path)
|
| 723 |
continue
|
| 724 |
|
| 725 |
+
download_attempts = 3
|
| 726 |
+
for attempt in range(1, download_attempts + 1):
|
| 727 |
+
try:
|
| 728 |
+
cached_path = hf_hub_download(
|
| 729 |
+
repo_id=dataset_repo_id,
|
| 730 |
+
repo_type="dataset",
|
| 731 |
+
filename=f"{split}.parquet",
|
| 732 |
+
token=token,
|
| 733 |
+
force_download=force_redownload,
|
| 734 |
+
)
|
| 735 |
+
shutil.copy2(cached_path, out_path)
|
| 736 |
+
append_log(log_lines, f"Downloaded {split}.parquet to {out_path}")
|
| 737 |
+
break
|
| 738 |
+
except Exception as exc:
|
| 739 |
+
if attempt >= download_attempts:
|
| 740 |
+
raise RuntimeError(
|
| 741 |
+
f"Failed downloading {split}.parquet after {download_attempts} attempts: {exc}"
|
| 742 |
+
) from exc
|
| 743 |
+
wait_s = 2 ** attempt
|
| 744 |
+
append_log(
|
| 745 |
+
log_lines,
|
| 746 |
+
f"Download retry {attempt}/{download_attempts - 1} for {split}.parquet after error: {exc}",
|
| 747 |
+
)
|
| 748 |
+
time.sleep(wait_s)
|
| 749 |
out_files[split] = str(out_path)
|
| 750 |
return out_files["train"], out_files["validation"], out_files["test"]
|
| 751 |
|
|
|
|
| 815 |
set_active_process(proc)
|
| 816 |
cancelled = False
|
| 817 |
ret = 1
|
| 818 |
+
last_heartbeat = time.monotonic()
|
| 819 |
+
heartbeat_interval_s = 20.0
|
| 820 |
assert proc.stdout is not None
|
| 821 |
try:
|
| 822 |
while True:
|
|
|
|
| 837 |
line = line.rstrip()
|
| 838 |
if line:
|
| 839 |
append_log(log_lines, line)
|
| 840 |
+
last_heartbeat = time.monotonic()
|
| 841 |
yield "\n".join(log_lines), f"{status_prefix}: running"
|
| 842 |
+
elif proc.poll() is None:
|
| 843 |
+
now = time.monotonic()
|
| 844 |
+
if now - last_heartbeat >= heartbeat_interval_s:
|
| 845 |
+
append_log(log_lines, f"{status_prefix} heartbeat: process alive, waiting for next log chunk.")
|
| 846 |
+
last_heartbeat = now
|
| 847 |
+
yield "\n".join(log_lines), f"{status_prefix}: running"
|
| 848 |
|
| 849 |
if proc.poll() is not None:
|
| 850 |
break
|
|
|
|
| 1355 |
|
| 1356 |
ops_visual = gr.HTML(value=render_ops_visual({}, "Idle", ""))
|
| 1357 |
status = gr.Textbox(label="Run Status", value="Idle", interactive=False)
|
| 1358 |
+
logs = make_copyable_textbox(label="Runtime Terminal Log (Live)", lines=24, max_lines=30, interactive=False)
|
| 1359 |
run_summary = make_copyable_textbox(
|
| 1360 |
label="Mission Summary (JSON)",
|
| 1361 |
lines=16,
|