NorthernTribe-Research committed on
Commit
41208f3
·
verified ·
1 Parent(s): 4647b37

Harden live terminal logging and production runtime reliability (heartbeat + download retries).

Browse files
Files changed (2) hide show
  1. README.md +1 -0
  2. app.py +35 -10
README.md CHANGED
@@ -43,6 +43,7 @@ If no token is available, public dataset training still works and push is automa
43
  - `Enforce Quality Gate`: enables/disables promotion gate checks.
44
  - `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
45
  - `Live Tactical Telemetry`: real-time stage progression, runtime posture, loss sparkline, and gate/push state.
 
46
  - `Preflight Only (No Training)`: validates pipeline with `--dry-run`.
47
  - `Force Dataset Redownload`: bypasses cached parquet files.
48
  - `Abort Active Run`: cancels active subprocess tree.
 
43
  - `Enforce Quality Gate`: enables/disables promotion gate checks.
44
  - `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
45
  - `Live Tactical Telemetry`: real-time stage progression, runtime posture, loss sparkline, and gate/push state.
46
+ - `Runtime Terminal Log (Live)`: line-by-line subprocess stream with heartbeat events during silent phases.
47
  - `Preflight Only (No Training)`: validates pipeline with `--dry-run`.
48
  - `Force Dataset Redownload`: bypasses cached parquet files.
49
  - `Abort Active Run`: cancels active subprocess tree.
app.py CHANGED
@@ -15,6 +15,7 @@ import signal
15
  import subprocess
16
  import sys
17
  import threading
 
18
  from pathlib import Path
19
  from typing import Any, Dict, Generator, List, Optional, Tuple
20
 
@@ -721,15 +722,30 @@ def download_dataset(
721
  out_files[split] = str(out_path)
722
  continue
723
 
724
- cached_path = hf_hub_download(
725
- repo_id=dataset_repo_id,
726
- repo_type="dataset",
727
- filename=f"{split}.parquet",
728
- token=token,
729
- force_download=force_redownload,
730
- )
731
- shutil.copy2(cached_path, out_path)
732
- append_log(log_lines, f"Downloaded {split}.parquet to {out_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
733
  out_files[split] = str(out_path)
734
  return out_files["train"], out_files["validation"], out_files["test"]
735
 
@@ -799,6 +815,8 @@ def stream_subprocess(
799
  set_active_process(proc)
800
  cancelled = False
801
  ret = 1
 
 
802
  assert proc.stdout is not None
803
  try:
804
  while True:
@@ -819,7 +837,14 @@ def stream_subprocess(
819
  line = line.rstrip()
820
  if line:
821
  append_log(log_lines, line)
 
822
  yield "\n".join(log_lines), f"{status_prefix}: running"
 
 
 
 
 
 
823
 
824
  if proc.poll() is not None:
825
  break
@@ -1330,7 +1355,7 @@ with gr.Blocks(title="Math Conjecture Trainer Space") as demo:
1330
 
1331
  ops_visual = gr.HTML(value=render_ops_visual({}, "Idle", ""))
1332
  status = gr.Textbox(label="Run Status", value="Idle", interactive=False)
1333
- logs = make_copyable_textbox(label="Telemetry Log", lines=24, max_lines=30, interactive=False)
1334
  run_summary = make_copyable_textbox(
1335
  label="Mission Summary (JSON)",
1336
  lines=16,
 
15
  import subprocess
16
  import sys
17
  import threading
18
+ import time
19
  from pathlib import Path
20
  from typing import Any, Dict, Generator, List, Optional, Tuple
21
 
 
722
  out_files[split] = str(out_path)
723
  continue
724
 
725
+ download_attempts = 3
726
+ for attempt in range(1, download_attempts + 1):
727
+ try:
728
+ cached_path = hf_hub_download(
729
+ repo_id=dataset_repo_id,
730
+ repo_type="dataset",
731
+ filename=f"{split}.parquet",
732
+ token=token,
733
+ force_download=force_redownload,
734
+ )
735
+ shutil.copy2(cached_path, out_path)
736
+ append_log(log_lines, f"Downloaded {split}.parquet to {out_path}")
737
+ break
738
+ except Exception as exc:
739
+ if attempt >= download_attempts:
740
+ raise RuntimeError(
741
+ f"Failed downloading {split}.parquet after {download_attempts} attempts: {exc}"
742
+ ) from exc
743
+ wait_s = 2 ** attempt
744
+ append_log(
745
+ log_lines,
746
+ f"Download retry {attempt}/{download_attempts - 1} for {split}.parquet after error: {exc}",
747
+ )
748
+ time.sleep(wait_s)
749
  out_files[split] = str(out_path)
750
  return out_files["train"], out_files["validation"], out_files["test"]
751
 
 
815
  set_active_process(proc)
816
  cancelled = False
817
  ret = 1
818
+ last_heartbeat = time.monotonic()
819
+ heartbeat_interval_s = 20.0
820
  assert proc.stdout is not None
821
  try:
822
  while True:
 
837
  line = line.rstrip()
838
  if line:
839
  append_log(log_lines, line)
840
+ last_heartbeat = time.monotonic()
841
  yield "\n".join(log_lines), f"{status_prefix}: running"
842
+ elif proc.poll() is None:
843
+ now = time.monotonic()
844
+ if now - last_heartbeat >= heartbeat_interval_s:
845
+ append_log(log_lines, f"{status_prefix} heartbeat: process alive, waiting for next log chunk.")
846
+ last_heartbeat = now
847
+ yield "\n".join(log_lines), f"{status_prefix}: running"
848
 
849
  if proc.poll() is not None:
850
  break
 
1355
 
1356
  ops_visual = gr.HTML(value=render_ops_visual({}, "Idle", ""))
1357
  status = gr.Textbox(label="Run Status", value="Idle", interactive=False)
1358
+ logs = make_copyable_textbox(label="Runtime Terminal Log (Live)", lines=24, max_lines=30, interactive=False)
1359
  run_summary = make_copyable_textbox(
1360
  label="Mission Summary (JSON)",
1361
  lines=16,