Spaces:
Running
Running
File size: 2,390 Bytes
34c53b5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | from __future__ import annotations
import argparse
import json
import time
from pathlib import Path
# Directory two levels up from this file (the parent of the folder that
# contains the script); used as the anchor for repository-relative paths.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Progress JSON that is watched when --progress-file is not supplied.
DEFAULT_PROGRESS = REPO_ROOT / "data" / "runtime_metrics" / "t5_rewrite_train_progress.json"
def _fmt_secs(val) -> str:
    """Format a duration in seconds as a compact ``"1h 2m 5s"`` style string.

    Args:
        val: Duration in seconds (int or float), or ``None`` when unknown.
            Negative durations are clamped to zero and fractional seconds
            are truncated.

    Returns:
        ``"n/a"`` when ``val`` is ``None`` or cannot be turned into a whole
        number of seconds (e.g. an infinite float or a non-numeric value);
        otherwise ``"{h}h {m}m {s}s"``, ``"{m}m {s}s"`` or ``"{s}s"``,
        omitting leading units that are zero.
    """
    if val is None:
        return "n/a"
    try:
        total = int(max(0, val))
    except (TypeError, ValueError, OverflowError):
        # json.loads accepts "Infinity", and a progress writer may emit a
        # non-numeric placeholder; neither should abort the status line.
        return "n/a"
    hours, rem = divmod(total, 3600)
    minutes, seconds = divmod(rem, 60)
    if hours > 0:
        return f"{hours}h {minutes}m {seconds}s"
    if minutes > 0:
        return f"{minutes}m {seconds}s"
    return f"{seconds}s"
def main() -> int:
    """Poll a training-progress JSON file and print every new snapshot.

    Command-line options:
        --progress-file: Path of the JSON file to watch. A relative path is
            resolved against ``REPO_ROOT``. Defaults to ``DEFAULT_PROGRESS``.
        --interval: Seconds between polls (values below 0.5 are raised to 0.5).

    A snapshot is printed only when its status, step counters, completion
    percentage or update timestamp differ from the previously printed one.

    Returns:
        0 once the file reports ``status == "completed"``; 130 when the
        watcher is stopped with Ctrl+C.
    """
    ap = argparse.ArgumentParser(description="Watch T5 training progress JSON")
    ap.add_argument("--progress-file", type=Path, default=DEFAULT_PROGRESS)
    ap.add_argument("--interval", type=float, default=5.0)
    args = ap.parse_args()

    # Relative paths are anchored at the repository root, not the current
    # working directory.
    if args.progress_file.is_absolute():
        progress_path = args.progress_file
    else:
        progress_path = (REPO_ROOT / args.progress_file).resolve()

    # Loop-invariant: enforce a lower bound so the loop never busy-spins.
    poll_sec = max(0.5, args.interval)

    print(f"Watching: {progress_path}")
    print("Press Ctrl+C to stop.\n")

    last = None
    try:
        while True:
            if progress_path.is_file():
                try:
                    data = json.loads(progress_path.read_text(encoding="utf-8"))
                    key = (
                        data.get("status"),
                        data.get("global_step"),
                        data.get("max_steps"),
                        data.get("pct_complete"),
                        data.get("updated_at_epoch_sec"),
                    )
                    if key != last:
                        pct = data.get("pct_complete")
                        pct_text = f"{pct:.1f}%" if isinstance(pct, (int, float)) else "n/a"
                        print(
                            f"[{data.get('status','?')}] "
                            f"step {data.get('global_step','?')}/{data.get('max_steps','?')} "
                            f"({pct_text}) elapsed={_fmt_secs(data.get('elapsed_sec'))} "
                            f"eta={_fmt_secs(data.get('eta_sec'))}"
                        )
                        logs = data.get("last_log") or {}
                        if logs:
                            print(f" logs: {logs}")
                        print()
                        last = key
                    if data.get("status") == "completed":
                        return 0
                except Exception as e:
                    # Deliberately broad: a unreadable, truncated or oddly
                    # shaped file should be reported and retried on the next
                    # poll rather than end the watcher.
                    print(f"Could not parse progress file: {e}")
            time.sleep(poll_sec)
    except KeyboardInterrupt:
        # The banner invites Ctrl+C, so exit quietly instead of dumping a
        # traceback. 130 (128 + SIGINT) keeps the exit status non-zero.
        print("\nStopped.")
        return 130
if __name__ == "__main__":
    # Run the watcher only when executed as a script, and hand main()'s
    # return value to the interpreter as the process exit status.
    exit_code = main()
    raise SystemExit(exit_code)
|