from __future__ import annotations import argparse import json import time from pathlib import Path REPO_ROOT = Path(__file__).resolve().parents[1] DEFAULT_PROGRESS = REPO_ROOT / "data" / "runtime_metrics" / "t5_rewrite_train_progress.json" def _fmt_secs(val): if val is None: return "n/a" s = int(max(0, val)) h, rem = divmod(s, 3600) m, sec = divmod(rem, 60) if h > 0: return f"{h}h {m}m {sec}s" if m > 0: return f"{m}m {sec}s" return f"{sec}s" def main() -> int: ap = argparse.ArgumentParser(description="Watch T5 training progress JSON") ap.add_argument("--progress-file", type=Path, default=DEFAULT_PROGRESS) ap.add_argument("--interval", type=float, default=5.0) args = ap.parse_args() progress_path = args.progress_file if args.progress_file.is_absolute() else (REPO_ROOT / args.progress_file).resolve() print(f"Watching: {progress_path}") print("Press Ctrl+C to stop.\n") last = None while True: if progress_path.is_file(): try: data = json.loads(progress_path.read_text(encoding="utf-8")) key = ( data.get("status"), data.get("global_step"), data.get("max_steps"), data.get("pct_complete"), data.get("updated_at_epoch_sec"), ) if key != last: pct = data.get("pct_complete") pct_text = f"{pct:.1f}%" if isinstance(pct, (int, float)) else "n/a" print( f"[{data.get('status','?')}] " f"step {data.get('global_step','?')}/{data.get('max_steps','?')} " f"({pct_text}) elapsed={_fmt_secs(data.get('elapsed_sec'))} " f"eta={_fmt_secs(data.get('eta_sec'))}" ) logs = data.get("last_log") or {} if logs: print(f" logs: {logs}") print() last = key if data.get("status") == "completed": return 0 except Exception as e: print(f"Could not parse progress file: {e}") time.sleep(max(0.5, args.interval)) if __name__ == "__main__": raise SystemExit(main())