Spaces:
Running
Running
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import time | |
| from pathlib import Path | |
# Repository root: the directory one level above the folder holding this script.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Progress file read by default when --progress-file is not given.
DEFAULT_PROGRESS = REPO_ROOT.joinpath(
    "data",
    "runtime_metrics",
    "t5_rewrite_train_progress.json",
)
| def _fmt_secs(val): | |
| if val is None: | |
| return "n/a" | |
| s = int(max(0, val)) | |
| h, rem = divmod(s, 3600) | |
| m, sec = divmod(rem, 60) | |
| if h > 0: | |
| return f"{h}h {m}m {sec}s" | |
| if m > 0: | |
| return f"{m}m {sec}s" | |
| return f"{sec}s" | |
def main() -> int:
    """Poll the T5 training progress JSON file and print each change.

    Command-line options:
        --progress-file  Path to the progress JSON. A relative path is
                         resolved against REPO_ROOT. Defaults to
                         DEFAULT_PROGRESS.
        --interval       Seconds between polls (default 5.0). Values below
                         0.5 are raised to 0.5.

    A status line is printed only when the status, step counters, percentage
    or update timestamp differ from the previously printed snapshot.

    Returns:
        0 once the file reports ``status == "completed"``; 130 (128 + SIGINT)
        when the user stops the watcher with Ctrl+C.
    """
    ap = argparse.ArgumentParser(description="Watch T5 training progress JSON")
    ap.add_argument("--progress-file", type=Path, default=DEFAULT_PROGRESS)
    ap.add_argument("--interval", type=float, default=5.0)
    args = ap.parse_args()

    progress_path = (
        args.progress_file
        if args.progress_file.is_absolute()
        else (REPO_ROOT / args.progress_file).resolve()
    )
    print(f"Watching: {progress_path}")
    print("Press Ctrl+C to stop.\n")

    # The interval never changes while running, so clamp it once.
    poll_delay = max(0.5, args.interval)
    last = None
    try:
        while True:
            if progress_path.is_file():
                try:
                    data = json.loads(progress_path.read_text(encoding="utf-8"))
                    if not isinstance(data, dict):
                        # Fail with a readable message instead of an
                        # AttributeError from the .get() calls below.
                        raise ValueError(
                            f"expected a JSON object, got {type(data).__name__}"
                        )
                    key = (
                        data.get("status"),
                        data.get("global_step"),
                        data.get("max_steps"),
                        data.get("pct_complete"),
                        data.get("updated_at_epoch_sec"),
                    )
                    if key != last:
                        pct = data.get("pct_complete")
                        pct_text = f"{pct:.1f}%" if isinstance(pct, (int, float)) else "n/a"
                        print(
                            f"[{data.get('status','?')}] "
                            f"step {data.get('global_step','?')}/{data.get('max_steps','?')} "
                            f"({pct_text}) elapsed={_fmt_secs(data.get('elapsed_sec'))} "
                            f"eta={_fmt_secs(data.get('eta_sec'))}"
                        )
                        logs = data.get("last_log") or {}
                        if logs:
                            print(f"  logs: {logs}")
                        print()
                        last = key
                    if data.get("status") == "completed":
                        return 0
                except Exception as e:
                    # Deliberately broad: the file may vanish or be only
                    # partly written between polls, and a bad snapshot must
                    # not end the watch. Report it and retry next poll.
                    print(f"Could not parse progress file: {e}")
            time.sleep(poll_delay)
    except KeyboardInterrupt:
        # Ctrl+C is the advertised way to stop; exit without a traceback
        # while still signalling the interruption through the exit status.
        print("\nStopped.")
        return 130
# Run the watcher only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)