File size: 2,390 Bytes
34c53b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from __future__ import annotations

import argparse
import json
import time
from pathlib import Path

# Directory one level above the directory that contains this file
# (parents[0] is the containing directory, parents[1] is its parent).
# NOTE(review): presumably the repository root, as the name suggests — confirm
# against the actual location of this script in the repo.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Default value of the --progress-file option: the JSON file this watcher polls.
DEFAULT_PROGRESS = REPO_ROOT / "data" / "runtime_metrics" / "t5_rewrite_train_progress.json"


def _fmt_secs(val):
    if val is None:
        return "n/a"
    s = int(max(0, val))
    h, rem = divmod(s, 3600)
    m, sec = divmod(rem, 60)
    if h > 0:
        return f"{h}h {m}m {sec}s"
    if m > 0:
        return f"{m}m {sec}s"
    return f"{sec}s"


def main() -> int:
    """Poll the training progress JSON file and print it whenever it changes.

    Command-line options:
        --progress-file: Path of the progress JSON (default DEFAULT_PROGRESS).
            A relative path is resolved against REPO_ROOT, not the current
            working directory.
        --interval: Seconds to sleep between polls; values below 0.5 are
            raised to 0.5.

    Returns:
        0 once the file reports ``status == "completed"``, or 130 when the
        watcher is stopped with Ctrl+C.
    """
    ap = argparse.ArgumentParser(description="Watch T5 training progress JSON")
    ap.add_argument("--progress-file", type=Path, default=DEFAULT_PROGRESS)
    ap.add_argument("--interval", type=float, default=5.0)
    args = ap.parse_args()

    if args.progress_file.is_absolute():
        progress_path = args.progress_file
    else:
        progress_path = (REPO_ROOT / args.progress_file).resolve()
    print(f"Watching: {progress_path}")
    print("Press Ctrl+C to stop.\n")

    # Loop-invariant: clamp once so a tiny/zero interval cannot busy-loop.
    interval = max(0.5, args.interval)
    # Snapshot of the fields printed last time; used to suppress duplicates.
    last = None

    try:
        while True:
            if progress_path.is_file():
                # Best-effort: the file may be mid-write or removed between the
                # check and the read, so any failure is reported and retried.
                try:
                    data = json.loads(progress_path.read_text(encoding="utf-8"))
                    if not isinstance(data, dict):
                        # Valid JSON but not an object; say so instead of
                        # failing later with an opaque AttributeError.
                        raise ValueError(
                            f"expected a JSON object, got {type(data).__name__}"
                        )
                    key = (
                        data.get("status"),
                        data.get("global_step"),
                        data.get("max_steps"),
                        data.get("pct_complete"),
                        data.get("updated_at_epoch_sec"),
                    )
                    if key != last:
                        pct = data.get("pct_complete")
                        pct_text = f"{pct:.1f}%" if isinstance(pct, (int, float)) else "n/a"
                        print(
                            f"[{data.get('status','?')}] "
                            f"step {data.get('global_step','?')}/{data.get('max_steps','?')} "
                            f"({pct_text}) elapsed={_fmt_secs(data.get('elapsed_sec'))} "
                            f"eta={_fmt_secs(data.get('eta_sec'))}"
                        )
                        logs = data.get("last_log") or {}
                        if logs:
                            print(f"  logs: {logs}")
                        print()
                        last = key
                    if data.get("status") == "completed":
                        return 0
                except Exception as e:
                    print(f"Could not parse progress file: {e}")
            time.sleep(interval)
    except KeyboardInterrupt:
        # The banner advertises Ctrl+C as the way to stop, so exit quietly
        # rather than with a traceback; 130 is the conventional SIGINT status.
        print("\nStopped.")
        return 130


# Run the watcher only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())