File size: 1,037 Bytes
31e2456
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
"""Wait for the next checkpoint of a running training process and report it.

The original idea was to inject a checkpoint save into the running process
via py-spy/gdb. This script is the simpler alternative: it doesn't actually
inject anything — it just waits for the next natural ckpt and reports it.
For an immediate snapshot, the trainer needs SIGUSR1 handling (added in a
follow-up commit).

Usage: python snapshot_now.py /workspace/runs/<run_name> [--timeout SECONDS]
"""
from __future__ import annotations

import argparse
import time
from pathlib import Path


def main(argv=None) -> None:
    """Wait for a new ``*.pt`` checkpoint in the run directory and report it.

    The checkpoints already present in ``run_dir`` are recorded as a baseline
    first, so only files that appear *after* the script starts are reported.
    The directory is then polled until a new ``*.pt`` path shows up (each new
    path is printed and the function returns) or the timeout elapses (a
    timeout message is printed).

    Command-line options:
        run_dir:     directory that is globbed for ``*.pt`` files.
        --timeout:   seconds to wait before giving up (default 900).
        --interval:  seconds to sleep between polls (default 5).

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behavior when run as a script.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("run_dir", type=Path)
    ap.add_argument("--timeout", type=float, default=900,
                    help="seconds to wait for a new checkpoint (default: 900)")
    ap.add_argument("--interval", type=float, default=5,
                    help="seconds between directory polls (default: 5)")
    args = ap.parse_args(argv)

    # Baseline snapshot: without it, every checkpoint that already exists
    # would be reported as "new" on the very first poll.
    last_ckpts = set(args.run_dir.glob("*.pt"))

    # Monotonic clock so wall-clock adjustments cannot stretch or cut the wait.
    deadline = time.monotonic() + args.timeout
    while True:
        ckpts = set(args.run_dir.glob("*.pt"))
        new = ckpts - last_ckpts
        if new:
            # Sorted only to make the output order deterministic.
            for c in sorted(new):
                print(f"[snapshot] new ckpt: {c}")
            return
        # Track the current listing (not the union) so a checkpoint that is
        # rotated away and later re-created under the same name still counts.
        last_ckpts = ckpts

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline; the loop then does one final poll.
        time.sleep(max(0.0, min(args.interval, remaining)))
    print("[snapshot] timeout, no new ckpt")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()