File size: 1,037 Bytes
"""Wait for the next checkpoint of a running training process and report it.

The original idea was to inject a checkpoint save into the running process via
py-spy/gdb. This script is the simpler alternative: it doesn't actually inject
anything — it just waits for the next natural ckpt and reports it. For an
immediate snapshot, the trainer needs SIGUSR1 handling (added in a follow-up
commit).

Usage: python snapshot_now.py /workspace/runs/<run_name> [--timeout SECONDS]
"""
from __future__ import annotations
import argparse
import time
from pathlib import Path
def main():
ap = argparse.ArgumentParser()
ap.add_argument("run_dir", type=Path)
ap.add_argument("--timeout", type=int, default=900)
args = ap.parse_args()
deadline = time.time() + args.timeout
last_ckpts = set()
while time.time() < deadline:
ckpts = set(args.run_dir.glob("*.pt"))
new = ckpts - last_ckpts
if new:
for c in new:
print(f"[snapshot] new ckpt: {c}")
return
last_ckpts = ckpts
time.sleep(5)
print("[snapshot] timeout, no new ckpt")
# Script entry point: run the watcher only when executed directly, not on import.
if __name__ == "__main__":
    raise SystemExit(main())
|