"""Inject a checkpoint save into a running training process via py-spy/gdb. Simpler alternative: this script doesn't actually inject — it just waits for the next natural ckpt and reports it. For an immediate snapshot, the trainer needs SIGUSR1 handling (added in a follow-up commit). Usage: python snapshot_now.py /workspace/runs/ """ from __future__ import annotations import argparse import time from pathlib import Path def main(): ap = argparse.ArgumentParser() ap.add_argument("run_dir", type=Path) ap.add_argument("--timeout", type=int, default=900) args = ap.parse_args() deadline = time.time() + args.timeout last_ckpts = set() while time.time() < deadline: ckpts = set(args.run_dir.glob("*.pt")) new = ckpts - last_ckpts if new: for c in new: print(f"[snapshot] new ckpt: {c}") return last_ckpts = ckpts time.sleep(5) print("[snapshot] timeout, no new ckpt") if __name__ == "__main__": main()