| """Inject a checkpoint save into a running training process via py-spy/gdb. | |
| Simpler alternative: this script doesn't actually inject — it just waits for | |
| the next natural ckpt and reports it. For an immediate snapshot, the | |
| trainer needs SIGUSR1 handling (added in a follow-up commit). | |
| Usage: python snapshot_now.py /workspace/runs/<run_name> | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import time | |
| from pathlib import Path | |
def main(argv=None) -> None:
    """Wait for the next checkpoint to appear in a run directory and report it.

    Polls ``run_dir`` for ``*.pt`` files and prints every file that was not
    there on the previous poll, then returns. Checkpoints that already exist
    when the script starts are treated as the baseline and are NOT reported.
    If nothing new shows up within ``--timeout`` seconds, a timeout message
    is printed instead.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``, so running
            the file as a script behaves as before.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("run_dir", type=Path)
    ap.add_argument("--timeout", type=int, default=900)
    ap.add_argument(
        "--interval",
        type=float,
        default=5.0,
        help="seconds to wait between directory polls (default: 5)",
    )
    args = ap.parse_args(argv)
    if args.interval <= 0:
        ap.error("--interval must be positive")

    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + args.timeout

    # Baseline taken BEFORE polling starts. Starting from an empty set would
    # make every pre-existing checkpoint look "new" on the first iteration and
    # the script would return immediately without waiting for anything.
    last_ckpts = set(args.run_dir.glob("*.pt"))

    while time.monotonic() < deadline:
        ckpts = set(args.run_dir.glob("*.pt"))
        new = ckpts - last_ckpts
        if new:
            # Sorted so the report order is deterministic.
            for c in sorted(new):
                print(f"[snapshot] new ckpt: {c}")
            return
        # Roll the baseline forward so a checkpoint that is deleted and later
        # re-created under the same name is still detected.
        last_ckpts = ckpts
        # Never sleep past the deadline.
        remaining = deadline - time.monotonic()
        time.sleep(max(0.0, min(args.interval, remaining)))

    print("[snapshot] timeout, no new ckpt")
# Script entry point: run the watcher only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()