# PhysioJEPA / scripts / snapshot_now.py
# Uploaded by guychuk, commit 31e2456 (verified): "Upload folder using huggingface_hub"
"""Wait for a running training process to write its next checkpoint.

The original idea was to inject a checkpoint save into the running trainer
via py-spy/gdb. This script is the simpler alternative: it doesn't actually
inject anything; it just waits for the next natural ckpt (a ``*.pt`` file in
the run directory) and reports it. For an immediate snapshot, the trainer
needs SIGUSR1 handling (added in a follow-up commit).

Usage: python snapshot_now.py /workspace/runs/<run_name> [--timeout SECONDS]
"""
from __future__ import annotations
import argparse
import time
from pathlib import Path
def main(argv: list[str] | None = None) -> None:
    """Wait for the next ``*.pt`` checkpoint in a run directory and report it.

    Checkpoints that already exist when the script starts are taken as the
    baseline and are not reported; only files whose names appear afterwards
    count as new. A file is reported as soon as its name shows up in the
    directory listing; no check is made that it has been fully written.

    Command-line arguments:
        run_dir: directory to watch for ``*.pt`` files (non-recursive).
        --timeout: maximum number of seconds to wait (default 900).
        --poll-interval: seconds between directory scans (default 5).

    Args:
        argv: argument list to parse instead of ``sys.argv[1:]``; ``None``
            (the default) keeps the normal command-line behaviour.

    Returns:
        None. The outcome is printed to stdout: one ``[snapshot] new ckpt``
        line per new file, or a single timeout message.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("run_dir", type=Path)
    ap.add_argument("--timeout", type=float, default=900)
    ap.add_argument("--poll-interval", type=float, default=5)
    args = ap.parse_args(argv)
    if args.poll_interval <= 0:
        # A zero interval would busy-loop and a negative one makes
        # time.sleep() raise, so reject both up front.
        ap.error("--poll-interval must be positive")

    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + args.timeout

    # Seed the baseline with what is already on disk. Starting from an empty
    # set would make every pre-existing checkpoint look "new" on the first
    # scan and end the wait immediately.
    seen = set(args.run_dir.glob("*.pt"))

    while True:
        current = set(args.run_dir.glob("*.pt"))
        fresh = current - seen
        if fresh:
            # Sorted so the output order is deterministic.
            for ckpt in sorted(fresh):
                print(f"[snapshot] new ckpt: {ckpt}")
            return
        # Track the latest listing (not a running union) so that a checkpoint
        # name that is rotated away and later re-created is reported again.
        seen = current

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline; this also guarantees one last scan
        # right at the deadline before giving up.
        time.sleep(min(args.poll_interval, remaining))

    print("[snapshot] timeout, no new ckpt")
# Run the watcher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()