Michael Rabinovich committed on
Commit
8762996
·
1 Parent(s): b1601ec

space: add rebuild and probe helpers

Browse files
Files changed (2) hide show
  1. _probe_job.py +63 -0
  2. _rebuild_space.py +69 -0
_probe_job.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Run a tiny cpu-basic Job on the eval-gpu image to verify it has the fix.
2
+
3
+ Submits with the cached huggingface_hub login token (jobs-hf PAT, job.write on michaelr27); this script does not read ../cadgenbench/.env.
4
+ Prints the job's HAS_FIX line and exit status.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import sys
9
+ import time
10
+ from pathlib import Path
11
+
12
# Docker image reference of the eval-gpu Space that the probe Job runs on.
IMAGE = "hf.co/spaces/HuggingAI4Engineering/cadgenbench-eval-gpu"
# Namespace the Job is submitted and inspected under.
NAMESPACE = "michaelr27"

# The probe is a one-liner handed to ``python -c``; each entry below is one
# statement, and they are glued together with "; " to form the final program.
_PROBE_STATEMENTS = (
    "import pathlib, cadgenbench",
    "b = pathlib.Path(cadgenbench.__file__).parent / 'baseline'",
    "cl = (b / 'compare_llms.py').read_text()",
    "ag = (b / 'agent.py').read_text()",
    # HAS_FIX is True only when all three marker strings are present.
    "print('HAS_FIX', ('_model_pool_backstop_s' in cl)"
    " and ('_terminate_pool_workers' in cl)"
    " and ('_shutdown_render_pool()' in ag))",
    "print('HAS_PACKAGE', (b / 'package.py').is_file())",
    "import importlib",
    # Raises inside the Job if the module is missing, so the last print
    # is reached only when the import succeeded.
    "importlib.import_module('cadgenbench.common.baseline_models')",
    "print('HAS_BASELINE_MODELS', True)",
)
PROBE = "; ".join(_PROBE_STATEMENTS)
27
+
28
+
29
def main() -> int:
    """Submit the probe Job, poll it until it settles, and print its logs.

    The Job runs ``python -c PROBE`` on ``IMAGE`` with the ``cpu-basic``
    flavor under ``NAMESPACE``. Its stage is polled every few seconds and
    its logs are printed once polling stops, whatever the outcome.

    Returns:
        0 if the Job reached ``COMPLETED``, 1 if it ended in another
        terminal stage (``ERROR``, ``CANCELED``, ``DELETED``), and 2 if no
        terminal stage was observed within the polling window.
    """
    from huggingface_hub import HfApi

    timeout_s = 600  # used both as the Job timeout and the polling budget
    poll_interval_s = 8
    # CANCELED is terminal too; without it a cancelled Job would be polled
    # until the whole budget is exhausted.
    terminal_stages = {"COMPLETED", "CANCELED", "ERROR", "DELETED"}

    # Use the cached login token (jobs-hf PAT, job.write on michaelr27); the
    # .env HF_TOKEN is the job's *secret* for reading GT, not the submitter.
    api = HfApi()
    job = api.run_job(
        image=IMAGE,
        command=["python", "-c", PROBE],
        flavor="cpu-basic",
        namespace=NAMESPACE,
        timeout=timeout_s,
    )
    print(f"[probe] job_id={job.id} status={job.status}", flush=True)

    stage_name = None
    t0 = time.monotonic()
    while time.monotonic() - t0 < timeout_s:
        info = api.inspect_job(job_id=job.id, namespace=NAMESPACE)
        stage = getattr(info.status, "stage", info.status)
        # An Enum member stringifies as "Class.MEMBER", which would never
        # match the plain names above; compare on its value instead.
        stage_name = str(getattr(stage, "value", stage))
        print(
            f"[probe] {time.monotonic() - t0:5.0f}s stage={stage_name}",
            flush=True,
        )
        if stage_name in terminal_stages:
            break
        time.sleep(poll_interval_s)
    else:
        # Loop condition expired without a break: no terminal stage seen.
        stage_name = None
        print("[probe] TIMEOUT waiting for job", flush=True)

    print("[probe] ---- logs ----", flush=True)
    try:
        for line in api.fetch_job_logs(job_id=job.id, namespace=NAMESPACE):
            print(line, flush=True)
    except Exception as e:  # noqa: BLE001
        # Log retrieval is best-effort; the exit code reflects the Job only.
        print(f"[probe] log fetch error: {e}", flush=True)

    if stage_name is None:
        return 2
    if stage_name != "COMPLETED":
        print(f"[probe] job did not complete (stage={stage_name})", flush=True)
        return 1
    return 0
60
+
61
+
62
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
_rebuild_space.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Trigger a rebuild of the (paused) eval-gpu Space image and poll status.
2
+
3
+ Usage: python _rebuild_space.py [--pause-when-done] [--timeout SECONDS]
4
+ Uses HF_REBUILD_TOKEN from the environment when set; otherwise reads HF_TOKEN from ../cadgenbench/.env.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import os
10
+ import sys
11
+ import time
12
+ from pathlib import Path
13
+
14
+ SID = "HuggingAI4Engineering/cadgenbench-eval-gpu"
15
+
16
+
17
def _load_token() -> str:
    """Return the Hugging Face token used to talk to the Space.

    Lookup order:
      1. The ``HF_REBUILD_TOKEN`` environment variable, if it holds a
         non-blank value.
      2. The first non-empty ``HF_TOKEN=...`` entry in
         ``../cadgenbench/.env`` (relative to this file's directory). A
         leading ``export`` and one pair of surrounding quotes are removed.

    Returns:
        The token with surrounding whitespace removed.

    Raises:
        SystemExit: if the ``.env`` file cannot be read, or it contains no
            non-empty ``HF_TOKEN`` entry.
    """
    # Prefer an explicitly-provided token (e.g. pulled from git's credential
    # helper, which has write on the Space) over the .env jobs-hf PAT, which
    # is read-only on this Space and 403s on restart_space.
    # Strip before testing so a whitespace-only value is treated as unset
    # instead of being returned as an empty token.
    explicit = os.environ.get("HF_REBUILD_TOKEN", "").strip()
    if explicit:
        return explicit

    env = Path(__file__).resolve().parent.parent / "cadgenbench" / ".env"
    try:
        text = env.read_text(encoding="utf-8")
    except OSError as err:
        # Exit with a readable message rather than a raw traceback.
        raise SystemExit(f"cannot read {env}: {err}") from err

    for raw in text.splitlines():
        line = raw.strip()
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        key, sep, value = line.partition("=")
        if not sep or key.strip() != "HF_TOKEN":
            continue
        value = value.strip()
        # Drop one pair of matching quotes: HF_TOKEN="abc" -> abc.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        if value:
            return value
    raise SystemExit("HF_TOKEN not found in .env")
29
+
30
+
31
def main() -> int:
    """Restart the Space and poll its runtime stage until the build settles.

    Command-line options:
        --pause-when-done: pause the Space after a successful build.
        --timeout: polling budget in seconds (default 900).

    Returns:
        0 when a success stage is reached, 1 when an error stage is
        reached, and 2 when the polling budget runs out first.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--pause-when-done", action="store_true")
    parser.add_argument("--timeout", type=float, default=900.0)
    opts = parser.parse_args()

    from huggingface_hub import HfApi

    hub = HfApi(token=_load_token())
    print(f"[rebuild] restart_space({SID})", flush=True)
    hub.restart_space(SID)

    success_stages = {"RUNNING", "RUNNING_APP_STARTING", "APP_STARTING"}
    failure_stages = {"BUILD_ERROR", "RUNTIME_ERROR", "CONFIG_ERROR"}

    started = time.monotonic()
    previous = None
    while time.monotonic() - started < opts.timeout:
        current = hub.get_space_runtime(SID).stage

        # Report only stage transitions to keep the output short.
        if current != previous:
            elapsed = time.monotonic() - started
            print(f"[rebuild] {elapsed:6.0f}s stage={current}", flush=True)
            previous = current

        if current in failure_stages:
            print(f"[rebuild] BUILD FAILED (stage={current})", flush=True)
            return 1

        if current in success_stages:
            print(f"[rebuild] BUILD OK (stage={current})", flush=True)
            if args_pause := opts.pause_when_done:
                print("[rebuild] pausing space", flush=True)
                hub.pause_space(SID)
                print("[rebuild] paused", flush=True)
            return 0

        time.sleep(15)

    print("[rebuild] TIMEOUT waiting for build", flush=True)
    return 2
66
+
67
+
68
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())