Spaces:
Runtime error
Runtime error
Michael Rabinovich committed on
Commit ·
8762996
1
Parent(s): b1601ec
space: add rebuild and probe helpers
Browse files- _probe_job.py +63 -0
- _rebuild_space.py +69 -0
_probe_job.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Run a tiny cpu-basic Job on the eval-gpu image to verify it has the fix.
|
| 2 |
+
|
| 3 |
+
Submits with the cached huggingface_hub login token (jobs-hf PAT, job.write on
michaelr27); the HF_TOKEN in ../cadgenbench/.env is not read by this script.
|
| 4 |
+
Prints the job's HAS_FIX line and exit status.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
import time
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
# Image the probe job runs in (passed as ``image=`` to ``run_job``).
IMAGE = "hf.co/spaces/HuggingAI4Engineering/cadgenbench-eval-gpu"
|
| 13 |
+
# Namespace used both to submit the job and to inspect it / fetch its logs.
NAMESPACE = "michaelr27"
|
| 14 |
+
|
| 15 |
+
# Statements of the one-liner the job executes via ``python -c``.  They read
# ``compare_llms.py`` and ``agent.py`` from the installed ``cadgenbench``
# package's ``baseline`` directory and print:
#   HAS_FIX             - whether all three marker strings are present,
#   HAS_PACKAGE         - whether ``baseline/package.py`` exists,
#   HAS_BASELINE_MODELS - True once ``cadgenbench.common.baseline_models``
#                         has imported (a failing import raises before the
#                         print is reached).
_PROBE_STEPS = (
    "import pathlib, cadgenbench",
    "b = pathlib.Path(cadgenbench.__file__).parent / 'baseline'",
    "cl = (b / 'compare_llms.py').read_text()",
    "ag = (b / 'agent.py').read_text()",
    "print('HAS_FIX', ('_model_pool_backstop_s' in cl)"
    " and ('_terminate_pool_workers' in cl)"
    " and ('_shutdown_render_pool()' in ag))",
    "print('HAS_PACKAGE', (b / 'package.py').is_file())",
    "import importlib",
    "importlib.import_module('cadgenbench.common.baseline_models')",
    "print('HAS_BASELINE_MODELS', True)",
)

# Single-line program: the statements above separated by "; ".
PROBE = "; ".join(_PROBE_STEPS)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def main() -> int:
    """Submit the probe job, poll it until it finishes, then print its logs.

    Returns:
        0 if the job reached the ``COMPLETED`` stage, 1 if it ended in any
        other terminal stage (``ERROR``, ``CANCELED``, ``DELETED``), and 2 if
        no terminal stage was observed within the polling window.
    """
    from huggingface_hub import HfApi

    # One budget for both the job-side timeout and the local polling window.
    timeout_s = 600
    poll_interval_s = 8
    terminal_stages = {"COMPLETED", "ERROR", "CANCELED", "DELETED"}

    # Use the cached login token (jobs-hf PAT, job.write on michaelr27); the
    # .env HF_TOKEN is the job's *secret* for reading GT, not the submitter.
    api = HfApi()
    job = api.run_job(
        image=IMAGE,
        command=["python", "-c", PROBE],
        flavor="cpu-basic",
        namespace=NAMESPACE,
        timeout=timeout_s,
    )
    print(f"[probe] job_id={job.id} status={job.status}", flush=True)

    t0 = time.monotonic()
    final_stage = None
    while time.monotonic() - t0 < timeout_s:
        info = api.inspect_job(job_id=job.id, namespace=NAMESPACE)
        stage = getattr(info.status, "stage", info.status)
        # Accept either a plain string or an enum member: str() on a
        # str-mixin Enum yields "Class.MEMBER", which would never match.
        stage = str(getattr(stage, "value", stage))
        print(f"[probe] {time.monotonic()-t0:5.0f}s stage={stage}", flush=True)
        if stage in terminal_stages:
            final_stage = stage
            break
        time.sleep(poll_interval_s)

    print("[probe] ---- logs ----", flush=True)
    # Log retrieval is best-effort: a failure here is reported but does not
    # change the exit code, which reflects the job's final stage only.
    try:
        for line in api.fetch_job_logs(job_id=job.id, namespace=NAMESPACE):
            print(line, flush=True)
    except Exception as e:  # noqa: BLE001
        print(f"[probe] log fetch error: {e}", flush=True)

    if final_stage is None:
        print("[probe] TIMEOUT waiting for job", flush=True)
        return 2
    return 0 if final_stage == "COMPLETED" else 1
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
_rebuild_space.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Trigger a rebuild of the (paused) eval-gpu Space image and poll status.
|
| 2 |
+
|
| 3 |
+
Usage: python _rebuild_space.py [--pause-when-done] [--timeout SECONDS]
|
| 4 |
+
Uses the HF_REBUILD_TOKEN environment variable when set; otherwise reads
HF_TOKEN from ../cadgenbench/.env.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import argparse
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
import time
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
# Space id passed to restart_space / get_space_runtime / pause_space.
SID = "HuggingAI4Engineering/cadgenbench-eval-gpu"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _load_token(env_path: Path | None = None) -> str:
    """Return the Hugging Face token used for the Space API calls.

    Lookup order:

    1. ``HF_REBUILD_TOKEN`` from the process environment, if non-blank.
    2. The first non-empty ``HF_TOKEN`` entry in the ``.env`` file.  An
       optional ``export `` prefix, surrounding whitespace and one pair of
       matching quotes around the value are tolerated.

    Args:
        env_path: ``.env`` file to read.  Defaults to ``cadgenbench/.env``
            next to this script's parent directory.

    Returns:
        The token with surrounding whitespace (and quotes) removed.

    Raises:
        SystemExit: if the ``.env`` file cannot be read or holds no usable
            ``HF_TOKEN`` entry.
    """
    # Prefer an explicitly-provided token (e.g. pulled from git's credential
    # helper, which has write on the Space) over the .env jobs-hf PAT, which
    # is read-only on this Space and 403s on restart_space.
    explicit = os.environ.get("HF_REBUILD_TOKEN", "").strip()
    if explicit:
        return explicit

    if env_path is None:
        env_path = Path(__file__).resolve().parent.parent / "cadgenbench" / ".env"
    try:
        text = Path(env_path).read_text(encoding="utf-8")
    except OSError as err:
        # Exit with a readable message instead of a traceback.
        raise SystemExit(f"cannot read {env_path}: {err}") from err

    for raw in text.splitlines():
        line = raw.strip()
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        key, sep, value = line.partition("=")
        if not sep or key.strip() != "HF_TOKEN":
            continue
        value = value.strip()
        # Drop one pair of matching quotes; they are not part of the token.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        if value:
            return value
    raise SystemExit("HF_TOKEN not found in .env")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def main() -> int:
    """Restart the Space, then poll its runtime stage until it settles.

    Command-line flags:
        --pause-when-done: pause the Space once a success stage is seen.
        --timeout: seconds to keep polling (default 900).

    Returns:
        0 when the stage is RUNNING, RUNNING_APP_STARTING or APP_STARTING;
        1 when it is BUILD_ERROR, RUNTIME_ERROR or CONFIG_ERROR;
        2 when the timeout elapses before either set is reached.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--pause-when-done", action="store_true")
    parser.add_argument("--timeout", type=float, default=900.0)
    opts = parser.parse_args()

    # Deferred import: argument parsing runs before huggingface_hub is needed.
    from huggingface_hub import HfApi

    hub = HfApi(token=_load_token())
    print(f"[rebuild] restart_space({SID})", flush=True)
    hub.restart_space(SID)

    success_stages = {"RUNNING", "RUNNING_APP_STARTING", "APP_STARTING"}
    failure_stages = {"BUILD_ERROR", "RUNTIME_ERROR", "CONFIG_ERROR"}

    started = time.monotonic()
    previous = None
    while time.monotonic() - started < opts.timeout:
        stage = hub.get_space_runtime(SID).stage

        # Log only transitions, so a long build does not flood the output.
        if stage != previous:
            elapsed = time.monotonic() - started
            print(f"[rebuild] {elapsed:6.0f}s stage={stage}", flush=True)
            previous = stage

        if stage in failure_stages:
            print(f"[rebuild] BUILD FAILED (stage={stage})", flush=True)
            return 1

        if stage not in success_stages:
            # Still in an intermediate stage; wait before polling again.
            time.sleep(15)
            continue

        print(f"[rebuild] BUILD OK (stage={stage})", flush=True)
        if opts.pause_when_done:
            print("[rebuild] pausing space", flush=True)
            hub.pause_space(SID)
            print("[rebuild] paused", flush=True)
        return 0
    else:
        # Loop condition expired without reaching a terminal stage.
        print("[rebuild] TIMEOUT waiting for build", flush=True)
        return 2
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# Script entry point: main()'s return value (0/1/2) becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|