Michael Rabinovich Cursor commited on
Commit ·
ed2a486
1
Parent(s): 545c818
submit: drop eval workers 4 -> 1 (OOM under parallel osmesa)
Browse files
submit.py
CHANGED
|
@@ -72,6 +72,7 @@ import sys
|
|
| 72 |
import tempfile
|
| 73 |
import threading
|
| 74 |
import zipfile
|
|
|
|
| 75 |
from datetime import datetime, timedelta, timezone
|
| 76 |
from pathlib import Path
|
| 77 |
from typing import Any
|
|
@@ -103,12 +104,14 @@ DATA_REV_SHORT_LEN = 12
|
|
| 103 |
FAILURE_REASON_MAX_CHARS = 200
|
| 104 |
EVAL_TIMEOUT_SECONDS = 15 * 60
|
| 105 |
REPORT_TIMEOUT_SECONDS = 2 * 60
|
| 106 |
-
# Per-fixture eval workers.
|
| 107 |
-
#
|
| 108 |
-
#
|
| 109 |
-
#
|
| 110 |
-
#
|
| 111 |
-
|
|
|
|
|
|
|
| 112 |
SHA256_BLOCK_SIZE = 64 * 1024
|
| 113 |
STUCK_PENDING_THRESHOLD_SECONDS = 30 * 60
|
| 114 |
SUBMITTED_AT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
|
|
@@ -137,7 +140,9 @@ class _HubWriteError(Exception):
|
|
| 137 |
|
| 138 |
|
| 139 |
def handle_submit(
|
| 140 |
-
zip_file,
|
|
|
|
|
|
|
| 141 |
) -> None:
|
| 142 |
"""Validate a submission upload; surface progress + outcome via toasts.
|
| 143 |
|
|
@@ -148,18 +153,18 @@ def handle_submit(
|
|
| 148 |
defensively if it's called without a profile so a UI mishap
|
| 149 |
can't write an anonymous row.
|
| 150 |
|
| 151 |
-
Side-effect-only (returns ``None``):
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
|
|
|
| 156 |
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
been spawned.
|
| 163 |
|
| 164 |
On rejection (login-missing, form-level, validation gate, dedup,
|
| 165 |
or Hub write), a single ``gr.Error`` toast carries the message;
|
|
@@ -181,11 +186,17 @@ def handle_submit(
|
|
| 181 |
run_dir = tmp / "run"
|
| 182 |
run_dir.mkdir()
|
| 183 |
try:
|
| 184 |
-
|
| 185 |
try:
|
| 186 |
_extract_zip(zip_path, run_dir)
|
|
|
|
| 187 |
meta = _load_and_validate_meta(run_dir)
|
|
|
|
| 188 |
fixture_names = _validate_fixture_set(run_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
_validate_steps_parseable(run_dir, fixture_names)
|
| 190 |
except _ValidationError as e:
|
| 191 |
raise gr.Error(f"Submission rejected: {e}")
|
|
@@ -193,6 +204,7 @@ def handle_submit(
|
|
| 193 |
# Dedup gate: hash the raw zip bytes and reject if an existing
|
| 194 |
# row carries the same hash. Runs after validation so a clearly
|
| 195 |
# malformed upload still gets the specific validation error.
|
|
|
|
| 196 |
zip_sha256 = _compute_sha256(zip_path)
|
| 197 |
existing_id = _find_existing_submission_by_sha256(zip_sha256)
|
| 198 |
if existing_id is not None:
|
|
@@ -206,17 +218,20 @@ def handle_submit(
|
|
| 206 |
meta["submitter_name"], meta["submission_name"]
|
| 207 |
)
|
| 208 |
try:
|
|
|
|
| 209 |
blob_url = _upload_submission_zip(submission_id, zip_path)
|
| 210 |
row = _build_pending_row(
|
| 211 |
submission_id, meta, blob_url, zip_sha256,
|
| 212 |
hf_username=profile.username,
|
| 213 |
)
|
|
|
|
| 214 |
_append_pending_row(row)
|
| 215 |
except _HubWriteError as e:
|
| 216 |
raise gr.Error(f"Submission rejected: {e}")
|
| 217 |
|
| 218 |
_spawn_worker(submission_id, tmp, run_dir)
|
| 219 |
tmp = None # ownership transferred; skip cleanup below
|
|
|
|
| 220 |
gr.Info(
|
| 221 |
f"Submission {submission_id} queued for evaluation "
|
| 222 |
f"({len(fixture_names)} fixtures). Evaluation typically "
|
|
@@ -360,7 +375,16 @@ def _validate_fixture_set(unpacked: Path) -> set[str]:
|
|
| 360 |
|
| 361 |
|
| 362 |
def _validate_steps_parseable(unpacked: Path, fixture_names: set[str]) -> None:
|
| 363 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
step = unpacked / name / "output.step"
|
| 365 |
if not step.is_file():
|
| 366 |
raise _ValidationError(
|
|
@@ -378,6 +402,11 @@ def _validate_steps_parseable(unpacked: Path, fixture_names: set[str]) -> None:
|
|
| 378 |
f"as STEP geometry: {e}"
|
| 379 |
) from e
|
| 380 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
|
| 382 |
def _mint_submission_id(submitter_name: str, submission_name: str) -> str:
|
| 383 |
"""Build the basename used for ``submissions/<id>.zip`` and ``reports/<id>.*``."""
|
|
|
|
| 72 |
import tempfile
|
| 73 |
import threading
|
| 74 |
import zipfile
|
| 75 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 76 |
from datetime import datetime, timedelta, timezone
|
| 77 |
from pathlib import Path
|
| 78 |
from typing import Any
|
|
|
|
| 104 |
FAILURE_REASON_MAX_CHARS = 200
|
| 105 |
EVAL_TIMEOUT_SECONDS = 15 * 60
|
| 106 |
REPORT_TIMEOUT_SECONDS = 2 * 60
|
| 107 |
+
# Per-fixture eval workers. Set to "1" (sequential) because 4 parallel
|
| 108 |
+
# workers OOM-kill on the Space's cpu-upgrade tier: cadquery-ocp +
|
| 109 |
+
# manifold3d + VTK osmesa state per worker, times 4, exceeds the box's
|
| 110 |
+
# RAM and the kernel sends a SIGKILL that surfaces in the parent as
|
| 111 |
+
# concurrent.futures.process.BrokenProcessPool after ~5 min of render.
|
| 112 |
+
# Revisit once we have a hardware tier with more RAM, or once the
|
| 113 |
+
# per-worker footprint shrinks (e.g. lazier OCC imports).
|
| 114 |
+
EVAL_WORKER_COUNT = "1"
|
| 115 |
SHA256_BLOCK_SIZE = 64 * 1024
|
| 116 |
STUCK_PENDING_THRESHOLD_SECONDS = 30 * 60
|
| 117 |
SUBMITTED_AT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
|
|
|
|
| 140 |
|
| 141 |
|
| 142 |
def handle_submit(
|
| 143 |
+
zip_file,
|
| 144 |
+
profile: gr.OAuthProfile | None,
|
| 145 |
+
progress: gr.Progress = gr.Progress(),
|
| 146 |
) -> None:
|
| 147 |
"""Validate a submission upload; surface progress + outcome via toasts.
|
| 148 |
|
|
|
|
| 153 |
defensively if it's called without a profile so a UI mishap
|
| 154 |
can't write an anonymous row.
|
| 155 |
|
| 156 |
+
Side-effect-only (returns ``None``): in-flight status drives the
|
| 157 |
+
Gradio submit button's progress tracker (``progress(fraction,
|
| 158 |
+
desc=...)``), and the final outcome lands in a ``gr.Info`` /
|
| 159 |
+
``gr.Error`` toast. Rejection paths raise ``gr.Error``, which
|
| 160 |
+
Gradio surfaces as a red toast and aborts the handler; the outer
|
| 161 |
+
``try/finally`` still runs to clean up the temp dir.
|
| 162 |
|
| 163 |
+
Progress milestones on the happy path: extract zip (0.05),
|
| 164 |
+
validate meta (0.15), check fixture set (0.30), parse STEPs
|
| 165 |
+
(0.45), dedup hash + Hub check (0.70), upload zip (0.85), queue
|
| 166 |
+
eval (0.95), queued (1.0). The single ``gr.Info("Submission <id>
|
| 167 |
+
queued ...")`` toast fires after the worker is spawned.
|
|
|
|
| 168 |
|
| 169 |
On rejection (login-missing, form-level, validation gate, dedup,
|
| 170 |
or Hub write), a single ``gr.Error`` toast carries the message;
|
|
|
|
| 186 |
run_dir = tmp / "run"
|
| 187 |
run_dir.mkdir()
|
| 188 |
try:
|
| 189 |
+
progress(0.05, desc="Extracting zip")
|
| 190 |
try:
|
| 191 |
_extract_zip(zip_path, run_dir)
|
| 192 |
+
progress(0.15, desc="Validating meta.json")
|
| 193 |
meta = _load_and_validate_meta(run_dir)
|
| 194 |
+
progress(0.30, desc="Checking fixture set")
|
| 195 |
fixture_names = _validate_fixture_set(run_dir)
|
| 196 |
+
progress(
|
| 197 |
+
0.45,
|
| 198 |
+
desc=f"Parsing {len(fixture_names)} STEP files",
|
| 199 |
+
)
|
| 200 |
_validate_steps_parseable(run_dir, fixture_names)
|
| 201 |
except _ValidationError as e:
|
| 202 |
raise gr.Error(f"Submission rejected: {e}")
|
|
|
|
| 204 |
# Dedup gate: hash the raw zip bytes and reject if an existing
|
| 205 |
# row carries the same hash. Runs after validation so a clearly
|
| 206 |
# malformed upload still gets the specific validation error.
|
| 207 |
+
progress(0.70, desc="Checking for duplicates")
|
| 208 |
zip_sha256 = _compute_sha256(zip_path)
|
| 209 |
existing_id = _find_existing_submission_by_sha256(zip_sha256)
|
| 210 |
if existing_id is not None:
|
|
|
|
| 218 |
meta["submitter_name"], meta["submission_name"]
|
| 219 |
)
|
| 220 |
try:
|
| 221 |
+
progress(0.85, desc="Uploading submission")
|
| 222 |
blob_url = _upload_submission_zip(submission_id, zip_path)
|
| 223 |
row = _build_pending_row(
|
| 224 |
submission_id, meta, blob_url, zip_sha256,
|
| 225 |
hf_username=profile.username,
|
| 226 |
)
|
| 227 |
+
progress(0.95, desc="Queuing evaluation")
|
| 228 |
_append_pending_row(row)
|
| 229 |
except _HubWriteError as e:
|
| 230 |
raise gr.Error(f"Submission rejected: {e}")
|
| 231 |
|
| 232 |
_spawn_worker(submission_id, tmp, run_dir)
|
| 233 |
tmp = None # ownership transferred; skip cleanup below
|
| 234 |
+
progress(1.0, desc="Queued")
|
| 235 |
gr.Info(
|
| 236 |
f"Submission {submission_id} queued for evaluation "
|
| 237 |
f"({len(fixture_names)} fixtures). Evaluation typically "
|
|
|
|
| 375 |
|
| 376 |
|
| 377 |
def _validate_steps_parseable(unpacked: Path, fixture_names: set[str]) -> None:
|
| 378 |
+
# Threads (not processes): OCC's parse_step releases the GIL during
|
| 379 |
+
# the C++ STEP read, and this gate doesn't touch the VTK renderer
|
| 380 |
+
# (which is the only piece in the eval pipeline that needs the
|
| 381 |
+
# ProcessPoolExecutor + spawn dance). Per-fixture I/O + OCC load is
|
| 382 |
+
# 1-5s, so fanning out a 5+ fixture set across cpu-upgrade vCPUs
|
| 383 |
+
# cuts wall time roughly linearly. ex.map raises the first child
|
| 384 |
+
# exception when its iterator is consumed, so wrapping in list()
|
| 385 |
+
# preserves the same `Fixture <name>` rejection text as the
|
| 386 |
+
# sequential loop did.
|
| 387 |
+
def _check_one_step(name: str) -> None:
|
| 388 |
step = unpacked / name / "output.step"
|
| 389 |
if not step.is_file():
|
| 390 |
raise _ValidationError(
|
|
|
|
| 402 |
f"as STEP geometry: {e}"
|
| 403 |
) from e
|
| 404 |
|
| 405 |
+
with ThreadPoolExecutor(
|
| 406 |
+
max_workers=min(8, os.cpu_count() or 1),
|
| 407 |
+
) as ex:
|
| 408 |
+
list(ex.map(_check_one_step, sorted(fixture_names)))
|
| 409 |
+
|
| 410 |
|
| 411 |
def _mint_submission_id(submitter_name: str, submission_name: str) -> str:
|
| 412 |
"""Build the basename used for ``submissions/<id>.zip`` and ``reports/<id>.*``."""
|