Michael Rabinovich Cursor commited on
Commit
ed2a486
·
1 Parent(s): 545c818

submit: drop eval workers 4 -> 1 (OOM under parallel osmesa)

Browse files
Files changed (1) hide show
  1. submit.py +49 -20
submit.py CHANGED
@@ -72,6 +72,7 @@ import sys
72
  import tempfile
73
  import threading
74
  import zipfile
 
75
  from datetime import datetime, timedelta, timezone
76
  from pathlib import Path
77
  from typing import Any
@@ -103,12 +104,14 @@ DATA_REV_SHORT_LEN = 12
103
  FAILURE_REASON_MAX_CHARS = 200
104
  EVAL_TIMEOUT_SECONDS = 15 * 60
105
  REPORT_TIMEOUT_SECONDS = 2 * 60
106
- # Per-fixture eval workers. The in-process VTK renderer (PyVista off-
107
- # screen via Mesa software EGL on the Space's cpu-upgrade tier) is
108
- # cheap enough that one ProcessPoolExecutor worker per fixture no
109
- # longer oversubscribes the host the way the old Chromium subprocess
110
- # did. "4" matches the cpu-upgrade vCPU count.
111
- EVAL_WORKER_COUNT = "4"
 
 
112
  SHA256_BLOCK_SIZE = 64 * 1024
113
  STUCK_PENDING_THRESHOLD_SECONDS = 30 * 60
114
  SUBMITTED_AT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
@@ -137,7 +140,9 @@ class _HubWriteError(Exception):
137
 
138
 
139
  def handle_submit(
140
- zip_file, profile: gr.OAuthProfile | None,
 
 
141
  ) -> None:
142
  """Validate a submission upload; surface progress + outcome via toasts.
143
 
@@ -148,18 +153,18 @@ def handle_submit(
148
  defensively if it's called without a profile so a UI mishap
149
  can't write an anonymous row.
150
 
151
- Side-effect-only (returns ``None``): every status update lands in
152
- a ``gr.Info`` / ``gr.Error`` toast instead of a static markdown
153
- output below the button. Rejection paths raise ``gr.Error``,
154
- which Gradio surfaces as a red toast and aborts the handler; the
155
- outer ``try/finally`` still runs to clean up the temp dir.
 
156
 
157
- Toast sequence on the happy path:
158
- 1. ``gr.Info("Validating submission...")`` once the form-level
159
- check is past.
160
- 2. ``gr.Info("Submission <id> queued for evaluation...")`` once
161
- the pending row + zip are on the Hub and the worker has
162
- been spawned.
163
 
164
  On rejection (login-missing, form-level, validation gate, dedup,
165
  or Hub write), a single ``gr.Error`` toast carries the message;
@@ -181,11 +186,17 @@ def handle_submit(
181
  run_dir = tmp / "run"
182
  run_dir.mkdir()
183
  try:
184
- gr.Info("Validating submission...")
185
  try:
186
  _extract_zip(zip_path, run_dir)
 
187
  meta = _load_and_validate_meta(run_dir)
 
188
  fixture_names = _validate_fixture_set(run_dir)
 
 
 
 
189
  _validate_steps_parseable(run_dir, fixture_names)
190
  except _ValidationError as e:
191
  raise gr.Error(f"Submission rejected: {e}")
@@ -193,6 +204,7 @@ def handle_submit(
193
  # Dedup gate: hash the raw zip bytes and reject if an existing
194
  # row carries the same hash. Runs after validation so a clearly
195
  # malformed upload still gets the specific validation error.
 
196
  zip_sha256 = _compute_sha256(zip_path)
197
  existing_id = _find_existing_submission_by_sha256(zip_sha256)
198
  if existing_id is not None:
@@ -206,17 +218,20 @@ def handle_submit(
206
  meta["submitter_name"], meta["submission_name"]
207
  )
208
  try:
 
209
  blob_url = _upload_submission_zip(submission_id, zip_path)
210
  row = _build_pending_row(
211
  submission_id, meta, blob_url, zip_sha256,
212
  hf_username=profile.username,
213
  )
 
214
  _append_pending_row(row)
215
  except _HubWriteError as e:
216
  raise gr.Error(f"Submission rejected: {e}")
217
 
218
  _spawn_worker(submission_id, tmp, run_dir)
219
  tmp = None # ownership transferred; skip cleanup below
 
220
  gr.Info(
221
  f"Submission {submission_id} queued for evaluation "
222
  f"({len(fixture_names)} fixtures). Evaluation typically "
@@ -360,7 +375,16 @@ def _validate_fixture_set(unpacked: Path) -> set[str]:
360
 
361
 
362
  def _validate_steps_parseable(unpacked: Path, fixture_names: set[str]) -> None:
363
- for name in sorted(fixture_names):
 
 
 
 
 
 
 
 
 
364
  step = unpacked / name / "output.step"
365
  if not step.is_file():
366
  raise _ValidationError(
@@ -378,6 +402,11 @@ def _validate_steps_parseable(unpacked: Path, fixture_names: set[str]) -> None:
378
  f"as STEP geometry: {e}"
379
  ) from e
380
 
 
 
 
 
 
381
 
382
  def _mint_submission_id(submitter_name: str, submission_name: str) -> str:
383
  """Build the basename used for ``submissions/<id>.zip`` and ``reports/<id>.*``."""
 
72
  import tempfile
73
  import threading
74
  import zipfile
75
+ from concurrent.futures import ThreadPoolExecutor
76
  from datetime import datetime, timedelta, timezone
77
  from pathlib import Path
78
  from typing import Any
 
104
  FAILURE_REASON_MAX_CHARS = 200
105
  EVAL_TIMEOUT_SECONDS = 15 * 60
106
  REPORT_TIMEOUT_SECONDS = 2 * 60
107
+ # Per-fixture eval workers. Set to "1" (sequential) because 4 parallel
108
+ # workers OOM-kill on the Space's cpu-upgrade tier: cadquery-ocp +
109
+ # manifold3d + VTK osmesa state per worker, times 4, exceeds the box's
110
+ # RAM and the kernel sends a SIGKILL that surfaces in the parent as
111
+ # concurrent.futures.process.BrokenProcessPool after ~5 min of render.
112
+ # Revisit once we have a hardware tier with more RAM, or once the
113
+ # per-worker footprint shrinks (e.g. lazier OCC imports).
114
+ EVAL_WORKER_COUNT = "1"
115
  SHA256_BLOCK_SIZE = 64 * 1024
116
  STUCK_PENDING_THRESHOLD_SECONDS = 30 * 60
117
  SUBMITTED_AT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
 
140
 
141
 
142
  def handle_submit(
143
+ zip_file,
144
+ profile: gr.OAuthProfile | None,
145
+ progress: gr.Progress = gr.Progress(),
146
  ) -> None:
147
  """Validate a submission upload; surface progress + outcome via toasts.
148
 
 
153
  defensively if it's called without a profile so a UI mishap
154
  can't write an anonymous row.
155
 
156
+ Side-effect-only (returns ``None``): in-flight status drives the
157
+ Gradio submit button's progress tracker (``progress(fraction,
158
+ desc=...)``), and the final outcome lands in a ``gr.Info`` /
159
+ ``gr.Error`` toast. Rejection paths raise ``gr.Error``, which
160
+ Gradio surfaces as a red toast and aborts the handler; the outer
161
+ ``try/finally`` still runs to clean up the temp dir.
162
 
163
+ Progress milestones on the happy path: extract zip (0.05),
164
+ validate meta (0.15), check fixture set (0.30), parse STEPs
165
+ (0.45), dedup hash + Hub check (0.70), upload zip (0.85), queue
166
+ eval (0.95), queued (1.0). The single ``gr.Info("Submission <id>
167
+ queued ...")`` toast fires after the worker is spawned.
 
168
 
169
  On rejection (login-missing, form-level, validation gate, dedup,
170
  or Hub write), a single ``gr.Error`` toast carries the message;
 
186
  run_dir = tmp / "run"
187
  run_dir.mkdir()
188
  try:
189
+ progress(0.05, desc="Extracting zip")
190
  try:
191
  _extract_zip(zip_path, run_dir)
192
+ progress(0.15, desc="Validating meta.json")
193
  meta = _load_and_validate_meta(run_dir)
194
+ progress(0.30, desc="Checking fixture set")
195
  fixture_names = _validate_fixture_set(run_dir)
196
+ progress(
197
+ 0.45,
198
+ desc=f"Parsing {len(fixture_names)} STEP files",
199
+ )
200
  _validate_steps_parseable(run_dir, fixture_names)
201
  except _ValidationError as e:
202
  raise gr.Error(f"Submission rejected: {e}")
 
204
  # Dedup gate: hash the raw zip bytes and reject if an existing
205
  # row carries the same hash. Runs after validation so a clearly
206
  # malformed upload still gets the specific validation error.
207
+ progress(0.70, desc="Checking for duplicates")
208
  zip_sha256 = _compute_sha256(zip_path)
209
  existing_id = _find_existing_submission_by_sha256(zip_sha256)
210
  if existing_id is not None:
 
218
  meta["submitter_name"], meta["submission_name"]
219
  )
220
  try:
221
+ progress(0.85, desc="Uploading submission")
222
  blob_url = _upload_submission_zip(submission_id, zip_path)
223
  row = _build_pending_row(
224
  submission_id, meta, blob_url, zip_sha256,
225
  hf_username=profile.username,
226
  )
227
+ progress(0.95, desc="Queuing evaluation")
228
  _append_pending_row(row)
229
  except _HubWriteError as e:
230
  raise gr.Error(f"Submission rejected: {e}")
231
 
232
  _spawn_worker(submission_id, tmp, run_dir)
233
  tmp = None # ownership transferred; skip cleanup below
234
+ progress(1.0, desc="Queued")
235
  gr.Info(
236
  f"Submission {submission_id} queued for evaluation "
237
  f"({len(fixture_names)} fixtures). Evaluation typically "
 
375
 
376
 
377
  def _validate_steps_parseable(unpacked: Path, fixture_names: set[str]) -> None:
378
+ # Threads (not processes): OCC's parse_step releases the GIL during
379
+ # the C++ STEP read, and this gate doesn't touch the VTK renderer
380
+ # (which is the only piece in the eval pipeline that needs the
381
+ # ProcessPoolExecutor + spawn dance). Per-fixture I/O + OCC load is
382
+ # 1-5s, so fanning out a 5+ fixture set across cpu-upgrade vCPUs
383
+ # cuts wall time roughly linearly. ex.map raises the first child
384
+ # exception when its iterator is consumed, so wrapping in list()
385
+ # preserves the same `Fixture <name>` rejection text as the
386
+ # sequential loop did.
387
+ def _check_one_step(name: str) -> None:
388
  step = unpacked / name / "output.step"
389
  if not step.is_file():
390
  raise _ValidationError(
 
402
  f"as STEP geometry: {e}"
403
  ) from e
404
 
405
+ with ThreadPoolExecutor(
406
+ max_workers=min(8, os.cpu_count() or 1),
407
+ ) as ex:
408
+ list(ex.map(_check_one_step, sorted(fixture_names)))
409
+
410
 
411
  def _mint_submission_id(submitter_name: str, submission_name: str) -> str:
412
  """Build the basename used for ``submissions/<id>.zip`` and ``reports/<id>.*``."""