submit: serialize cadgenbench evaluate to dodge cpu-upgrade contention
EVAL_WORKER_COUNT goes from "8" to "1". Removes the debug
/debug/render-bench route added in the previous commit.
Why: on the Space's cpu-upgrade tier today, a single headless-
Chromium render takes ~7s; five in parallel collapse the host and
each individual render blows past its 120s subprocess timeout.
Same renderer, same fixtures, same Playwright version run on a
laptop in ~5s per render with no slowdown under 5-way parallelism.
The slow path is HF's shared host, not our renderer.
Sequential eval (one fixture at a time, one render at a time) gives
each Chromium the whole box. Five fixtures finish in ~30-60s wall
time, far under the 15-minute outer EVAL_TIMEOUT_SECONDS budget.
Tradeoff: total eval wall time grows linearly with fixture count.
For today's five fixtures that's a few seconds slower than the
healthy-host parallel case, and far better than a failed
eval. For the 100-fixture launch this becomes ~10 minutes per
submission, which is the limit of this approach: a real fix at
that scale needs a non-Chromium renderer (PyVista/VTK with off-
screen OpenGL) and/or moving eval off-Space to HF Jobs. Tracked
as a follow-up.
Also revert /debug/render-bench. It served its purpose (confirming
that a single render is fast and that contention is what kills the
pipeline).
|
@@ -421,53 +421,6 @@ app.add_api_route(
|
|
| 421 |
serve_report,
|
| 422 |
methods=["GET"],
|
| 423 |
)
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
def debug_render_bench() -> dict:
|
| 427 |
-
"""One-shot render-timing probe.
|
| 428 |
-
|
| 429 |
-
Sequentially times ``render_step`` on each cadgenbench-data input
|
| 430 |
-
STEP. Reads-only; no side effects. Used to compare per-render
|
| 431 |
-
cost on the Space's container vs. a local reference, when an
|
| 432 |
-
eval has started timing out at the 120s render_step ceiling but
|
| 433 |
-
nothing on the render path changed in our code.
|
| 434 |
-
|
| 435 |
-
Run via:
|
| 436 |
-
curl -H "Authorization: Bearer $HF_TOKEN" \
|
| 437 |
-
https://<space>.hf.space/debug/render-bench
|
| 438 |
-
"""
|
| 439 |
-
import time
|
| 440 |
-
from cadgenbench.common.paths import data_inputs_dir
|
| 441 |
-
from cadgenbench.common.viewer import render_step
|
| 442 |
-
|
| 443 |
-
base = Path(data_inputs_dir())
|
| 444 |
-
results: dict = {}
|
| 445 |
-
for fixture in sorted(p for p in base.iterdir() if p.is_dir()):
|
| 446 |
-
step = fixture / "input.step"
|
| 447 |
-
if not step.exists():
|
| 448 |
-
results[fixture.name] = {"error": "no input.step"}
|
| 449 |
-
continue
|
| 450 |
-
t0 = time.perf_counter()
|
| 451 |
-
try:
|
| 452 |
-
imgs = render_step(str(step), timeout=180)
|
| 453 |
-
dt = time.perf_counter() - t0
|
| 454 |
-
results[fixture.name] = {
|
| 455 |
-
"ok": True, "seconds": round(dt, 2), "views": len(imgs),
|
| 456 |
-
}
|
| 457 |
-
except Exception as e: # noqa: BLE001 - report whatever fails
|
| 458 |
-
dt = time.perf_counter() - t0
|
| 459 |
-
results[fixture.name] = {
|
| 460 |
-
"ok": False, "seconds": round(dt, 2),
|
| 461 |
-
"error": f"{type(e).__name__}: {str(e)[:300]}",
|
| 462 |
-
}
|
| 463 |
-
return results
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
app.add_api_route(
|
| 467 |
-
"/debug/render-bench",
|
| 468 |
-
debug_render_bench,
|
| 469 |
-
methods=["GET"],
|
| 470 |
-
)
|
| 471 |
app = gr.mount_gradio_app(app, blocks, path="/")
|
| 472 |
|
| 473 |
|
|
|
|
| 421 |
serve_report,
|
| 422 |
methods=["GET"],
|
| 423 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
app = gr.mount_gradio_app(app, blocks, path="/")
|
| 425 |
|
| 426 |
|
|
@@ -103,7 +103,14 @@ DATA_REV_SHORT_LEN = 12
|
|
| 103 |
FAILURE_REASON_MAX_CHARS = 200
|
| 104 |
EVAL_TIMEOUT_SECONDS = 15 * 60
|
| 105 |
REPORT_TIMEOUT_SECONDS = 2 * 60
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
SHA256_BLOCK_SIZE = 64 * 1024
|
| 108 |
STUCK_PENDING_THRESHOLD_SECONDS = 30 * 60
|
| 109 |
SUBMITTED_AT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
|
|
|
|
| 103 |
FAILURE_REASON_MAX_CHARS = 200
|
| 104 |
EVAL_TIMEOUT_SECONDS = 15 * 60
|
| 105 |
REPORT_TIMEOUT_SECONDS = 2 * 60
|
| 106 |
+
# Per-fixture eval workers. Was "8" (one Python worker per fixture,
|
| 107 |
+
# each spawning its own headless-Chromium render subprocess in
|
| 108 |
+
# parallel). Concurrent rendering on the Space's cpu-upgrade tier
|
| 109 |
+
# oversubscribes the host: 5 simultaneous Chromiums turn 7s renders
|
| 110 |
+
# into 120s+ timeouts. Sequential ("1") gives each render the box
|
| 111 |
+
# to itself; 5 fixtures finish in ~30-60s wall time. Tracked as a
|
| 112 |
+
# follow-up to move off Chromium-based rendering for scale.
|
| 113 |
+
EVAL_WORKER_COUNT = "1"
|
| 114 |
SHA256_BLOCK_SIZE = 64 * 1024
|
| 115 |
STUCK_PENDING_THRESHOLD_SECONDS = 30 * 60
|
| 116 |
SUBMITTED_AT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
|