Michael Rabinovich Cursor commited on
Commit
b224eee
·
1 Parent(s): dc8ff2a

submit: retry Hub commits on 429/5xx + persistent status panel

Browse files

Wrap every Space-side commit (zip upload, results.jsonl RMW, report +
gallery uploads, shard cleanup) in a backoff-with-jitter retry that
honors Retry-After, so a rate-limited submissions repo delays rather
than fails a submit. Make handle_submit a generator streaming a
durable status panel (validating -> uploading -> queued, or the
rejection reason) instead of relying on transient toasts.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (3) hide show
  1. app.py +17 -5
  2. submit.py +186 -74
  3. tests/test_submit.py +82 -0
app.py CHANGED
@@ -123,6 +123,10 @@ VALIDATION_GUIDELINES_MD = f"""Submissions appear on the **Unvalidated** table t
123
  Full policy: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL})."""
124
 
125
  DETAIL_PLACEHOLDER = "_Click a row above for details._"
 
 
 
 
126
 
127
 
128
  def _has(value) -> bool:
@@ -784,11 +788,19 @@ to publish the resulting row on the public leaderboard.
784
  # Starts disabled; the `blocks.load` handler below flips it
785
  # to interactive when an OAuthProfile is present.
786
  submit_btn = gr.Button("Submit", variant="primary", interactive=False)
787
- # No static markdown output: handle_submit surfaces every
788
- # status update via gr.Info / gr.Error toasts. The handler
789
- # also reads `gr.OAuthProfile` implicitly via its parameter
790
- # type annotation (Gradio's dependency-injection convention).
791
- submit_btn.click(fn=handle_submit, inputs=[zip_in])
 
 
 
 
 
 
 
 
792
 
793
  with gr.Tab("About"):
794
  gr.Markdown(ABOUT_MD)
 
123
  Full policy: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL})."""
124
 
125
  DETAIL_PLACEHOLDER = "_Click a row above for details._"
126
+ SUBMIT_STATUS_IDLE = (
127
+ "_Log in, attach a zip, and click **Submit**. Progress and any "
128
+ "errors appear here._"
129
+ )
130
 
131
 
132
  def _has(value) -> bool:
 
788
  # Starts disabled; the `blocks.load` handler below flips it
789
  # to interactive when an OAuthProfile is present.
790
  submit_btn = gr.Button("Submit", variant="primary", interactive=False)
791
+ # Persistent status panel. handle_submit is a generator that
792
+ # streams stage updates (validating -> uploading/queuing ->
793
+ # queued) and any rejection reason here, so the outcome
794
+ # survives instead of vanishing with a transient toast. The
795
+ # handler also reads `gr.OAuthProfile` implicitly via its
796
+ # parameter type annotation (Gradio's dependency-injection
797
+ # convention).
798
+ submit_status = gr.Markdown(value=SUBMIT_STATUS_IDLE)
799
+ submit_btn.click(
800
+ fn=handle_submit,
801
+ inputs=[zip_in],
802
+ outputs=[submit_status],
803
+ )
804
 
805
  with gr.Tab("About"):
806
  gr.Markdown(ABOUT_MD)
submit.py CHANGED
@@ -88,6 +88,7 @@ import hashlib
88
  import json
89
  import logging
90
  import os
 
91
  import re
92
  import shutil
93
  import tempfile
@@ -111,7 +112,7 @@ from huggingface_hub import (
111
  run_job,
112
  snapshot_download,
113
  )
114
- from huggingface_hub.errors import EntryNotFoundError
115
 
116
  from leaderboard import HF_DATA_REPO, HF_ORG, HF_SUBMISSIONS_REPO
117
 
@@ -138,6 +139,18 @@ GALLERY_THUMB_VIEW = "iso"
138
  DATA_REV_SHORT_LEN = 12
139
  FAILURE_REASON_MAX_CHARS = 200
140
  SHA256_BLOCK_SIZE = 64 * 1024
 
 
 
 
 
 
 
 
 
 
 
 
141
  STUCK_PENDING_THRESHOLD_SECONDS = 30 * 60
142
  SUBMITTED_AT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
143
  STUCK_PENDING_REASON = "evaluation interrupted by Space restart"
@@ -214,49 +227,109 @@ class _HubWriteError(Exception):
214
  """Raised when a Hub upload fails after validation succeeded."""
215
 
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  def handle_submit(
218
  zip_file,
219
  profile: gr.OAuthProfile | None,
220
- ) -> None:
221
- """Validate a submission upload; surface progress + outcome via toasts.
222
 
223
  Requires the user to be logged in via ``gr.LoginButton`` so the
224
  row's ``hf_username`` is the canonical HF identity (not a
225
  free-text claim). The submit button in :mod:`app` is disabled
226
- until login completes; this function also raises ``gr.Error``
227
- defensively if it's called without a profile so a UI mishap
228
- can't write an anonymous row.
229
-
230
- Side-effect-only (returns ``None``): in-flight status surfaces via
231
- ``gr.Info`` toasts (validating on click, queued after the worker
232
- spawns) and ``gr.Error`` toasts for rejections. Rejection paths
233
- raise ``gr.Error``, which Gradio surfaces as a red toast and
234
- aborts the handler; the outer ``try/finally`` still runs to clean
235
- up the temp dir.
236
-
237
- Toasts on the happy path:
238
-
239
- 1. ``"Validating submission..."`` on click, with a wall-time hint
240
- so the user knows the post-queue eval takes ~1 minute.
241
- 2. ``"Submission <id> queued ..."`` once the row + zip are on the
242
- Hub and the worker has been spawned.
243
-
244
- On rejection (login-missing, form-level, validation gate, dedup,
245
- or Hub write), a single ``gr.Error`` toast carries the message;
246
- no second toast.
247
  """
248
  if profile is None:
249
- raise gr.Error("Please log in via the HF button before submitting.")
 
 
250
  form_err = _validate_form(zip_file)
251
  if form_err is not None:
 
252
  raise gr.Error(form_err)
253
 
254
- # Immediate ack on click. The eval itself (after queuing) is the
255
- # slow part, so hint at the wall time up front so the user knows
256
- # what to expect instead of waiting on a silent button.
257
- gr.Info(
258
- "Validating submission... evaluation usually takes "
259
- "~1 minute after queuing."
260
  )
261
 
262
  zip_path = Path(zip_file.name)
@@ -276,7 +349,9 @@ def handle_submit(
276
  fixture_names = _validate_fixture_set(run_dir)
277
  _validate_steps_parseable(run_dir, fixture_names)
278
  except _ValidationError as e:
279
- raise gr.Error(f"Submission rejected: {e}")
 
 
280
 
281
  # Dedup gate: hash the raw zip bytes and reject if an existing
282
  # row carries the same hash. Runs after validation so a clearly
@@ -284,15 +359,23 @@ def handle_submit(
284
  zip_sha256 = _compute_sha256(zip_path)
285
  existing_id = _find_existing_submission_by_sha256(zip_sha256)
286
  if existing_id is not None:
287
- raise gr.Error(
288
  f"This zip's contents are identical to an existing "
289
  f"submission ({existing_id}). Resubmit only after changing "
290
  f"at least one byte of the upload."
291
  )
 
 
292
 
293
  submission_id = _mint_submission_id(
294
  meta["submitter_name"], meta["submission_name"]
295
  )
 
 
 
 
 
 
296
  try:
297
  blob_url = _upload_submission_zip(submission_id, zip_path)
298
  row = _build_pending_row(
@@ -301,14 +384,17 @@ def handle_submit(
301
  )
302
  _append_pending_row(row)
303
  except _HubWriteError as e:
304
- raise gr.Error(f"Submission rejected: {e}")
 
 
305
 
306
  _spawn_worker(submission_id, blob_url, sorted(fixture_names))
307
- gr.Info(
308
- f"Submission {submission_id} queued for evaluation "
309
- f"({len(fixture_names)} fixtures). The eval runs on an "
310
- f"HF Jobs GPU; the row flips to completed when the job "
311
- f"finishes (typically 1-3 minutes)."
 
312
  )
313
  finally:
314
  shutil.rmtree(tmp, ignore_errors=True)
@@ -509,12 +595,15 @@ def _upload_submission_zip(submission_id: str, zip_path: Path) -> str:
509
  """
510
  repo_path = f"{SUBMISSIONS_DIR}/{submission_id}.zip"
511
  try:
512
- _HF_API.upload_file(
513
- path_or_fileobj=str(zip_path),
514
- path_in_repo=repo_path,
515
- repo_id=HF_SUBMISSIONS_REPO,
516
- repo_type="dataset",
517
- commit_message=f"add submission zip for {submission_id}",
 
 
 
518
  )
519
  except Exception as e: # noqa: BLE001 - Hub API surface is broad
520
  logger.exception("Failed to upload submission zip %s", submission_id)
@@ -667,8 +756,14 @@ def _hub_rmw_results(
667
  The lock is held only for the read-modify-write cycle (~1-2s),
668
  never for eval time. Concurrent submitters serialise here, not
669
  in the eval pipeline. Treats a missing file as the empty list.
 
 
 
 
 
 
670
  """
671
- with _HUB_LOCK:
672
  existing = _download_results_jsonl()
673
  rows: list[dict[str, Any]] = [
674
  json.loads(line) for line in existing.splitlines() if line.strip()
@@ -687,6 +782,9 @@ def _hub_rmw_results(
687
  commit_message=commit_message,
688
  )
689
 
 
 
 
690
 
691
  def _download_results_jsonl() -> str:
692
  """Fetch the current ``results.jsonl`` body as text, or ``""`` if absent."""
@@ -1239,21 +1337,27 @@ def _upload_reports(
1239
  artifacts land at the exact paths the leaderboard + the row-flip
1240
  expect. Uses the process ``HfApi`` (Space HF_TOKEN env).
1241
  """
1242
- _HF_API.upload_file(
1243
- path_or_fileobj=str(html_path),
1244
- path_in_repo=f"{REPORTS_DIR}/{submission_id}.html",
1245
- repo_id=HF_SUBMISSIONS_REPO,
1246
- repo_type="dataset",
1247
- commit_message=f"add merged HTML report for {submission_id}",
 
 
 
1248
  )
1249
- _HF_API.upload_file(
1250
- path_or_fileobj=json.dumps(
1251
- report_json, ensure_ascii=False, indent=2,
1252
- ).encode("utf-8"),
1253
- path_in_repo=f"{REPORTS_DIR}/{submission_id}.json",
1254
- repo_id=HF_SUBMISSIONS_REPO,
1255
- repo_type="dataset",
1256
- commit_message=f"add merged JSON report for {submission_id}",
 
 
 
1257
  )
1258
  logger.info("Uploaded merged reports/%s.{html,json}", submission_id)
1259
 
@@ -1280,16 +1384,21 @@ def _upload_gallery_renders_from_dir(
1280
  logger.info("No gallery renders to upload for %s", submission_id)
1281
  return
1282
  for iso_png, fixture_name in staged:
1283
- _HF_API.upload_file(
1284
- path_or_fileobj=str(iso_png),
1285
- path_in_repo=(
1286
- f"{RENDERS_DIR}/{submission_id}/{fixture_name}.png"
1287
- ),
1288
- repo_id=HF_SUBMISSIONS_REPO,
1289
- repo_type="dataset",
1290
- commit_message=(
1291
- f"add gallery render {fixture_name} for {submission_id}"
 
 
 
 
1292
  ),
 
1293
  )
1294
  logger.info(
1295
  "Uploaded %d gallery render(s) under %s/%s/",
@@ -1306,11 +1415,14 @@ def _cleanup_shard_artifacts(submission_id: str) -> None:
1306
  submission.
1307
  """
1308
  try:
1309
- _HF_API.delete_folder(
1310
- path_in_repo=f"{REPORTS_DIR}/{submission_id}/{SHARDS_SUBDIR}",
1311
- repo_id=HF_SUBMISSIONS_REPO,
1312
- repo_type="dataset",
1313
- commit_message=f"clean up eval shards for {submission_id}",
 
 
 
1314
  )
1315
  logger.info("Cleaned up shard artifacts for %s", submission_id)
1316
  except Exception as e: # noqa: BLE001 - cleanup is best-effort
 
88
  import json
89
  import logging
90
  import os
91
+ import random
92
  import re
93
  import shutil
94
  import tempfile
 
112
  run_job,
113
  snapshot_download,
114
  )
115
+ from huggingface_hub.errors import EntryNotFoundError, HfHubHTTPError
116
 
117
  from leaderboard import HF_DATA_REPO, HF_ORG, HF_SUBMISSIONS_REPO
118
 
 
139
  DATA_REV_SHORT_LEN = 12
140
  FAILURE_REASON_MAX_CHARS = 200
141
  SHA256_BLOCK_SIZE = 64 * 1024
142
+
143
+ # Hub-write retry policy. HF rate-limits commits per repo, so a burst
144
+ # of concurrent commits (e.g. bulk baseline seeding writing into this
145
+ # same dataset) can 429 an otherwise-valid submit at the commit step.
146
+ # Retry transient statuses with exponential backoff + jitter, honoring
147
+ # a server Retry-After, up to a wall-clock cap. Applied to every
148
+ # Space-side commit so a busy repo delays a submit instead of failing
149
+ # it.
150
+ _HUB_RETRY_STATUSES = frozenset({429, 500, 502, 503, 504})
151
+ HUB_RETRY_MAX_SECONDS = 120
152
+ HUB_RETRY_BASE_DELAY_SECONDS = 2.0
153
+ HUB_RETRY_MAX_DELAY_SECONDS = 20.0
154
  STUCK_PENDING_THRESHOLD_SECONDS = 30 * 60
155
  SUBMITTED_AT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
156
  STUCK_PENDING_REASON = "evaluation interrupted by Space restart"
 
227
  """Raised when a Hub upload fails after validation succeeded."""
228
 
229
 
230
+ def _retry_after_seconds(error: HfHubHTTPError) -> float | None:
231
+ """Parse a ``Retry-After`` header (seconds form) off a Hub error, if any."""
232
+ response = getattr(error, "response", None)
233
+ if response is None:
234
+ return None
235
+ raw = response.headers.get("Retry-After")
236
+ if not raw:
237
+ return None
238
+ try:
239
+ return float(raw)
240
+ except (TypeError, ValueError):
241
+ return None
242
+
243
+
244
+ def _with_hub_retries(fn, *, what: str):
245
+ """Run *fn* (a Hub commit) retrying transient HTTP errors with backoff.
246
+
247
+ Retries only the statuses in :data:`_HUB_RETRY_STATUSES` (rate
248
+ limits + transient 5xx); any other error (auth, validation, a
249
+ ``LookupError`` from a mutate closure) propagates immediately.
250
+ Backoff is exponential with jitter, clamped to
251
+ :data:`HUB_RETRY_MAX_DELAY_SECONDS`, never sleeps past the
252
+ :data:`HUB_RETRY_MAX_SECONDS` wall cap, and honors a server
253
+ ``Retry-After`` when present. *fn* must be idempotent across calls
254
+ -- every caller here re-reads the remote state inside *fn* before
255
+ committing, so a retried commit can't double-apply.
256
+ """
257
+ deadline = time.monotonic() + HUB_RETRY_MAX_SECONDS
258
+ attempt = 0
259
+ while True:
260
+ attempt += 1
261
+ try:
262
+ return fn()
263
+ except HfHubHTTPError as e:
264
+ status = getattr(getattr(e, "response", None), "status_code", None)
265
+ now = time.monotonic()
266
+ if status not in _HUB_RETRY_STATUSES or now >= deadline:
267
+ raise
268
+ delay = min(
269
+ HUB_RETRY_MAX_DELAY_SECONDS,
270
+ HUB_RETRY_BASE_DELAY_SECONDS * (2 ** (attempt - 1)),
271
+ )
272
+ delay = delay / 2 + random.uniform(0, delay / 2)
273
+ retry_after = _retry_after_seconds(e)
274
+ if retry_after is not None:
275
+ delay = max(delay, retry_after)
276
+ delay = min(delay, max(0.0, deadline - now))
277
+ logger.warning(
278
+ "Hub %s got HTTP %s; retry %d in %.1fs (cap %ds)",
279
+ what, status, attempt, delay, HUB_RETRY_MAX_SECONDS,
280
+ )
281
+ time.sleep(delay)
282
+
283
+
284
+ def _submit_status(state: str, message: str) -> str:
285
+ """Markdown for the persistent submit-status panel.
286
+
287
+ The panel is the durable counterpart to the transient ``gr.Info`` /
288
+ ``gr.Error`` toasts: a submitter always sees the current stage and
289
+ any rejection reason without having to catch an ephemeral toast.
290
+ *state* picks the leading glyph (``working`` / ``queued`` /
291
+ ``error``).
292
+ """
293
+ glyph = {"working": "⏳", "queued": "✅", "error": "❌"}.get(state, "•")
294
+ return f"{glyph} {message}"
295
+
296
+
297
  def handle_submit(
298
  zip_file,
299
  profile: gr.OAuthProfile | None,
300
+ ):
301
+ """Validate + queue a submission, streaming status to a panel.
302
 
303
  Requires the user to be logged in via ``gr.LoginButton`` so the
304
  row's ``hf_username`` is the canonical HF identity (not a
305
  free-text claim). The submit button in :mod:`app` is disabled
306
+ until login completes; this generator also rejects defensively if
307
+ it's called without a profile so a UI mishap can't write an
308
+ anonymous row.
309
+
310
+ Generator: each ``yield`` is a Markdown string pushed to a
311
+ persistent status panel, the durable counterpart to the transient
312
+ toasts. Happy-path stages: validating -> uploading/queuing ->
313
+ queued. Every rejection (login-missing, form-level, validation
314
+ gate, dedup, Hub write) yields a final error panel **and** raises
315
+ ``gr.Error`` for a toast; the outer ``try/finally`` still runs to
316
+ clean up the temp dir. The Hub writes inside ride out transient
317
+ rate limits via :func:`_with_hub_retries`, so a busy submissions
318
+ repo delays rather than fails the submit.
 
 
 
 
 
 
 
 
319
  """
320
  if profile is None:
321
+ msg = "Please log in via the HF button before submitting."
322
+ yield _submit_status("error", msg)
323
+ raise gr.Error(msg)
324
  form_err = _validate_form(zip_file)
325
  if form_err is not None:
326
+ yield _submit_status("error", form_err)
327
  raise gr.Error(form_err)
328
 
329
+ yield _submit_status(
330
+ "working",
331
+ "Validating submission (unpacking the zip, checking the fixture set "
332
+ "and STEP files)…",
 
 
333
  )
334
 
335
  zip_path = Path(zip_file.name)
 
349
  fixture_names = _validate_fixture_set(run_dir)
350
  _validate_steps_parseable(run_dir, fixture_names)
351
  except _ValidationError as e:
352
+ msg = f"Submission rejected: {e}"
353
+ yield _submit_status("error", msg)
354
+ raise gr.Error(msg)
355
 
356
  # Dedup gate: hash the raw zip bytes and reject if an existing
357
  # row carries the same hash. Runs after validation so a clearly
 
359
  zip_sha256 = _compute_sha256(zip_path)
360
  existing_id = _find_existing_submission_by_sha256(zip_sha256)
361
  if existing_id is not None:
362
+ msg = (
363
  f"This zip's contents are identical to an existing "
364
  f"submission ({existing_id}). Resubmit only after changing "
365
  f"at least one byte of the upload."
366
  )
367
+ yield _submit_status("error", msg)
368
+ raise gr.Error(msg)
369
 
370
  submission_id = _mint_submission_id(
371
  meta["submitter_name"], meta["submission_name"]
372
  )
373
+ yield _submit_status(
374
+ "working",
375
+ f"Uploading `{submission_id}` ({len(fixture_names)} fixtures) and "
376
+ f"queuing the evaluation… (this can take a moment, and retries "
377
+ f"automatically if the Hub is busy).",
378
+ )
379
  try:
380
  blob_url = _upload_submission_zip(submission_id, zip_path)
381
  row = _build_pending_row(
 
384
  )
385
  _append_pending_row(row)
386
  except _HubWriteError as e:
387
+ msg = f"Submission rejected: {e}"
388
+ yield _submit_status("error", msg)
389
+ raise gr.Error(msg)
390
 
391
  _spawn_worker(submission_id, blob_url, sorted(fixture_names))
392
+ yield _submit_status(
393
+ "queued",
394
+ f"Submission `{submission_id}` queued ({len(fixture_names)} "
395
+ f"fixtures). The eval runs on an HF Jobs GPU; your row appears on "
396
+ f"the **Unvalidated** leaderboard and flips to completed when the "
397
+ f"job finishes (typically 1–3 minutes).",
398
  )
399
  finally:
400
  shutil.rmtree(tmp, ignore_errors=True)
 
595
  """
596
  repo_path = f"{SUBMISSIONS_DIR}/{submission_id}.zip"
597
  try:
598
+ _with_hub_retries(
599
+ lambda: _HF_API.upload_file(
600
+ path_or_fileobj=str(zip_path),
601
+ path_in_repo=repo_path,
602
+ repo_id=HF_SUBMISSIONS_REPO,
603
+ repo_type="dataset",
604
+ commit_message=f"add submission zip for {submission_id}",
605
+ ),
606
+ what="submission-zip upload",
607
  )
608
  except Exception as e: # noqa: BLE001 - Hub API surface is broad
609
  logger.exception("Failed to upload submission zip %s", submission_id)
 
756
  The lock is held only for the read-modify-write cycle (~1-2s),
757
  never for eval time. Concurrent submitters serialise here, not
758
  in the eval pipeline. Treats a missing file as the empty list.
759
+
760
+ The whole download->mutate->upload cycle is retried as a unit on a
761
+ transient Hub error (:func:`_with_hub_retries`): re-reading the
762
+ current file each attempt keeps the mutation idempotent and also
763
+ folds in any concurrent change, so a rate-limited commit waits and
764
+ retries instead of failing the caller.
765
  """
766
+ def _download_mutate_upload() -> None:
767
  existing = _download_results_jsonl()
768
  rows: list[dict[str, Any]] = [
769
  json.loads(line) for line in existing.splitlines() if line.strip()
 
782
  commit_message=commit_message,
783
  )
784
 
785
+ with _HUB_LOCK:
786
+ _with_hub_retries(_download_mutate_upload, what="results.jsonl write")
787
+
788
 
789
  def _download_results_jsonl() -> str:
790
  """Fetch the current ``results.jsonl`` body as text, or ``""`` if absent."""
 
1337
  artifacts land at the exact paths the leaderboard + the row-flip
1338
  expect. Uses the process ``HfApi`` (Space HF_TOKEN env).
1339
  """
1340
+ _with_hub_retries(
1341
+ lambda: _HF_API.upload_file(
1342
+ path_or_fileobj=str(html_path),
1343
+ path_in_repo=f"{REPORTS_DIR}/{submission_id}.html",
1344
+ repo_id=HF_SUBMISSIONS_REPO,
1345
+ repo_type="dataset",
1346
+ commit_message=f"add merged HTML report for {submission_id}",
1347
+ ),
1348
+ what="report-html upload",
1349
  )
1350
+ _with_hub_retries(
1351
+ lambda: _HF_API.upload_file(
1352
+ path_or_fileobj=json.dumps(
1353
+ report_json, ensure_ascii=False, indent=2,
1354
+ ).encode("utf-8"),
1355
+ path_in_repo=f"{REPORTS_DIR}/{submission_id}.json",
1356
+ repo_id=HF_SUBMISSIONS_REPO,
1357
+ repo_type="dataset",
1358
+ commit_message=f"add merged JSON report for {submission_id}",
1359
+ ),
1360
+ what="report-json upload",
1361
  )
1362
  logger.info("Uploaded merged reports/%s.{html,json}", submission_id)
1363
 
 
1384
  logger.info("No gallery renders to upload for %s", submission_id)
1385
  return
1386
  for iso_png, fixture_name in staged:
1387
+ _with_hub_retries(
1388
+ lambda iso_png=iso_png, fixture_name=fixture_name: (
1389
+ _HF_API.upload_file(
1390
+ path_or_fileobj=str(iso_png),
1391
+ path_in_repo=(
1392
+ f"{RENDERS_DIR}/{submission_id}/{fixture_name}.png"
1393
+ ),
1394
+ repo_id=HF_SUBMISSIONS_REPO,
1395
+ repo_type="dataset",
1396
+ commit_message=(
1397
+ f"add gallery render {fixture_name} for {submission_id}"
1398
+ ),
1399
+ )
1400
  ),
1401
+ what="gallery-render upload",
1402
  )
1403
  logger.info(
1404
  "Uploaded %d gallery render(s) under %s/%s/",
 
1415
  submission.
1416
  """
1417
  try:
1418
+ _with_hub_retries(
1419
+ lambda: _HF_API.delete_folder(
1420
+ path_in_repo=f"{REPORTS_DIR}/{submission_id}/{SHARDS_SUBDIR}",
1421
+ repo_id=HF_SUBMISSIONS_REPO,
1422
+ repo_type="dataset",
1423
+ commit_message=f"clean up eval shards for {submission_id}",
1424
+ ),
1425
+ what="shard cleanup",
1426
  )
1427
  logger.info("Cleaned up shard artifacts for %s", submission_id)
1428
  except Exception as e: # noqa: BLE001 - cleanup is best-effort
tests/test_submit.py CHANGED
@@ -12,10 +12,92 @@ network traffic.
12
  from __future__ import annotations
13
 
14
  from pathlib import Path
 
 
 
15
 
16
  import submit
17
 
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  def _stub_meta() -> dict:
20
  """Minimum meta.json shape that survives ``_load_and_validate_meta``."""
21
  return {
 
12
  from __future__ import annotations
13
 
14
  from pathlib import Path
15
+ from types import SimpleNamespace
16
+
17
+ import pytest
18
 
19
  import submit
20
 
21
 
22
+ def _hub_http_error(status: int, headers: dict | None = None) -> submit.HfHubHTTPError:
23
+ """An ``HfHubHTTPError`` with a minimal response carrying *status*.
24
+
25
+ Built without going through the real Hub: construct the exception
26
+ with no response, then attach a stand-in so ``_with_hub_retries``
27
+ can read ``response.status_code`` / ``response.headers``.
28
+ """
29
+ err = submit.HfHubHTTPError(f"HTTP {status}")
30
+ err.response = SimpleNamespace(status_code=status, headers=headers or {})
31
+ return err
32
+
33
+
34
+ def test_with_hub_retries_recovers_after_transient(monkeypatch):
35
+ """A 429 (then a 503) is retried; the eventual success is returned."""
36
+ monkeypatch.setattr(submit.time, "sleep", lambda *_: None)
37
+ statuses = iter([429, 503])
38
+ calls = {"n": 0}
39
+
40
+ def flaky():
41
+ calls["n"] += 1
42
+ status = next(statuses, None)
43
+ if status is not None:
44
+ raise _hub_http_error(status)
45
+ return "ok"
46
+
47
+ assert submit._with_hub_retries(flaky, what="test") == "ok"
48
+ assert calls["n"] == 3
49
+
50
+
51
+ def test_with_hub_retries_reraises_non_retryable(monkeypatch):
52
+ """A 403 is not in the retry set, so it propagates on the first try."""
53
+ monkeypatch.setattr(submit.time, "sleep", lambda *_: None)
54
+ calls = {"n": 0}
55
+
56
+ def forbidden():
57
+ calls["n"] += 1
58
+ raise _hub_http_error(403)
59
+
60
+ with pytest.raises(submit.HfHubHTTPError):
61
+ submit._with_hub_retries(forbidden, what="test")
62
+ assert calls["n"] == 1
63
+
64
+
65
+ def test_with_hub_retries_gives_up_after_wall_cap(monkeypatch):
66
+ """Past the wall cap, a persistent 429 stops being retried and raises."""
67
+ monkeypatch.setattr(submit.time, "sleep", lambda *_: None)
68
+ # Force the deadline check to trip after the first failure: the
69
+ # second monotonic() read (post-failure) already exceeds the cap.
70
+ ticks = iter([0.0, submit.HUB_RETRY_MAX_SECONDS + 1])
71
+ monkeypatch.setattr(
72
+ submit.time, "monotonic", lambda: next(ticks, submit.HUB_RETRY_MAX_SECONDS + 1)
73
+ )
74
+ calls = {"n": 0}
75
+
76
+ def always_429():
77
+ calls["n"] += 1
78
+ raise _hub_http_error(429)
79
+
80
+ with pytest.raises(submit.HfHubHTTPError):
81
+ submit._with_hub_retries(always_429, what="test")
82
+ assert calls["n"] == 1
83
+
84
+
85
+ def test_retry_after_header_is_honored(monkeypatch):
86
+ """A ``Retry-After`` seconds value sets the floor for the sleep delay."""
87
+ slept: list[float] = []
88
+ monkeypatch.setattr(submit.time, "sleep", lambda d: slept.append(d))
89
+ calls = {"n": 0}
90
+
91
+ def flaky():
92
+ calls["n"] += 1
93
+ if calls["n"] == 1:
94
+ raise _hub_http_error(429, headers={"Retry-After": "7"})
95
+ return "ok"
96
+
97
+ assert submit._with_hub_retries(flaky, what="test") == "ok"
98
+ assert slept and slept[0] >= 7.0
99
+
100
+
101
  def _stub_meta() -> dict:
102
  """Minimum meta.json shape that survives ``_load_and_validate_meta``."""
103
  return {