Michael Rabinovich committed on
Commit
3237736
·
1 Parent(s): 76f0611

submit: reject byte-identical re-submits via SHA-256 dedup

Browse files

Before: clicking Submit twice (or re-uploading the same zip after
a transient confusion) created two rows pointing at byte-identical
content, ran the eval twice for the same input, and bloated the
submissions/reports blobs.

After: the cheap-sync handler computes sha256 of the uploaded zip
right after validation passes, queries results.jsonl for any row
with the same submission_sha256, and rejects with a pointer to
the existing submission_id if a match is found. Hash also lands
on the new row's submission_sha256 field (schema bump in the
companion commit on the submissions dataset).

If the dedup fetch fails, the failure is logged and the submit is
let through rather than rejected on a Hub blip. Race window: two
near-simultaneous identical submits whose dedup checks both
return None before either appends will land two rows. That is
recoverable by manual cleanup if it ever happens, and is not worth
tightening before auth + login throttling land at pre-launch.

Drive-by: _build_pending_row's unused fixture_names arg (already
noqa'd as "kept for future") gets dropped. Per code-review pass.

Files changed (1) hide show
  1. submit.py +57 -4
submit.py CHANGED
@@ -57,6 +57,7 @@ Background worker, per submission:
57
  """
58
  from __future__ import annotations
59
 
 
60
  import json
61
  import logging
62
  import os
@@ -98,6 +99,7 @@ FAILURE_REASON_MAX_CHARS = 200
98
  EVAL_TIMEOUT_SECONDS = 15 * 60
99
  REPORT_TIMEOUT_SECONDS = 2 * 60
100
  EVAL_WORKER_COUNT = "8"
 
101
 
102
  # One HfApi client per process. HF_TOKEN is picked up from the env at
103
  # construction time and reused for every call.
@@ -154,13 +156,25 @@ def handle_submit(zip_file) -> str:
154
  except _ValidationError as e:
155
  return f"**Submission rejected.** {e}"
156
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  submission_id = _mint_submission_id(
158
  meta["submitter_name"], meta["submission_name"]
159
  )
160
  try:
161
  blob_url = _upload_submission_zip(submission_id, zip_path)
162
  row = _build_pending_row(
163
- submission_id, meta, fixture_names, blob_url
164
  )
165
  _append_pending_row(row)
166
  except _HubWriteError as e:
@@ -370,14 +384,15 @@ def _upload_submission_zip(submission_id: str, zip_path: Path) -> str:
370
  def _build_pending_row(
371
  submission_id: str,
372
  meta: dict[str, Any],
373
- fixture_names: set[str], # noqa: ARG001 - kept for future per-fixture pre-fill
374
  blob_url: str,
 
375
  ) -> dict[str, Any]:
376
  """Construct the JSON row written for a freshly-queued submission.
377
 
378
  Mirrors the pending regime in ``cadgenbench-submissions/schema.md``:
379
- metadata + ``status: pending`` + ``submission_blob_url``; every
380
- score-shaped field is ``null`` until the worker flips the row.
 
381
  """
382
  return {
383
  "submission_id": submission_id,
@@ -397,9 +412,47 @@ def _build_pending_row(
397
  "per_fixture_scores": None,
398
  "per_fixture_breakdown": None,
399
  "submission_blob_url": blob_url,
 
400
  }
401
 
402
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  def _append_pending_row(row: dict[str, Any]) -> None:
404
  """Append a pending row to ``results.jsonl`` on the Hub under the lock."""
405
  submission_id = row["submission_id"]
 
57
  """
58
  from __future__ import annotations
59
 
60
+ import hashlib
61
  import json
62
  import logging
63
  import os
 
99
  EVAL_TIMEOUT_SECONDS = 15 * 60
100
  REPORT_TIMEOUT_SECONDS = 2 * 60
101
  EVAL_WORKER_COUNT = "8"
102
+ SHA256_BLOCK_SIZE = 64 * 1024
103
 
104
  # One HfApi client per process. HF_TOKEN is picked up from the env at
105
  # construction time and reused for every call.
 
156
  except _ValidationError as e:
157
  return f"**Submission rejected.** {e}"
158
 
159
+ # Dedup gate: hash the raw zip bytes and reject if an existing
160
+ # row carries the same hash. Runs after validation so a clearly
161
+ # malformed upload still gets the specific validation error.
162
+ zip_sha256 = _compute_sha256(zip_path)
163
+ existing_id = _find_existing_submission_by_sha256(zip_sha256)
164
+ if existing_id is not None:
165
+ return (
166
+ f"**Submission rejected.** This zip's contents are identical "
167
+ f"to an existing submission (`{existing_id}`). Resubmit only "
168
+ f"after changing at least one byte of the upload."
169
+ )
170
+
171
  submission_id = _mint_submission_id(
172
  meta["submitter_name"], meta["submission_name"]
173
  )
174
  try:
175
  blob_url = _upload_submission_zip(submission_id, zip_path)
176
  row = _build_pending_row(
177
+ submission_id, meta, blob_url, zip_sha256
178
  )
179
  _append_pending_row(row)
180
  except _HubWriteError as e:
 
384
  def _build_pending_row(
385
  submission_id: str,
386
  meta: dict[str, Any],
 
387
  blob_url: str,
388
+ submission_sha256: str,
389
  ) -> dict[str, Any]:
390
  """Construct the JSON row written for a freshly-queued submission.
391
 
392
  Mirrors the pending regime in ``cadgenbench-submissions/schema.md``:
393
+ metadata + ``status: pending`` + ``submission_blob_url`` +
394
+ ``submission_sha256``; every score-shaped field is ``null`` until
395
+ the worker flips the row.
396
  """
397
  return {
398
  "submission_id": submission_id,
 
412
  "per_fixture_scores": None,
413
  "per_fixture_breakdown": None,
414
  "submission_blob_url": blob_url,
415
+ "submission_sha256": submission_sha256,
416
  }
417
 
418
 
419
def _compute_sha256(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at *path*.

    The file is opened in binary mode and consumed in pieces of
    ``SHA256_BLOCK_SIZE`` bytes, so the whole file is never held in
    memory at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(SHA256_BLOCK_SIZE)
            if not block:
                # An empty read signals end of file.
                break
            digest.update(block)
    return digest.hexdigest()
426
+
427
+
428
def _find_existing_submission_by_sha256(zip_sha256: str) -> str | None:
    """Return the ``submission_id`` of an existing row with the same hash, else None.

    Reads the current ``results.jsonl`` once via ``_download_results_jsonl``
    (no lock; a worst-case race lands a duplicate row, which is recoverable
    by a cleanup pass if it ever happens). The check is best-effort: if the
    fetch raises, the failure is logged and ``None`` is returned, so the
    caller proceeds without the dedup gate for this submit.

    Blank lines, lines that do not parse as JSON, and JSON values that are
    not objects are skipped rather than allowed to raise. A row whose hash
    matches but which has no non-empty string ``submission_id`` is logged
    and skipped, so a later row with the same hash can still be reported.
    """
    try:
        body = _download_results_jsonl()
    except Exception as e:  # noqa: BLE001 - Hub API surface is broad
        logger.warning(
            "Dedup check skipped, Hub fetch failed (%s: %s)",
            type(e).__name__, e,
        )
        return None
    for line in body.splitlines():
        if not line.strip():
            continue
        try:
            row = json.loads(line)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; catching the
            # base class also covers a decode error on a malformed line.
            continue
        if not isinstance(row, dict):
            # Valid JSON that is not an object (null, list, number, ...)
            # has no fields to compare; calling .get on it would raise.
            continue
        if row.get("submission_sha256") != zip_sha256:
            continue
        existing_id = row.get("submission_id")
        if isinstance(existing_id, str) and existing_id:
            return existing_id
        # Returning a missing id here would look like "no duplicate" to the
        # caller and end the scan early; keep looking for a usable row.
        logger.warning(
            "Dedup check: row with matching sha256 has no usable "
            "submission_id; ignoring it"
        )
    return None
454
+
455
+
456
  def _append_pending_row(row: dict[str, Any]) -> None:
457
  """Append a pending row to ``results.jsonl`` on the Hub under the lock."""
458
  submission_id = row["submission_id"]