submit: reject byte-identical re-submits via SHA-256 dedup
Before: clicking Submit twice (or re-uploading the same zip after
a transient confusion) created two rows pointing at byte-identical
content, ran the eval twice for the same input, and bloated the
submissions/reports blobs.
After: the cheap-sync handler computes sha256 of the uploaded zip
right after validation passes, queries results.jsonl for any row
with the same submission_sha256, and rejects with a pointer to
the existing submission_id if a match is found. Hash also lands
on the new row's submission_sha256 field (schema bump in the
companion commit on the submissions dataset).
Dedup-fetch failures fall back to letting the submit through
rather than rejecting on a Hub blip (logged). Race window: two
near-simultaneous identical submits whose dedup checks both
return None before either appends will land two rows; recoverable
by manual cleanup if it ever happens, not worth tightening before
auth + login throttling lands at pre-launch.
Drive-by: _build_pending_row's unused fixture_names arg (already
noqa'd as "kept for future") gets dropped. Per code-review pass.
|
@@ -57,6 +57,7 @@ Background worker, per submission:
|
|
| 57 |
"""
|
| 58 |
from __future__ import annotations
|
| 59 |
|
|
|
|
| 60 |
import json
|
| 61 |
import logging
|
| 62 |
import os
|
|
@@ -98,6 +99,7 @@ FAILURE_REASON_MAX_CHARS = 200
|
|
| 98 |
EVAL_TIMEOUT_SECONDS = 15 * 60
|
| 99 |
REPORT_TIMEOUT_SECONDS = 2 * 60
|
| 100 |
EVAL_WORKER_COUNT = "8"
|
|
|
|
| 101 |
|
| 102 |
# One HfApi client per process. HF_TOKEN is picked up from the env at
|
| 103 |
# construction time and reused for every call.
|
|
@@ -154,13 +156,25 @@ def handle_submit(zip_file) -> str:
|
|
| 154 |
except _ValidationError as e:
|
| 155 |
return f"**Submission rejected.** {e}"
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
submission_id = _mint_submission_id(
|
| 158 |
meta["submitter_name"], meta["submission_name"]
|
| 159 |
)
|
| 160 |
try:
|
| 161 |
blob_url = _upload_submission_zip(submission_id, zip_path)
|
| 162 |
row = _build_pending_row(
|
| 163 |
-
submission_id, meta,
|
| 164 |
)
|
| 165 |
_append_pending_row(row)
|
| 166 |
except _HubWriteError as e:
|
|
@@ -370,14 +384,15 @@ def _upload_submission_zip(submission_id: str, zip_path: Path) -> str:
|
|
| 370 |
def _build_pending_row(
|
| 371 |
submission_id: str,
|
| 372 |
meta: dict[str, Any],
|
| 373 |
-
fixture_names: set[str], # noqa: ARG001 - kept for future per-fixture pre-fill
|
| 374 |
blob_url: str,
|
|
|
|
| 375 |
) -> dict[str, Any]:
|
| 376 |
"""Construct the JSON row written for a freshly-queued submission.
|
| 377 |
|
| 378 |
Mirrors the pending regime in ``cadgenbench-submissions/schema.md``:
|
| 379 |
-
metadata + ``status: pending`` + ``submission_blob_url``
|
| 380 |
-
score-shaped field is ``null`` until
|
|
|
|
| 381 |
"""
|
| 382 |
return {
|
| 383 |
"submission_id": submission_id,
|
|
@@ -397,9 +412,47 @@ def _build_pending_row(
|
|
| 397 |
"per_fixture_scores": None,
|
| 398 |
"per_fixture_breakdown": None,
|
| 399 |
"submission_blob_url": blob_url,
|
|
|
|
| 400 |
}
|
| 401 |
|
| 402 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
def _append_pending_row(row: dict[str, Any]) -> None:
|
| 404 |
"""Append a pending row to ``results.jsonl`` on the Hub under the lock."""
|
| 405 |
submission_id = row["submission_id"]
|
|
|
|
| 57 |
"""
|
| 58 |
from __future__ import annotations
|
| 59 |
|
| 60 |
+
import hashlib
|
| 61 |
import json
|
| 62 |
import logging
|
| 63 |
import os
|
|
|
|
| 99 |
EVAL_TIMEOUT_SECONDS = 15 * 60
|
| 100 |
REPORT_TIMEOUT_SECONDS = 2 * 60
|
| 101 |
EVAL_WORKER_COUNT = "8"
|
| 102 |
+
SHA256_BLOCK_SIZE = 64 * 1024
|
| 103 |
|
| 104 |
# One HfApi client per process. HF_TOKEN is picked up from the env at
|
| 105 |
# construction time and reused for every call.
|
|
|
|
| 156 |
except _ValidationError as e:
|
| 157 |
return f"**Submission rejected.** {e}"
|
| 158 |
|
| 159 |
+
# Dedup gate: hash the raw zip bytes and reject if an existing
|
| 160 |
+
# row carries the same hash. Runs after validation so a clearly
|
| 161 |
+
# malformed upload still gets the specific validation error.
|
| 162 |
+
zip_sha256 = _compute_sha256(zip_path)
|
| 163 |
+
existing_id = _find_existing_submission_by_sha256(zip_sha256)
|
| 164 |
+
if existing_id is not None:
|
| 165 |
+
return (
|
| 166 |
+
f"**Submission rejected.** This zip's contents are identical "
|
| 167 |
+
f"to an existing submission (`{existing_id}`). Resubmit only "
|
| 168 |
+
f"after changing at least one byte of the upload."
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
submission_id = _mint_submission_id(
|
| 172 |
meta["submitter_name"], meta["submission_name"]
|
| 173 |
)
|
| 174 |
try:
|
| 175 |
blob_url = _upload_submission_zip(submission_id, zip_path)
|
| 176 |
row = _build_pending_row(
|
| 177 |
+
submission_id, meta, blob_url, zip_sha256
|
| 178 |
)
|
| 179 |
_append_pending_row(row)
|
| 180 |
except _HubWriteError as e:
|
|
|
|
| 384 |
def _build_pending_row(
|
| 385 |
submission_id: str,
|
| 386 |
meta: dict[str, Any],
|
|
|
|
| 387 |
blob_url: str,
|
| 388 |
+
submission_sha256: str,
|
| 389 |
) -> dict[str, Any]:
|
| 390 |
"""Construct the JSON row written for a freshly-queued submission.
|
| 391 |
|
| 392 |
Mirrors the pending regime in ``cadgenbench-submissions/schema.md``:
|
| 393 |
+
metadata + ``status: pending`` + ``submission_blob_url`` +
|
| 394 |
+
``submission_sha256``; every score-shaped field is ``null`` until
|
| 395 |
+
the worker flips the row.
|
| 396 |
"""
|
| 397 |
return {
|
| 398 |
"submission_id": submission_id,
|
|
|
|
| 412 |
"per_fixture_scores": None,
|
| 413 |
"per_fixture_breakdown": None,
|
| 414 |
"submission_blob_url": blob_url,
|
| 415 |
+
"submission_sha256": submission_sha256,
|
| 416 |
}
|
| 417 |
|
| 418 |
|
| 419 |
+
def _compute_sha256(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at *path*.

    The file is opened in binary mode and fed to the hasher in
    ``SHA256_BLOCK_SIZE``-byte reads, so the whole upload is never
    held in memory at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(SHA256_BLOCK_SIZE)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
def _find_existing_submission_by_sha256(zip_sha256: str) -> str | None:
    """Return the ``submission_id`` of an existing row with the same hash, else None.

    Reads the current ``results.jsonl`` once (no lock; a worst-case
    race lands a duplicate row, which is recoverable by a cleanup pass
    if it ever happens). Hub-fetch failures are non-fatal: the caller
    just doesn't get the dedup gate this submit (logged).

    Lines that are blank, are not valid JSON, or decode to something
    other than a JSON object are skipped, so one bad line cannot take
    down the submit handler.

    Args:
        zip_sha256: Hex SHA-256 digest of the uploaded zip, compared
            for exact equality with each row's ``submission_sha256``.

    Returns:
        The ``submission_id`` of the first row whose
        ``submission_sha256`` equals *zip_sha256*. ``None`` when no row
        matches, when the fetch failed, or when the matching row has no
        ``submission_id`` key.
    """
    try:
        body = _download_results_jsonl()
    except Exception as e:  # noqa: BLE001 - Hub API surface is broad
        # Deliberate best-effort: a Hub blip must not block a submit.
        logger.warning(
            "Dedup check skipped, Hub fetch failed (%s: %s)",
            type(e).__name__, e,
        )
        return None
    for line in body.splitlines():
        if not line.strip():
            continue
        try:
            row = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(row, dict):
            # Valid JSON but not an object (null, list, number, string):
            # it has no ``.get`` and cannot be a submission row.
            continue
        if row.get("submission_sha256") == zip_sha256:
            return row.get("submission_id")
    return None
|
| 454 |
+
|
| 455 |
+
|
| 456 |
def _append_pending_row(row: dict[str, Any]) -> None:
|
| 457 |
"""Append a pending row to ``results.jsonl`` on the Hub under the lock."""
|
| 458 |
submission_id = row["submission_id"]
|