Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

Michael Rabinovich Cursor committed on Jun 8

Commit

be6fa3d

1 Parent(s): 335cdfa

Rename fixture-named result keys to sample-named keys

Read/write per_sample_scores, per_sample_breakdown, n_samples,
per_sample_results across submit/gallery/admin and tools. Also link the
canonical submission docs from the Submit tab.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (7) hide show

admin.py +2 -2
app.py +11 -0
gallery.py +1 -1
submit.py +4 -4
tests/test_admin.py +2 -2
tests/test_submit.py +2 -2
tools/pick_gallery_fixtures.py +1 -1

admin.py CHANGED Viewed

@@ -78,8 +78,8 @@ _RESCORE_CLEARED_SCORE_FIELDS: tuple[str, ...] = (
     "validity_rate",
     "score_by_task_type",
     "per_task_scores",
-    "per_fixture_scores",
-    "per_fixture_breakdown",
 )
 # Gap between successive worker dispatches in a bulk rescore. Each

     "validity_rate",
     "score_by_task_type",
     "per_task_scores",
+    "per_sample_scores",
+    "per_sample_breakdown",
 )
 # Gap between successive worker dispatches in a bulk rescore. Each

app.py CHANGED Viewed

@@ -92,6 +92,13 @@ logging.basicConfig(
 VALIDATION_DOC_URL = (
     "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/validation.md"
 )
 ABOUT_MD = f"""## About
@@ -1118,6 +1125,10 @@ not in the main leaderboard table.
 **Consent.** `"agree_to_publish": true` in `meta.json` is your consent
 to publish the resulting row on the public leaderboard.
 """
         )
         # OAuth gate. The user must log in via the HF button before

 VALIDATION_DOC_URL = (
     "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/validation.md"
 )
+# Canonical submission contract (output layout, validity gate, canonical
+# pose, local self-check). Linked from the Submit tab so the tab itself
+# stays a short "how to package + upload" note rather than re-documenting
+# the full contract.
+SUBMISSION_DOC_URL = (
+    "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/submission.md"
+)
 ABOUT_MD = f"""## About
 **Consent.** `"agree_to_publish": true` in `meta.json` is your consent
 to publish the resulting row on the public leaderboard.
+For the full submission contract (output format, validity gate, canonical
+pose, and a local self-check), see
+[`docs/benchmark/submission.md`]({SUBMISSION_DOC_URL}).
 """
         )
         # OAuth gate. The user must log in via the HF button before

gallery.py CHANGED Viewed

@@ -115,7 +115,7 @@ def _sub_payload(row: dict, fixture_ids: list[str], render_resolver, diff_resolv
     grid never changes the modal.
     """
     by_task = row.get("score_by_task_type") or {}
-    pfs = row.get("per_fixture_scores") or {}
     sid = row.get("submission_id") or ""
     cells: dict[str, dict] = {}
     for fid in fixture_ids:

     grid never changes the modal.
     """
     by_task = row.get("score_by_task_type") or {}
+    pfs = row.get("per_sample_scores") or {}
     sid = row.get("submission_id") or ""
     cells: dict[str, dict] = {}
     for fid in fixture_ids:

submit.py CHANGED Viewed

@@ -827,8 +827,8 @@ def _build_pending_row(
         "aggregate_score": None,
         "score_by_task_type": None,
         "per_task_scores": None,
-        "per_fixture_scores": None,
-        "per_fixture_breakdown": None,
         "submission_blob_url": blob_url,
         "submission_sha256": submission_sha256,
         "validation_status": "unvalidated",
@@ -1629,7 +1629,7 @@ def _build_report_json(run_dir: Path) -> dict[str, Any]:
             per_fixture[fixture_dir.name] = json.loads(
                 rp.read_text(encoding="utf-8")
             )
-    return {"run_summary": summary, "per_fixture_results": per_fixture}
 def _publish_reports_and_gallery(
@@ -1725,7 +1725,7 @@ def _flip_row_to_completed(submission_id: str, summary: dict[str, Any]) -> None:
         "validity_rate": summary.get("validity_rate"),
         "score_by_task_type": summary.get("score_by_task_type"),
         "per_task_scores": summary.get("per_task_scores"),
-        "per_fixture_scores": summary.get("per_fixture_scores"),
     }
     _update_row(submission_id, updates)

         "aggregate_score": None,
         "score_by_task_type": None,
         "per_task_scores": None,
+        "per_sample_scores": None,
+        "per_sample_breakdown": None,
         "submission_blob_url": blob_url,
         "submission_sha256": submission_sha256,
         "validation_status": "unvalidated",
             per_fixture[fixture_dir.name] = json.loads(
                 rp.read_text(encoding="utf-8")
             )
+    return {"run_summary": summary, "per_sample_results": per_fixture}
 def _publish_reports_and_gallery(
         "validity_rate": summary.get("validity_rate"),
         "score_by_task_type": summary.get("score_by_task_type"),
         "per_task_scores": summary.get("per_task_scores"),
+        "per_sample_scores": summary.get("per_sample_scores"),
     }
     _update_row(submission_id, updates)

tests/test_admin.py CHANGED Viewed

@@ -312,8 +312,8 @@ RESCORE_ROWS = [
         "validity_rate": 1.0,
         "score_by_task_type": {"generation": 0.7},
         "per_task_scores": {"generation": {"score": 0.7}},
-        "per_fixture_scores": {"f1": {"cad_score": 0.7}},
-        "per_fixture_breakdown": {"f1": {"validity": 1.0}},
     },
     {
         "submission_id": "broke",

         "validity_rate": 1.0,
         "score_by_task_type": {"generation": 0.7},
         "per_task_scores": {"generation": {"score": 0.7}},
+        "per_sample_scores": {"f1": {"cad_score": 0.7}},
+        "per_sample_breakdown": {"f1": {"validity": 1.0}},
     },
     {
         "submission_id": "broke",

tests/test_submit.py CHANGED Viewed

@@ -386,8 +386,8 @@ def test_pending_row_preserves_existing_metadata(monkeypatch):
         "validity_rate",
         "score_by_task_type",
         "per_task_scores",
-        "per_fixture_scores",
-        "per_fixture_breakdown",
     ):
         assert row[k] is None

         "validity_rate",
         "score_by_task_type",
         "per_task_scores",
+        "per_sample_scores",
+        "per_sample_breakdown",
     ):
         assert row[k] is None

tools/pick_gallery_fixtures.py CHANGED Viewed

@@ -95,7 +95,7 @@ def main() -> int:
     print(f"Reference: {ref.get('submission_name')!r} [{ref.get('submission_id')}]\n")
     by_task: dict[str, list[tuple[float, str]]] = defaultdict(list)
-    for fid, fx in (ref.get("per_fixture_scores") or {}).items():
         fx = fx or {}
         if fx.get("status") == "valid" and fx.get("cad_score") is not None:
             by_task[fx.get("task_type") or "?"].append((float(fx["cad_score"]), fid))

     print(f"Reference: {ref.get('submission_name')!r} [{ref.get('submission_id')}]\n")
     by_task: dict[str, list[tuple[float, str]]] = defaultdict(list)
+    for fid, fx in (ref.get("per_sample_scores") or {}).items():
         fx = fx or {}
         if fx.get("status") == "valid" and fx.get("cad_score") is not None:
             by_task[fx.get("task_type") or "?"].append((float(fx["cad_score"]), fid))