Michael Rabinovich Cursor committed on
Commit ·
be6fa3d
1
Parent(s): 335cdfa
Rename fixture-named result keys to sample-named keys
Browse files
Read/write per_sample_scores, per_sample_breakdown, n_samples,
per_sample_results across submit/gallery/admin and tools. Also link the
canonical submission docs from the Submit tab.
Co-authored-by: Cursor <cursoragent@cursor.com>
- admin.py +2 -2
- app.py +11 -0
- gallery.py +1 -1
- submit.py +4 -4
- tests/test_admin.py +2 -2
- tests/test_submit.py +2 -2
- tools/pick_gallery_fixtures.py +1 -1
admin.py
CHANGED
|
@@ -78,8 +78,8 @@ _RESCORE_CLEARED_SCORE_FIELDS: tuple[str, ...] = (
|
|
| 78 |
"validity_rate",
|
| 79 |
"score_by_task_type",
|
| 80 |
"per_task_scores",
|
| 81 |
-
"
|
| 82 |
-
"
|
| 83 |
)
|
| 84 |
|
| 85 |
# Gap between successive worker dispatches in a bulk rescore. Each
|
|
|
|
| 78 |
"validity_rate",
|
| 79 |
"score_by_task_type",
|
| 80 |
"per_task_scores",
|
| 81 |
+
"per_sample_scores",
|
| 82 |
+
"per_sample_breakdown",
|
| 83 |
)
|
| 84 |
|
| 85 |
# Gap between successive worker dispatches in a bulk rescore. Each
|
app.py
CHANGED
|
@@ -92,6 +92,13 @@ logging.basicConfig(
|
|
| 92 |
VALIDATION_DOC_URL = (
|
| 93 |
"https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/validation.md"
|
| 94 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
ABOUT_MD = f"""## About
|
| 97 |
|
|
@@ -1118,6 +1125,10 @@ not in the main leaderboard table.
|
|
| 1118 |
|
| 1119 |
**Consent.** `"agree_to_publish": true` in `meta.json` is your consent
|
| 1120 |
to publish the resulting row on the public leaderboard.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1121 |
"""
|
| 1122 |
)
|
| 1123 |
# OAuth gate. The user must log in via the HF button before
|
|
|
|
| 92 |
VALIDATION_DOC_URL = (
|
| 93 |
"https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/validation.md"
|
| 94 |
)
|
| 95 |
+
# Canonical submission contract (output layout, validity gate, canonical
|
| 96 |
+
# pose, local self-check). Linked from the Submit tab so the tab itself
|
| 97 |
+
# stays a short "how to package + upload" note rather than re-documenting
|
| 98 |
+
# the full contract.
|
| 99 |
+
SUBMISSION_DOC_URL = (
|
| 100 |
+
"https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/submission.md"
|
| 101 |
+
)
|
| 102 |
|
| 103 |
ABOUT_MD = f"""## About
|
| 104 |
|
|
|
|
| 1125 |
|
| 1126 |
**Consent.** `"agree_to_publish": true` in `meta.json` is your consent
|
| 1127 |
to publish the resulting row on the public leaderboard.
|
| 1128 |
+
|
| 1129 |
+
For the full submission contract (output format, validity gate, canonical
|
| 1130 |
+
pose, and a local self-check), see
|
| 1131 |
+
[`docs/benchmark/submission.md`]({SUBMISSION_DOC_URL}).
|
| 1132 |
"""
|
| 1133 |
)
|
| 1134 |
# OAuth gate. The user must log in via the HF button before
|
gallery.py
CHANGED
|
@@ -115,7 +115,7 @@ def _sub_payload(row: dict, fixture_ids: list[str], render_resolver, diff_resolv
|
|
| 115 |
grid never changes the modal.
|
| 116 |
"""
|
| 117 |
by_task = row.get("score_by_task_type") or {}
|
| 118 |
-
pfs = row.get("
|
| 119 |
sid = row.get("submission_id") or ""
|
| 120 |
cells: dict[str, dict] = {}
|
| 121 |
for fid in fixture_ids:
|
|
|
|
| 115 |
grid never changes the modal.
|
| 116 |
"""
|
| 117 |
by_task = row.get("score_by_task_type") or {}
|
| 118 |
+
pfs = row.get("per_sample_scores") or {}
|
| 119 |
sid = row.get("submission_id") or ""
|
| 120 |
cells: dict[str, dict] = {}
|
| 121 |
for fid in fixture_ids:
|
submit.py
CHANGED
|
@@ -827,8 +827,8 @@ def _build_pending_row(
|
|
| 827 |
"aggregate_score": None,
|
| 828 |
"score_by_task_type": None,
|
| 829 |
"per_task_scores": None,
|
| 830 |
-
"
|
| 831 |
-
"
|
| 832 |
"submission_blob_url": blob_url,
|
| 833 |
"submission_sha256": submission_sha256,
|
| 834 |
"validation_status": "unvalidated",
|
|
@@ -1629,7 +1629,7 @@ def _build_report_json(run_dir: Path) -> dict[str, Any]:
|
|
| 1629 |
per_fixture[fixture_dir.name] = json.loads(
|
| 1630 |
rp.read_text(encoding="utf-8")
|
| 1631 |
)
|
| 1632 |
-
return {"run_summary": summary, "
|
| 1633 |
|
| 1634 |
|
| 1635 |
def _publish_reports_and_gallery(
|
|
@@ -1725,7 +1725,7 @@ def _flip_row_to_completed(submission_id: str, summary: dict[str, Any]) -> None:
|
|
| 1725 |
"validity_rate": summary.get("validity_rate"),
|
| 1726 |
"score_by_task_type": summary.get("score_by_task_type"),
|
| 1727 |
"per_task_scores": summary.get("per_task_scores"),
|
| 1728 |
-
"
|
| 1729 |
}
|
| 1730 |
_update_row(submission_id, updates)
|
| 1731 |
|
|
|
|
| 827 |
"aggregate_score": None,
|
| 828 |
"score_by_task_type": None,
|
| 829 |
"per_task_scores": None,
|
| 830 |
+
"per_sample_scores": None,
|
| 831 |
+
"per_sample_breakdown": None,
|
| 832 |
"submission_blob_url": blob_url,
|
| 833 |
"submission_sha256": submission_sha256,
|
| 834 |
"validation_status": "unvalidated",
|
|
|
|
| 1629 |
per_fixture[fixture_dir.name] = json.loads(
|
| 1630 |
rp.read_text(encoding="utf-8")
|
| 1631 |
)
|
| 1632 |
+
return {"run_summary": summary, "per_sample_results": per_fixture}
|
| 1633 |
|
| 1634 |
|
| 1635 |
def _publish_reports_and_gallery(
|
|
|
|
| 1725 |
"validity_rate": summary.get("validity_rate"),
|
| 1726 |
"score_by_task_type": summary.get("score_by_task_type"),
|
| 1727 |
"per_task_scores": summary.get("per_task_scores"),
|
| 1728 |
+
"per_sample_scores": summary.get("per_sample_scores"),
|
| 1729 |
}
|
| 1730 |
_update_row(submission_id, updates)
|
| 1731 |
|
tests/test_admin.py
CHANGED
|
@@ -312,8 +312,8 @@ RESCORE_ROWS = [
|
|
| 312 |
"validity_rate": 1.0,
|
| 313 |
"score_by_task_type": {"generation": 0.7},
|
| 314 |
"per_task_scores": {"generation": {"score": 0.7}},
|
| 315 |
-
"
|
| 316 |
-
"
|
| 317 |
},
|
| 318 |
{
|
| 319 |
"submission_id": "broke",
|
|
|
|
| 312 |
"validity_rate": 1.0,
|
| 313 |
"score_by_task_type": {"generation": 0.7},
|
| 314 |
"per_task_scores": {"generation": {"score": 0.7}},
|
| 315 |
+
"per_sample_scores": {"f1": {"cad_score": 0.7}},
|
| 316 |
+
"per_sample_breakdown": {"f1": {"validity": 1.0}},
|
| 317 |
},
|
| 318 |
{
|
| 319 |
"submission_id": "broke",
|
tests/test_submit.py
CHANGED
|
@@ -386,8 +386,8 @@ def test_pending_row_preserves_existing_metadata(monkeypatch):
|
|
| 386 |
"validity_rate",
|
| 387 |
"score_by_task_type",
|
| 388 |
"per_task_scores",
|
| 389 |
-
"
|
| 390 |
-
"
|
| 391 |
):
|
| 392 |
assert row[k] is None
|
| 393 |
|
|
|
|
| 386 |
"validity_rate",
|
| 387 |
"score_by_task_type",
|
| 388 |
"per_task_scores",
|
| 389 |
+
"per_sample_scores",
|
| 390 |
+
"per_sample_breakdown",
|
| 391 |
):
|
| 392 |
assert row[k] is None
|
| 393 |
|
tools/pick_gallery_fixtures.py
CHANGED
|
@@ -95,7 +95,7 @@ def main() -> int:
|
|
| 95 |
print(f"Reference: {ref.get('submission_name')!r} [{ref.get('submission_id')}]\n")
|
| 96 |
|
| 97 |
by_task: dict[str, list[tuple[float, str]]] = defaultdict(list)
|
| 98 |
-
for fid, fx in (ref.get("
|
| 99 |
fx = fx or {}
|
| 100 |
if fx.get("status") == "valid" and fx.get("cad_score") is not None:
|
| 101 |
by_task[fx.get("task_type") or "?"].append((float(fx["cad_score"]), fid))
|
|
|
|
| 95 |
print(f"Reference: {ref.get('submission_name')!r} [{ref.get('submission_id')}]\n")
|
| 96 |
|
| 97 |
by_task: dict[str, list[tuple[float, str]]] = defaultdict(list)
|
| 98 |
+
for fid, fx in (ref.get("per_sample_scores") or {}).items():
|
| 99 |
fx = fx or {}
|
| 100 |
if fx.get("status") == "valid" and fx.get("cad_score") is not None:
|
| 101 |
by_task[fx.get("task_type") or "?"].append((float(fx["cad_score"]), fid))
|