Michael Rabinovich Cursor committed on
Commit
be6fa3d
·
1 Parent(s): 335cdfa

Rename fixture-named result keys to sample-named keys

Browse files

Read/write per_sample_scores, per_sample_breakdown, n_samples,
per_sample_results across submit/gallery/admin and tools. Also link the
canonical submission docs from the Submit tab.

Co-authored-by: Cursor <cursoragent@cursor.com>

admin.py CHANGED
@@ -78,8 +78,8 @@ _RESCORE_CLEARED_SCORE_FIELDS: tuple[str, ...] = (
78
  "validity_rate",
79
  "score_by_task_type",
80
  "per_task_scores",
81
- "per_fixture_scores",
82
- "per_fixture_breakdown",
83
  )
84
 
85
  # Gap between successive worker dispatches in a bulk rescore. Each
 
78
  "validity_rate",
79
  "score_by_task_type",
80
  "per_task_scores",
81
+ "per_sample_scores",
82
+ "per_sample_breakdown",
83
  )
84
 
85
  # Gap between successive worker dispatches in a bulk rescore. Each
app.py CHANGED
@@ -92,6 +92,13 @@ logging.basicConfig(
92
  VALIDATION_DOC_URL = (
93
  "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/validation.md"
94
  )
 
 
 
 
 
 
 
95
 
96
  ABOUT_MD = f"""## About
97
 
@@ -1118,6 +1125,10 @@ not in the main leaderboard table.
1118
 
1119
  **Consent.** `"agree_to_publish": true` in `meta.json` is your consent
1120
  to publish the resulting row on the public leaderboard.
 
 
 
 
1121
  """
1122
  )
1123
  # OAuth gate. The user must log in via the HF button before
 
92
  VALIDATION_DOC_URL = (
93
  "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/validation.md"
94
  )
95
+ # Canonical submission contract (output layout, validity gate, canonical
96
+ # pose, local self-check). Linked from the Submit tab so the tab itself
97
+ # stays a short "how to package + upload" note rather than re-documenting
98
+ # the full contract.
99
+ SUBMISSION_DOC_URL = (
100
+ "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/submission.md"
101
+ )
102
 
103
  ABOUT_MD = f"""## About
104
 
 
1125
 
1126
  **Consent.** `"agree_to_publish": true` in `meta.json` is your consent
1127
  to publish the resulting row on the public leaderboard.
1128
+
1129
+ For the full submission contract (output format, validity gate, canonical
1130
+ pose, and a local self-check), see
1131
+ [`docs/benchmark/submission.md`]({SUBMISSION_DOC_URL}).
1132
  """
1133
  )
1134
  # OAuth gate. The user must log in via the HF button before
gallery.py CHANGED
@@ -115,7 +115,7 @@ def _sub_payload(row: dict, fixture_ids: list[str], render_resolver, diff_resolv
115
  grid never changes the modal.
116
  """
117
  by_task = row.get("score_by_task_type") or {}
118
- pfs = row.get("per_fixture_scores") or {}
119
  sid = row.get("submission_id") or ""
120
  cells: dict[str, dict] = {}
121
  for fid in fixture_ids:
 
115
  grid never changes the modal.
116
  """
117
  by_task = row.get("score_by_task_type") or {}
118
+ pfs = row.get("per_sample_scores") or {}
119
  sid = row.get("submission_id") or ""
120
  cells: dict[str, dict] = {}
121
  for fid in fixture_ids:
submit.py CHANGED
@@ -827,8 +827,8 @@ def _build_pending_row(
827
  "aggregate_score": None,
828
  "score_by_task_type": None,
829
  "per_task_scores": None,
830
- "per_fixture_scores": None,
831
- "per_fixture_breakdown": None,
832
  "submission_blob_url": blob_url,
833
  "submission_sha256": submission_sha256,
834
  "validation_status": "unvalidated",
@@ -1629,7 +1629,7 @@ def _build_report_json(run_dir: Path) -> dict[str, Any]:
1629
  per_fixture[fixture_dir.name] = json.loads(
1630
  rp.read_text(encoding="utf-8")
1631
  )
1632
- return {"run_summary": summary, "per_fixture_results": per_fixture}
1633
 
1634
 
1635
  def _publish_reports_and_gallery(
@@ -1725,7 +1725,7 @@ def _flip_row_to_completed(submission_id: str, summary: dict[str, Any]) -> None:
1725
  "validity_rate": summary.get("validity_rate"),
1726
  "score_by_task_type": summary.get("score_by_task_type"),
1727
  "per_task_scores": summary.get("per_task_scores"),
1728
- "per_fixture_scores": summary.get("per_fixture_scores"),
1729
  }
1730
  _update_row(submission_id, updates)
1731
 
 
827
  "aggregate_score": None,
828
  "score_by_task_type": None,
829
  "per_task_scores": None,
830
+ "per_sample_scores": None,
831
+ "per_sample_breakdown": None,
832
  "submission_blob_url": blob_url,
833
  "submission_sha256": submission_sha256,
834
  "validation_status": "unvalidated",
 
1629
  per_fixture[fixture_dir.name] = json.loads(
1630
  rp.read_text(encoding="utf-8")
1631
  )
1632
+ return {"run_summary": summary, "per_sample_results": per_fixture}
1633
 
1634
 
1635
  def _publish_reports_and_gallery(
 
1725
  "validity_rate": summary.get("validity_rate"),
1726
  "score_by_task_type": summary.get("score_by_task_type"),
1727
  "per_task_scores": summary.get("per_task_scores"),
1728
+ "per_sample_scores": summary.get("per_sample_scores"),
1729
  }
1730
  _update_row(submission_id, updates)
1731
 
tests/test_admin.py CHANGED
@@ -312,8 +312,8 @@ RESCORE_ROWS = [
312
  "validity_rate": 1.0,
313
  "score_by_task_type": {"generation": 0.7},
314
  "per_task_scores": {"generation": {"score": 0.7}},
315
- "per_fixture_scores": {"f1": {"cad_score": 0.7}},
316
- "per_fixture_breakdown": {"f1": {"validity": 1.0}},
317
  },
318
  {
319
  "submission_id": "broke",
 
312
  "validity_rate": 1.0,
313
  "score_by_task_type": {"generation": 0.7},
314
  "per_task_scores": {"generation": {"score": 0.7}},
315
+ "per_sample_scores": {"f1": {"cad_score": 0.7}},
316
+ "per_sample_breakdown": {"f1": {"validity": 1.0}},
317
  },
318
  {
319
  "submission_id": "broke",
tests/test_submit.py CHANGED
@@ -386,8 +386,8 @@ def test_pending_row_preserves_existing_metadata(monkeypatch):
386
  "validity_rate",
387
  "score_by_task_type",
388
  "per_task_scores",
389
- "per_fixture_scores",
390
- "per_fixture_breakdown",
391
  ):
392
  assert row[k] is None
393
 
 
386
  "validity_rate",
387
  "score_by_task_type",
388
  "per_task_scores",
389
+ "per_sample_scores",
390
+ "per_sample_breakdown",
391
  ):
392
  assert row[k] is None
393
 
tools/pick_gallery_fixtures.py CHANGED
@@ -95,7 +95,7 @@ def main() -> int:
95
  print(f"Reference: {ref.get('submission_name')!r} [{ref.get('submission_id')}]\n")
96
 
97
  by_task: dict[str, list[tuple[float, str]]] = defaultdict(list)
98
- for fid, fx in (ref.get("per_fixture_scores") or {}).items():
99
  fx = fx or {}
100
  if fx.get("status") == "valid" and fx.get("cad_score") is not None:
101
  by_task[fx.get("task_type") or "?"].append((float(fx["cad_score"]), fid))
 
95
  print(f"Reference: {ref.get('submission_name')!r} [{ref.get('submission_id')}]\n")
96
 
97
  by_task: dict[str, list[tuple[float, str]]] = defaultdict(list)
98
+ for fid, fx in (ref.get("per_sample_scores") or {}).items():
99
  fx = fx or {}
100
  if fx.get("status") == "valid" and fx.get("cad_score") is not None:
101
  by_task[fx.get("task_type") or "?"].append((float(fx["cad_score"]), fid))