Michael Rabinovich Cursor committed on
Commit ·
952dbca
1
Parent(s): afed315
schema: rename model -> submission_name; seed with 3 baseline rows
Browse files
Mirrors the cadgenbench-submissions schema change. LEADERBOARD_COLS
exposes submission_name as the lead identity column instead of model;
the handle_submit signature, the Submit-tab form Textbox, and the
meta.json example in the markdown all match.
Submit-tab markdown gets a short "Submission name" explainer making
explicit that the benchmark is system-agnostic - a submission may use
no LLM, one, or several. If the submitter wants to disclose their
stack they put it in submission_name or notes.
Local results.jsonl mirrors the 3 baseline seed rows from
cadgenbench-submissions so the offline fallback shows real data, not
the old fake demos.
Co-authored-by: Cursor <cursoragent@cursor.com>
- app.py +15 -10
- results.jsonl +3 -2
app.py
CHANGED
|
@@ -26,7 +26,7 @@ HF_DATA_REPO = os.getenv("HF_DATA_REPO", f"{HF_ORG}/cadgenbench-data")
|
|
| 26 |
LOCAL_RESULTS_PATH = Path(__file__).parent / "results.jsonl"
|
| 27 |
|
| 28 |
LEADERBOARD_COLS = [
|
| 29 |
-
"model",
|
| 30 |
"submitter_name",
|
| 31 |
"aggregate_score",
|
| 32 |
"validity_rate",
|
|
@@ -85,7 +85,7 @@ def load_leaderboard() -> pd.DataFrame:
|
|
| 85 |
|
| 86 |
def handle_submit(
|
| 87 |
zip_file,
|
| 88 |
-
|
| 89 |
submitter: str,
|
| 90 |
agent_url: str,
|
| 91 |
notes: str,
|
|
@@ -93,8 +93,8 @@ def handle_submit(
|
|
| 93 |
) -> str:
|
| 94 |
if zip_file is None:
|
| 95 |
return "**Error:** please attach a submission zip."
|
| 96 |
-
if not
|
| 97 |
-
return "**Error:** please fill in the
|
| 98 |
if not submitter.strip():
|
| 99 |
return "**Error:** please fill in your Submitter name."
|
| 100 |
if not agree:
|
|
@@ -102,7 +102,7 @@ def handle_submit(
|
|
| 102 |
|
| 103 |
name = Path(zip_file.name).name
|
| 104 |
return (
|
| 105 |
-
f"Received `{name}`
|
| 106 |
f"_Evaluation is not wired yet (Step 6 of the build plan). Once it "
|
| 107 |
f"is, this submission will run the CPU eval inline and append a row "
|
| 108 |
f"to `{HF_SUBMISSIONS_REPO}`._"
|
|
@@ -155,13 +155,18 @@ with gr.Blocks(title="CADGenBench Leaderboard") as app:
|
|
| 155 |
```json
|
| 156 |
{{
|
| 157 |
"submitter_name": "your name or team",
|
| 158 |
-
"
|
| 159 |
"agent_url": "https://github.com/... (optional)",
|
| 160 |
"notes": "free text, optional, max 500 chars, single line, plain text",
|
| 161 |
"agree_to_publish": true
|
| 162 |
}}
|
| 163 |
```
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
**Notes field.** Plain text only (no markdown / HTML). Capped at 500 chars
|
| 166 |
and stripped to a single line. Shown in the per-submission detail view,
|
| 167 |
not in the main leaderboard table.
|
|
@@ -173,9 +178,9 @@ The Space runs the CPU eval inline and appends a row to
|
|
| 173 |
)
|
| 174 |
zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
|
| 175 |
with gr.Row():
|
| 176 |
-
|
| 177 |
-
label="
|
| 178 |
-
placeholder=
|
| 179 |
)
|
| 180 |
submitter_in = gr.Textbox(label="Submitter name")
|
| 181 |
with gr.Row():
|
|
@@ -193,7 +198,7 @@ The Space runs the CPU eval inline and appends a row to
|
|
| 193 |
fn=handle_submit,
|
| 194 |
inputs=[
|
| 195 |
zip_in,
|
| 196 |
-
|
| 197 |
submitter_in,
|
| 198 |
agent_url_in,
|
| 199 |
notes_in,
|
|
|
|
| 26 |
LOCAL_RESULTS_PATH = Path(__file__).parent / "results.jsonl"
|
| 27 |
|
| 28 |
LEADERBOARD_COLS = [
|
| 29 |
+
"submission_name",
|
| 30 |
"submitter_name",
|
| 31 |
"aggregate_score",
|
| 32 |
"validity_rate",
|
|
|
|
| 85 |
|
| 86 |
def handle_submit(
|
| 87 |
zip_file,
|
| 88 |
+
submission_name: str,
|
| 89 |
submitter: str,
|
| 90 |
agent_url: str,
|
| 91 |
notes: str,
|
|
|
|
| 93 |
) -> str:
|
| 94 |
if zip_file is None:
|
| 95 |
return "**Error:** please attach a submission zip."
|
| 96 |
+
if not submission_name.strip():
|
| 97 |
+
return "**Error:** please fill in the Submission name."
|
| 98 |
if not submitter.strip():
|
| 99 |
return "**Error:** please fill in your Submitter name."
|
| 100 |
if not agree:
|
|
|
|
| 102 |
|
| 103 |
name = Path(zip_file.name).name
|
| 104 |
return (
|
| 105 |
+
f"Received `{name}` - submission `{submission_name}` by `{submitter}`.\n\n"
|
| 106 |
f"_Evaluation is not wired yet (Step 6 of the build plan). Once it "
|
| 107 |
f"is, this submission will run the CPU eval inline and append a row "
|
| 108 |
f"to `{HF_SUBMISSIONS_REPO}`._"
|
|
|
|
| 155 |
```json
|
| 156 |
{{
|
| 157 |
"submitter_name": "your name or team",
|
| 158 |
+
"submission_name": "MyAgent v2.3 (or whatever describes your system)",
|
| 159 |
"agent_url": "https://github.com/... (optional)",
|
| 160 |
"notes": "free text, optional, max 500 chars, single line, plain text",
|
| 161 |
"agree_to_publish": true
|
| 162 |
}}
|
| 163 |
```
|
| 164 |
|
| 165 |
+
**Submission name.** Free text describing the system being benchmarked,
|
| 166 |
+
however you choose to describe it. The benchmark is system-agnostic - your
|
| 167 |
+
submission may use no LLM, one, or many. If you want to disclose your
|
| 168 |
+
stack, put it here or in `notes`.
|
| 169 |
+
|
| 170 |
**Notes field.** Plain text only (no markdown / HTML). Capped at 500 chars
|
| 171 |
and stripped to a single line. Shown in the per-submission detail view,
|
| 172 |
not in the main leaderboard table.
|
|
|
|
| 178 |
)
|
| 179 |
zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
|
| 180 |
with gr.Row():
|
| 181 |
+
submission_name_in = gr.Textbox(
|
| 182 |
+
label="Submission name",
|
| 183 |
+
placeholder='e.g. "MyAgent v2.3" or "build123d baseline (Claude Opus 4.7)"',
|
| 184 |
)
|
| 185 |
submitter_in = gr.Textbox(label="Submitter name")
|
| 186 |
with gr.Row():
|
|
|
|
| 198 |
fn=handle_submit,
|
| 199 |
inputs=[
|
| 200 |
zip_in,
|
| 201 |
+
submission_name_in,
|
| 202 |
submitter_in,
|
| 203 |
agent_url_in,
|
| 204 |
notes_in,
|
results.jsonl
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
-
{"submission_id": "
|
| 2 |
-
{"submission_id": "
|
|
|
|
|
|
| 1 |
+
{"submission_id": "HF_build123d_baseline_claude-opus-4-7", "submission_name": "HF build123d baseline (Claude Opus 4.7)", "submitter_name": "michaelr27", "agent_url": "https://github.com/MichaelRabinovich/LeForge", "notes": "", "submitted_at": "2026-05-26T12:02:31Z", "cadgenbench_version": "0.1.0", "cadgenbench_data_revision": "f4c58085b5eb", "aggregate_score": 0.6597, "validity_rate": 1.0, "score_by_task_type": {"generation": 0.6114, "editing": 0.9979}, "per_task_scores": {"editing": {"score": 0.9979, "validity_rate": 1.0, "n_fixtures": 1, "n_valid": 1, "n_invalid": 0, "n_missing": 0}, "generation": {"score": 0.6114, "validity_rate": 1.0, "n_fixtures": 7, "n_valid": 7, "n_invalid": 0, "n_missing": 0}}, "per_fixture_scores": {"jig-01-edit-double-hole": {"status": "valid", "cad_score": 0.9979, "task_type": "editing"}, "jig-01-single-hole-plate": {"status": "valid", "cad_score": 0.9984, "task_type": "generation"}, "jig-02-4hole-pattern-plate": {"status": "valid", "cad_score": 0.7688, "task_type": "generation"}, "jig-03-l-bracket-w-hex": {"status": "valid", "cad_score": 0.6047, "task_type": "generation"}, "jig-04-slot-and-2-holes-plate": {"status": "valid", "cad_score": 0.6758, "task_type": "generation"}, "nist-ctc-01": {"status": "valid", "cad_score": 0.4268, "task_type": "generation"}, "nist-ctc-03": {"status": "valid", "cad_score": 0.348, "task_type": "generation"}, "nist-ctc-05": {"status": "valid", "cad_score": 0.4571, "task_type": "generation"}}, "submission_blob_url": "https://huggingface.co/datasets/michaelr27/cadgenbench-submissions/resolve/main/submissions/HF_build123d_baseline_claude-opus-4-7.zip"}
|
| 2 |
+
{"submission_id": "HF_build123d_baseline_gemini-3.1-pro-preview", "submission_name": "HF build123d baseline (Gemini 3.1 Pro Preview)", "submitter_name": "michaelr27", "agent_url": "https://github.com/MichaelRabinovich/LeForge", "notes": "", "submitted_at": "2026-05-26T12:02:31Z", "cadgenbench_version": "0.1.0", "cadgenbench_data_revision": "f4c58085b5eb", "aggregate_score": 0.7267, "validity_rate": 1.0, "score_by_task_type": {"generation": 0.6879, "editing": 0.9982}, "per_task_scores": {"editing": {"score": 0.9982, "validity_rate": 1.0, "n_fixtures": 1, "n_valid": 1, "n_invalid": 0, "n_missing": 0}, "generation": {"score": 0.6879, "validity_rate": 1.0, "n_fixtures": 7, "n_valid": 7, "n_invalid": 0, "n_missing": 0}}, "per_fixture_scores": {"jig-01-edit-double-hole": {"status": "valid", "cad_score": 0.9982, "task_type": "editing"}, "jig-01-single-hole-plate": {"status": "valid", "cad_score": 0.9932, "task_type": "generation"}, "jig-02-4hole-pattern-plate": {"status": "valid", "cad_score": 0.8743, "task_type": "generation"}, "jig-03-l-bracket-w-hex": {"status": "valid", "cad_score": 0.579, "task_type": "generation"}, "jig-04-slot-and-2-holes-plate": {"status": "valid", "cad_score": 0.821, "task_type": "generation"}, "nist-ctc-01": {"status": "valid", "cad_score": 0.6155, "task_type": "generation"}, "nist-ctc-03": {"status": "valid", "cad_score": 0.4289, "task_type": "generation"}, "nist-ctc-05": {"status": "valid", "cad_score": 0.5031, "task_type": "generation"}}, "submission_blob_url": "https://huggingface.co/datasets/michaelr27/cadgenbench-submissions/resolve/main/submissions/HF_build123d_baseline_gemini-3.1-pro-preview.zip"}
|
| 3 |
+
{"submission_id": "HF_build123d_baseline_gpt-5.5", "submission_name": "HF build123d baseline (GPT-5.5)", "submitter_name": "michaelr27", "agent_url": "https://github.com/MichaelRabinovich/LeForge", "notes": "", "submitted_at": "2026-05-26T12:02:31Z", "cadgenbench_version": "0.1.0", "cadgenbench_data_revision": "f4c58085b5eb", "aggregate_score": 0.6805, "validity_rate": 1.0, "score_by_task_type": {"generation": 0.6351, "editing": 0.9982}, "per_task_scores": {"editing": {"score": 0.9982, "validity_rate": 1.0, "n_fixtures": 1, "n_valid": 1, "n_invalid": 0, "n_missing": 0}, "generation": {"score": 0.6351, "validity_rate": 1.0, "n_fixtures": 7, "n_valid": 7, "n_invalid": 0, "n_missing": 0}}, "per_fixture_scores": {"jig-01-edit-double-hole": {"status": "valid", "cad_score": 0.9982, "task_type": "editing"}, "jig-01-single-hole-plate": {"status": "valid", "cad_score": 0.9996, "task_type": "generation"}, "jig-02-4hole-pattern-plate": {"status": "valid", "cad_score": 0.717, "task_type": "generation"}, "jig-03-l-bracket-w-hex": {"status": "valid", "cad_score": 0.578, "task_type": "generation"}, "jig-04-slot-and-2-holes-plate": {"status": "valid", "cad_score": 0.8948, "task_type": "generation"}, "nist-ctc-01": {"status": "valid", "cad_score": 0.478, "task_type": "generation"}, "nist-ctc-03": {"status": "valid", "cad_score": 0.3619, "task_type": "generation"}, "nist-ctc-05": {"status": "valid", "cad_score": 0.4166, "task_type": "generation"}}, "submission_blob_url": "https://huggingface.co/datasets/michaelr27/cadgenbench-submissions/resolve/main/submissions/HF_build123d_baseline_gpt-5.5.zip"}
|