Spaces:

HuggingAI4Engineering
/

cadgenbench-leaderboard

Running

Michael Rabinovich and Cursor committed 11 days ago

Commit

952dbca

1 Parent(s): afed315

schema: rename model -> submission_name; seed with 3 baseline rows

Mirrors the cadgenbench-submissions schema change. LEADERBOARD_COLS
now exposes submission_name as the lead identity column instead of model;
the handle_submit signature, the Submit-tab form Textbox, and the
meta.json example in the markdown are all updated to use the new field.

Submit-tab markdown gets a short "Submission name" explainer making
explicit that the benchmark is system-agnostic - a submission may use
no LLM, one, or several. If the submitter wants to disclose their
stack they put it in submission_name or notes.

Local results.jsonl mirrors the 3 baseline seed rows from
cadgenbench-submissions so the offline fallback shows real data, not
the old fake demos.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (2) hide show

app.py +15 -10
results.jsonl +3 -2

app.py CHANGED Viewed

@@ -26,7 +26,7 @@ HF_DATA_REPO = os.getenv("HF_DATA_REPO", f"{HF_ORG}/cadgenbench-data")
 LOCAL_RESULTS_PATH = Path(__file__).parent / "results.jsonl"
 LEADERBOARD_COLS = [
-    "model",
     "submitter_name",
     "aggregate_score",
     "validity_rate",
@@ -85,7 +85,7 @@ def load_leaderboard() -> pd.DataFrame:
 def handle_submit(
     zip_file,
-    model: str,
     submitter: str,
     agent_url: str,
     notes: str,
@@ -93,8 +93,8 @@ def handle_submit(
 ) -> str:
     if zip_file is None:
         return "**Error:** please attach a submission zip."
-    if not model.strip():
-        return "**Error:** please fill in the Model identifier."
     if not submitter.strip():
         return "**Error:** please fill in your Submitter name."
     if not agree:
@@ -102,7 +102,7 @@ def handle_submit(
     name = Path(zip_file.name).name
     return (
-        f"Received `{name}` for model `{model}` by `{submitter}`.\n\n"
         f"_Evaluation is not wired yet (Step 6 of the build plan). Once it "
         f"is, this submission will run the CPU eval inline and append a row "
         f"to `{HF_SUBMISSIONS_REPO}`._"
@@ -155,13 +155,18 @@ with gr.Blocks(title="CADGenBench Leaderboard") as app:
 ```json
 {{
   "submitter_name": "your name or team",
-  "model": "anthropic/claude-sonnet-4-6",
   "agent_url": "https://github.com/...   (optional)",
   "notes": "free text, optional, max 500 chars, single line, plain text",
   "agree_to_publish": true
 }}
 ```
 **Notes field.** Plain text only (no markdown / HTML). Capped at 500 chars
 and stripped to a single line. Shown in the per-submission detail view,
 not in the main leaderboard table.
@@ -173,9 +178,9 @@ The Space runs the CPU eval inline and appends a row to
         )
         zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
         with gr.Row():
-            model_in = gr.Textbox(
-                label="Model identifier",
-                placeholder="e.g. anthropic/claude-sonnet-4-6",
             )
             submitter_in = gr.Textbox(label="Submitter name")
         with gr.Row():
@@ -193,7 +198,7 @@ The Space runs the CPU eval inline and appends a row to
             fn=handle_submit,
             inputs=[
                 zip_in,
-                model_in,
                 submitter_in,
                 agent_url_in,
                 notes_in,

 LOCAL_RESULTS_PATH = Path(__file__).parent / "results.jsonl"
 LEADERBOARD_COLS = [
+    "submission_name",
     "submitter_name",
     "aggregate_score",
     "validity_rate",
 def handle_submit(
     zip_file,
+    submission_name: str,
     submitter: str,
     agent_url: str,
     notes: str,
 ) -> str:
     if zip_file is None:
         return "**Error:** please attach a submission zip."
+    if not submission_name.strip():
+        return "**Error:** please fill in the Submission name."
     if not submitter.strip():
         return "**Error:** please fill in your Submitter name."
     if not agree:
     name = Path(zip_file.name).name
     return (
+        f"Received `{name}` - submission `{submission_name}` by `{submitter}`.\n\n"
         f"_Evaluation is not wired yet (Step 6 of the build plan). Once it "
         f"is, this submission will run the CPU eval inline and append a row "
         f"to `{HF_SUBMISSIONS_REPO}`._"
 ```json
 {{
   "submitter_name": "your name or team",
+  "submission_name": "MyAgent v2.3 (or whatever describes your system)",
   "agent_url": "https://github.com/...   (optional)",
   "notes": "free text, optional, max 500 chars, single line, plain text",
   "agree_to_publish": true
 }}
 ```
+**Submission name.** Free text describing the system being benchmarked,
+however you choose to describe it. The benchmark is system-agnostic - your
+submission may use no LLM, one, or many. If you want to disclose your
+stack, put it here or in `notes`.
 **Notes field.** Plain text only (no markdown / HTML). Capped at 500 chars
 and stripped to a single line. Shown in the per-submission detail view,
 not in the main leaderboard table.
         )
         zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
         with gr.Row():
+            submission_name_in = gr.Textbox(
+                label="Submission name",
+                placeholder='e.g. "MyAgent v2.3" or "build123d baseline (Claude Opus 4.7)"',
             )
             submitter_in = gr.Textbox(label="Submitter name")
         with gr.Row():
             fn=handle_submit,
             inputs=[
                 zip_in,
+                submission_name_in,
                 submitter_in,
                 agent_url_in,
                 notes_in,

results.jsonl CHANGED Viewed

@@ -1,2 +1,3 @@
-{"submission_id": "demo-001", "submitter_name": "Reference (dev seed)", "model": "anthropic/claude-sonnet-4-6", "agent_url": "https://github.com/MichaelRabinovich/LeForge", "notes": "seed row for UI dev", "submitted_at": "2026-05-26T08:00:00Z", "cadgenbench_version": "0.0.0-dev", "cadgenbench_data_revision": "stub", "validity_rate": 1.0, "aggregate_score": 0.42, "per_fixture_scores": {"jig-01-single-hole-plate": 0.85, "jig-02-4hole-pattern-plate": 0.31}, "submission_blob_url": null}
-{"submission_id": "demo-002", "submitter_name": "Reference (dev seed)", "model": "openai/gpt-5.3", "agent_url": null, "notes": "second seed row", "submitted_at": "2026-05-26T08:30:00Z", "cadgenbench_version": "0.0.0-dev", "cadgenbench_data_revision": "stub", "validity_rate": 0.5, "aggregate_score": 0.18, "per_fixture_scores": {"jig-01-single-hole-plate": 0.36, "jig-02-4hole-pattern-plate": 0.0}, "submission_blob_url": null}

+{"submission_id": "HF_build123d_baseline_claude-opus-4-7", "submission_name": "HF build123d baseline (Claude Opus 4.7)", "submitter_name": "michaelr27", "agent_url": "https://github.com/MichaelRabinovich/LeForge", "notes": "", "submitted_at": "2026-05-26T12:02:31Z", "cadgenbench_version": "0.1.0", "cadgenbench_data_revision": "f4c58085b5eb", "aggregate_score": 0.6597, "validity_rate": 1.0, "score_by_task_type": {"generation": 0.6114, "editing": 0.9979}, "per_task_scores": {"editing": {"score": 0.9979, "validity_rate": 1.0, "n_fixtures": 1, "n_valid": 1, "n_invalid": 0, "n_missing": 0}, "generation": {"score": 0.6114, "validity_rate": 1.0, "n_fixtures": 7, "n_valid": 7, "n_invalid": 0, "n_missing": 0}}, "per_fixture_scores": {"jig-01-edit-double-hole": {"status": "valid", "cad_score": 0.9979, "task_type": "editing"}, "jig-01-single-hole-plate": {"status": "valid", "cad_score": 0.9984, "task_type": "generation"}, "jig-02-4hole-pattern-plate": {"status": "valid", "cad_score": 0.7688, "task_type": "generation"}, "jig-03-l-bracket-w-hex": {"status": "valid", "cad_score": 0.6047, "task_type": "generation"}, "jig-04-slot-and-2-holes-plate": {"status": "valid", "cad_score": 0.6758, "task_type": "generation"}, "nist-ctc-01": {"status": "valid", "cad_score": 0.4268, "task_type": "generation"}, "nist-ctc-03": {"status": "valid", "cad_score": 0.348, "task_type": "generation"}, "nist-ctc-05": {"status": "valid", "cad_score": 0.4571, "task_type": "generation"}}, "submission_blob_url": "https://huggingface.co/datasets/michaelr27/cadgenbench-submissions/resolve/main/submissions/HF_build123d_baseline_claude-opus-4-7.zip"}
+{"submission_id": "HF_build123d_baseline_gemini-3.1-pro-preview", "submission_name": "HF build123d baseline (Gemini 3.1 Pro Preview)", "submitter_name": "michaelr27", "agent_url": "https://github.com/MichaelRabinovich/LeForge", "notes": "", "submitted_at": "2026-05-26T12:02:31Z", "cadgenbench_version": "0.1.0", "cadgenbench_data_revision": "f4c58085b5eb", "aggregate_score": 0.7267, "validity_rate": 1.0, "score_by_task_type": {"generation": 0.6879, "editing": 0.9982}, "per_task_scores": {"editing": {"score": 0.9982, "validity_rate": 1.0, "n_fixtures": 1, "n_valid": 1, "n_invalid": 0, "n_missing": 0}, "generation": {"score": 0.6879, "validity_rate": 1.0, "n_fixtures": 7, "n_valid": 7, "n_invalid": 0, "n_missing": 0}}, "per_fixture_scores": {"jig-01-edit-double-hole": {"status": "valid", "cad_score": 0.9982, "task_type": "editing"}, "jig-01-single-hole-plate": {"status": "valid", "cad_score": 0.9932, "task_type": "generation"}, "jig-02-4hole-pattern-plate": {"status": "valid", "cad_score": 0.8743, "task_type": "generation"}, "jig-03-l-bracket-w-hex": {"status": "valid", "cad_score": 0.579, "task_type": "generation"}, "jig-04-slot-and-2-holes-plate": {"status": "valid", "cad_score": 0.821, "task_type": "generation"}, "nist-ctc-01": {"status": "valid", "cad_score": 0.6155, "task_type": "generation"}, "nist-ctc-03": {"status": "valid", "cad_score": 0.4289, "task_type": "generation"}, "nist-ctc-05": {"status": "valid", "cad_score": 0.5031, "task_type": "generation"}}, "submission_blob_url": "https://huggingface.co/datasets/michaelr27/cadgenbench-submissions/resolve/main/submissions/HF_build123d_baseline_gemini-3.1-pro-preview.zip"}
+{"submission_id": "HF_build123d_baseline_gpt-5.5", "submission_name": "HF build123d baseline (GPT-5.5)", "submitter_name": "michaelr27", "agent_url": "https://github.com/MichaelRabinovich/LeForge", "notes": "", "submitted_at": "2026-05-26T12:02:31Z", "cadgenbench_version": "0.1.0", "cadgenbench_data_revision": "f4c58085b5eb", "aggregate_score": 0.6805, "validity_rate": 1.0, "score_by_task_type": {"generation": 0.6351, "editing": 0.9982}, "per_task_scores": {"editing": {"score": 0.9982, "validity_rate": 1.0, "n_fixtures": 1, "n_valid": 1, "n_invalid": 0, "n_missing": 0}, "generation": {"score": 0.6351, "validity_rate": 1.0, "n_fixtures": 7, "n_valid": 7, "n_invalid": 0, "n_missing": 0}}, "per_fixture_scores": {"jig-01-edit-double-hole": {"status": "valid", "cad_score": 0.9982, "task_type": "editing"}, "jig-01-single-hole-plate": {"status": "valid", "cad_score": 0.9996, "task_type": "generation"}, "jig-02-4hole-pattern-plate": {"status": "valid", "cad_score": 0.717, "task_type": "generation"}, "jig-03-l-bracket-w-hex": {"status": "valid", "cad_score": 0.578, "task_type": "generation"}, "jig-04-slot-and-2-holes-plate": {"status": "valid", "cad_score": 0.8948, "task_type": "generation"}, "nist-ctc-01": {"status": "valid", "cad_score": 0.478, "task_type": "generation"}, "nist-ctc-03": {"status": "valid", "cad_score": 0.3619, "task_type": "generation"}, "nist-ctc-05": {"status": "valid", "cad_score": 0.4166, "task_type": "generation"}}, "submission_blob_url": "https://huggingface.co/datasets/michaelr27/cadgenbench-submissions/resolve/main/submissions/HF_build123d_baseline_gpt-5.5.zip"}