Michael Rabinovich Cursor committed on
Commit
952dbca
·
1 Parent(s): afed315

schema: rename model -> submission_name; seed with 3 baseline rows

Browse files

Mirrors the cadgenbench-submissions schema change. LEADERBOARD_COLS
now exposes submission_name as the lead identity column instead of model;
the handle_submit signature, the Submit-tab form Textbox, and the
meta.json example in the markdown are all updated to match.

Submit-tab markdown gets a short "Submission name" explainer making
explicit that the benchmark is system-agnostic: a submission may use
no LLM, one, or several. Submitters who want to disclose their stack
can put it in submission_name or notes.

Local results.jsonl mirrors the 3 baseline seed rows from
cadgenbench-submissions so the offline fallback shows real data, not
the old fake demos.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (2) hide show
  1. app.py +15 -10
  2. results.jsonl +3 -2
app.py CHANGED
@@ -26,7 +26,7 @@ HF_DATA_REPO = os.getenv("HF_DATA_REPO", f"{HF_ORG}/cadgenbench-data")
26
  LOCAL_RESULTS_PATH = Path(__file__).parent / "results.jsonl"
27
 
28
  LEADERBOARD_COLS = [
29
- "model",
30
  "submitter_name",
31
  "aggregate_score",
32
  "validity_rate",
@@ -85,7 +85,7 @@ def load_leaderboard() -> pd.DataFrame:
85
 
86
  def handle_submit(
87
  zip_file,
88
- model: str,
89
  submitter: str,
90
  agent_url: str,
91
  notes: str,
@@ -93,8 +93,8 @@ def handle_submit(
93
  ) -> str:
94
  if zip_file is None:
95
  return "**Error:** please attach a submission zip."
96
- if not model.strip():
97
- return "**Error:** please fill in the Model identifier."
98
  if not submitter.strip():
99
  return "**Error:** please fill in your Submitter name."
100
  if not agree:
@@ -102,7 +102,7 @@ def handle_submit(
102
 
103
  name = Path(zip_file.name).name
104
  return (
105
- f"Received `{name}` for model `{model}` by `{submitter}`.\n\n"
106
  f"_Evaluation is not wired yet (Step 6 of the build plan). Once it "
107
  f"is, this submission will run the CPU eval inline and append a row "
108
  f"to `{HF_SUBMISSIONS_REPO}`._"
@@ -155,13 +155,18 @@ with gr.Blocks(title="CADGenBench Leaderboard") as app:
155
  ```json
156
  {{
157
  "submitter_name": "your name or team",
158
- "model": "anthropic/claude-sonnet-4-6",
159
  "agent_url": "https://github.com/... (optional)",
160
  "notes": "free text, optional, max 500 chars, single line, plain text",
161
  "agree_to_publish": true
162
  }}
163
  ```
164
 
 
 
 
 
 
165
  **Notes field.** Plain text only (no markdown / HTML). Capped at 500 chars
166
  and stripped to a single line. Shown in the per-submission detail view,
167
  not in the main leaderboard table.
@@ -173,9 +178,9 @@ The Space runs the CPU eval inline and appends a row to
173
  )
174
  zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
175
  with gr.Row():
176
- model_in = gr.Textbox(
177
- label="Model identifier",
178
- placeholder="e.g. anthropic/claude-sonnet-4-6",
179
  )
180
  submitter_in = gr.Textbox(label="Submitter name")
181
  with gr.Row():
@@ -193,7 +198,7 @@ The Space runs the CPU eval inline and appends a row to
193
  fn=handle_submit,
194
  inputs=[
195
  zip_in,
196
- model_in,
197
  submitter_in,
198
  agent_url_in,
199
  notes_in,
 
26
  LOCAL_RESULTS_PATH = Path(__file__).parent / "results.jsonl"
27
 
28
  LEADERBOARD_COLS = [
29
+ "submission_name",
30
  "submitter_name",
31
  "aggregate_score",
32
  "validity_rate",
 
85
 
86
  def handle_submit(
87
  zip_file,
88
+ submission_name: str,
89
  submitter: str,
90
  agent_url: str,
91
  notes: str,
 
93
  ) -> str:
94
  if zip_file is None:
95
  return "**Error:** please attach a submission zip."
96
+ if not submission_name.strip():
97
+ return "**Error:** please fill in the Submission name."
98
  if not submitter.strip():
99
  return "**Error:** please fill in your Submitter name."
100
  if not agree:
 
102
 
103
  name = Path(zip_file.name).name
104
  return (
105
+ f"Received `{name}` - submission `{submission_name}` by `{submitter}`.\n\n"
106
  f"_Evaluation is not wired yet (Step 6 of the build plan). Once it "
107
  f"is, this submission will run the CPU eval inline and append a row "
108
  f"to `{HF_SUBMISSIONS_REPO}`._"
 
155
  ```json
156
  {{
157
  "submitter_name": "your name or team",
158
+ "submission_name": "MyAgent v2.3 (or whatever describes your system)",
159
  "agent_url": "https://github.com/... (optional)",
160
  "notes": "free text, optional, max 500 chars, single line, plain text",
161
  "agree_to_publish": true
162
  }}
163
  ```
164
 
165
+ **Submission name.** Free text describing the system being benchmarked,
166
+ however you choose to describe it. The benchmark is system-agnostic - your
167
+ submission may use no LLM, one, or many. If you want to disclose your
168
+ stack, put it here or in `notes`.
169
+
170
  **Notes field.** Plain text only (no markdown / HTML). Capped at 500 chars
171
  and stripped to a single line. Shown in the per-submission detail view,
172
  not in the main leaderboard table.
 
178
  )
179
  zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
180
  with gr.Row():
181
+ submission_name_in = gr.Textbox(
182
+ label="Submission name",
183
+ placeholder='e.g. "MyAgent v2.3" or "build123d baseline (Claude Opus 4.7)"',
184
  )
185
  submitter_in = gr.Textbox(label="Submitter name")
186
  with gr.Row():
 
198
  fn=handle_submit,
199
  inputs=[
200
  zip_in,
201
+ submission_name_in,
202
  submitter_in,
203
  agent_url_in,
204
  notes_in,
results.jsonl CHANGED
@@ -1,2 +1,3 @@
1
- {"submission_id": "demo-001", "submitter_name": "Reference (dev seed)", "model": "anthropic/claude-sonnet-4-6", "agent_url": "https://github.com/MichaelRabinovich/LeForge", "notes": "seed row for UI dev", "submitted_at": "2026-05-26T08:00:00Z", "cadgenbench_version": "0.0.0-dev", "cadgenbench_data_revision": "stub", "validity_rate": 1.0, "aggregate_score": 0.42, "per_fixture_scores": {"jig-01-single-hole-plate": 0.85, "jig-02-4hole-pattern-plate": 0.31}, "submission_blob_url": null}
2
- {"submission_id": "demo-002", "submitter_name": "Reference (dev seed)", "model": "openai/gpt-5.3", "agent_url": null, "notes": "second seed row", "submitted_at": "2026-05-26T08:30:00Z", "cadgenbench_version": "0.0.0-dev", "cadgenbench_data_revision": "stub", "validity_rate": 0.5, "aggregate_score": 0.18, "per_fixture_scores": {"jig-01-single-hole-plate": 0.36, "jig-02-4hole-pattern-plate": 0.0}, "submission_blob_url": null}
 
 
1
+ {"submission_id": "HF_build123d_baseline_claude-opus-4-7", "submission_name": "HF build123d baseline (Claude Opus 4.7)", "submitter_name": "michaelr27", "agent_url": "https://github.com/MichaelRabinovich/LeForge", "notes": "", "submitted_at": "2026-05-26T12:02:31Z", "cadgenbench_version": "0.1.0", "cadgenbench_data_revision": "f4c58085b5eb", "aggregate_score": 0.6597, "validity_rate": 1.0, "score_by_task_type": {"generation": 0.6114, "editing": 0.9979}, "per_task_scores": {"editing": {"score": 0.9979, "validity_rate": 1.0, "n_fixtures": 1, "n_valid": 1, "n_invalid": 0, "n_missing": 0}, "generation": {"score": 0.6114, "validity_rate": 1.0, "n_fixtures": 7, "n_valid": 7, "n_invalid": 0, "n_missing": 0}}, "per_fixture_scores": {"jig-01-edit-double-hole": {"status": "valid", "cad_score": 0.9979, "task_type": "editing"}, "jig-01-single-hole-plate": {"status": "valid", "cad_score": 0.9984, "task_type": "generation"}, "jig-02-4hole-pattern-plate": {"status": "valid", "cad_score": 0.7688, "task_type": "generation"}, "jig-03-l-bracket-w-hex": {"status": "valid", "cad_score": 0.6047, "task_type": "generation"}, "jig-04-slot-and-2-holes-plate": {"status": "valid", "cad_score": 0.6758, "task_type": "generation"}, "nist-ctc-01": {"status": "valid", "cad_score": 0.4268, "task_type": "generation"}, "nist-ctc-03": {"status": "valid", "cad_score": 0.348, "task_type": "generation"}, "nist-ctc-05": {"status": "valid", "cad_score": 0.4571, "task_type": "generation"}}, "submission_blob_url": "https://huggingface.co/datasets/michaelr27/cadgenbench-submissions/resolve/main/submissions/HF_build123d_baseline_claude-opus-4-7.zip"}
2
+ {"submission_id": "HF_build123d_baseline_gemini-3.1-pro-preview", "submission_name": "HF build123d baseline (Gemini 3.1 Pro Preview)", "submitter_name": "michaelr27", "agent_url": "https://github.com/MichaelRabinovich/LeForge", "notes": "", "submitted_at": "2026-05-26T12:02:31Z", "cadgenbench_version": "0.1.0", "cadgenbench_data_revision": "f4c58085b5eb", "aggregate_score": 0.7267, "validity_rate": 1.0, "score_by_task_type": {"generation": 0.6879, "editing": 0.9982}, "per_task_scores": {"editing": {"score": 0.9982, "validity_rate": 1.0, "n_fixtures": 1, "n_valid": 1, "n_invalid": 0, "n_missing": 0}, "generation": {"score": 0.6879, "validity_rate": 1.0, "n_fixtures": 7, "n_valid": 7, "n_invalid": 0, "n_missing": 0}}, "per_fixture_scores": {"jig-01-edit-double-hole": {"status": "valid", "cad_score": 0.9982, "task_type": "editing"}, "jig-01-single-hole-plate": {"status": "valid", "cad_score": 0.9932, "task_type": "generation"}, "jig-02-4hole-pattern-plate": {"status": "valid", "cad_score": 0.8743, "task_type": "generation"}, "jig-03-l-bracket-w-hex": {"status": "valid", "cad_score": 0.579, "task_type": "generation"}, "jig-04-slot-and-2-holes-plate": {"status": "valid", "cad_score": 0.821, "task_type": "generation"}, "nist-ctc-01": {"status": "valid", "cad_score": 0.6155, "task_type": "generation"}, "nist-ctc-03": {"status": "valid", "cad_score": 0.4289, "task_type": "generation"}, "nist-ctc-05": {"status": "valid", "cad_score": 0.5031, "task_type": "generation"}}, "submission_blob_url": "https://huggingface.co/datasets/michaelr27/cadgenbench-submissions/resolve/main/submissions/HF_build123d_baseline_gemini-3.1-pro-preview.zip"}
3
+ {"submission_id": "HF_build123d_baseline_gpt-5.5", "submission_name": "HF build123d baseline (GPT-5.5)", "submitter_name": "michaelr27", "agent_url": "https://github.com/MichaelRabinovich/LeForge", "notes": "", "submitted_at": "2026-05-26T12:02:31Z", "cadgenbench_version": "0.1.0", "cadgenbench_data_revision": "f4c58085b5eb", "aggregate_score": 0.6805, "validity_rate": 1.0, "score_by_task_type": {"generation": 0.6351, "editing": 0.9982}, "per_task_scores": {"editing": {"score": 0.9982, "validity_rate": 1.0, "n_fixtures": 1, "n_valid": 1, "n_invalid": 0, "n_missing": 0}, "generation": {"score": 0.6351, "validity_rate": 1.0, "n_fixtures": 7, "n_valid": 7, "n_invalid": 0, "n_missing": 0}}, "per_fixture_scores": {"jig-01-edit-double-hole": {"status": "valid", "cad_score": 0.9982, "task_type": "editing"}, "jig-01-single-hole-plate": {"status": "valid", "cad_score": 0.9996, "task_type": "generation"}, "jig-02-4hole-pattern-plate": {"status": "valid", "cad_score": 0.717, "task_type": "generation"}, "jig-03-l-bracket-w-hex": {"status": "valid", "cad_score": 0.578, "task_type": "generation"}, "jig-04-slot-and-2-holes-plate": {"status": "valid", "cad_score": 0.8948, "task_type": "generation"}, "nist-ctc-01": {"status": "valid", "cad_score": 0.478, "task_type": "generation"}, "nist-ctc-03": {"status": "valid", "cad_score": 0.3619, "task_type": "generation"}, "nist-ctc-05": {"status": "valid", "cad_score": 0.4166, "task_type": "generation"}}, "submission_blob_url": "https://huggingface.co/datasets/michaelr27/cadgenbench-submissions/resolve/main/submissions/HF_build123d_baseline_gpt-5.5.zip"}