step 3: gradio leaderboard scaffold
Replace the static placeholder index with a Gradio app: Leaderboard tab
backed by a hand-crafted results.jsonl, Submit tab (UI-only stub, real
eval wiring lands in step 6), About tab. Org slugs parameterised via
HF_ORG / HF_SUBMISSIONS_REPO / HF_DATA_REPO env vars so the future move
to the science org is a secret change, not a code change.
Files:
- app.py: gradio 6.14 app, three tabs
- requirements.txt: gradio, pandas, huggingface_hub, datasets
- results.jsonl: two dev seed rows
- README.md: sdk: static -> sdk: gradio
- legacy/: archive of the prototype nist comparison HTMLs
- .gitignore: __pycache__, .venv, .gradio
Schema: validity_rate (float [0,1]) instead of valid (bool); notes kept
in the row but hidden from the visible table. All copy uses "AI-driven"
rather than "LLM-driven".
Co-authored-by: Cursor <cursoragent@cursor.com>
- .gitignore +6 -0
- README.md +18 -6
- app.py +180 -0
- index.html +0 -14
- nist_comparison_3d.html → legacy/nist_comparison_3d.html +0 -0
- nist_hf_comparison_3d.html → legacy/nist_hf_comparison_3d.html +0 -0
- requirements.txt +4 -0
- results.jsonl +2 -0
- style.css +0 -28
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
.venv/
|
| 4 |
+
venv/
|
| 5 |
+
.env
|
| 6 |
+
.gradio/
|
|
@@ -1,11 +1,23 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: indigo
|
| 5 |
colorTo: pink
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: CADGenBench Leaderboard
|
| 3 |
+
emoji: 🔧
|
| 4 |
colorFrom: indigo
|
| 5 |
colorTo: pink
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 6.14.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: true
|
| 10 |
+
short_description: Leaderboard for AI-driven CAD generation
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# CADGenBench Leaderboard
|
| 14 |
+
|
| 15 |
+
A benchmark for AI-driven CAD generation: how well can a model turn a
|
| 16 |
+
textual or visual description of a mechanical part into a valid,
|
| 17 |
+
geometrically correct 3D model?
|
| 18 |
+
|
| 19 |
+
**Status:** in active development under `michaelr27/AI4Engineering`. Will
|
| 20 |
+
move to `science/cadgenbench-leaderboard` before going public. Reference
|
| 21 |
+
shape inspired by [`adyen/DABstep`](https://huggingface.co/spaces/adyen/DABstep).
|
| 22 |
+
|
| 23 |
+
The previous static-prototype HTML artifacts live in [`legacy/`](legacy/).
|
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CADGenBench Leaderboard Space.
|
| 2 |
+
|
| 3 |
+
Step 3 prototype: a hand-crafted ``results.jsonl`` drives the leaderboard
|
| 4 |
+
table, and the Submit tab is a UI-only stub. The read path (Step 5) will
|
| 5 |
+
swap the JSONL for ``datasets.load_dataset(HF_SUBMISSIONS_REPO, 'results')``
|
| 6 |
+
and the write path (Step 6) will run ``cadgenbench evaluate`` and push a
|
| 7 |
+
result row back to the submissions dataset via ``HfApi``.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
import os
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
import gradio as gr
|
| 17 |
+
import pandas as pd
|
| 18 |
+
|
| 19 |
+
def env_or_default(name: str, default: str) -> str:
    """Return environment variable *name*, or *default* when unset or blank.

    ``os.getenv(name, default)`` only falls back when the variable is absent,
    so a variable that is defined but empty would yield repo ids such as
    ``"/cadgenbench-data"``. Blank (or whitespace-only) values are therefore
    treated the same as unset ones.
    """
    value = os.getenv(name, "").strip()
    return value or default


# Hub namespace used to build the default repo ids below.
HF_ORG = env_or_default("HF_ORG", "michaelr27")
# Each repo id can also be overridden individually via its own variable.
HF_SUBMISSIONS_REPO = env_or_default(
    "HF_SUBMISSIONS_REPO", f"{HF_ORG}/cadgenbench-submissions"
)
HF_DATA_REPO = env_or_default("HF_DATA_REPO", f"{HF_ORG}/cadgenbench-data")

# Results file read by the leaderboard, resolved next to this module.
LOCAL_RESULTS_PATH = Path(__file__).parent / "results.jsonl"

# Row keys kept for the leaderboard table, in display order.
LEADERBOARD_COLS = [
    "model",
    "submitter_name",
    "aggregate_score",
    "validity_rate",
    "submitted_at",
    "cadgenbench_version",
]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def load_leaderboard() -> pd.DataFrame:
    """Load the leaderboard table from the local ``results.jsonl`` file.

    Each non-blank line of the file is parsed as one JSON object. Only the
    columns listed in ``LEADERBOARD_COLS`` that actually occur in the data
    are kept. Rows are ordered by ``aggregate_score`` (highest first, missing
    scores last) when that column is present.

    Returns:
        A frame with a fresh ``0..n-1`` index. When the file is missing or
        contains no rows, an empty frame carrying all ``LEADERBOARD_COLS``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not LOCAL_RESULTS_PATH.exists():
        return pd.DataFrame(columns=LEADERBOARD_COLS)

    # Decode explicitly as UTF-8; the platform default encoding varies.
    text = LOCAL_RESULTS_PATH.read_text(encoding="utf-8")
    rows = [json.loads(line) for line in text.splitlines() if line.strip()]
    if not rows:
        return pd.DataFrame(columns=LEADERBOARD_COLS)

    df = pd.DataFrame(rows)
    visible = df[[col for col in LEADERBOARD_COLS if col in df.columns]]
    # Sorting by an absent column raises KeyError, so keep file order when
    # no row provides an aggregate score.
    if "aggregate_score" in visible.columns:
        visible = visible.sort_values(
            "aggregate_score", ascending=False, na_position="last"
        )
    return visible.reset_index(drop=True)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def handle_submit(
    zip_file,
    model: str,
    submitter: str,
    agent_url: str,
    notes: str,
    agree: bool,
) -> str:
    """Validate the Submit form and return a Markdown status message.

    This is a UI-only stub: nothing is evaluated or stored. ``agent_url`` and
    ``notes`` are accepted but not used yet.

    Args:
        zip_file: The uploaded archive, or ``None`` when nothing is attached.
            May be a path string or an object whose ``name`` attribute holds
            the path; which one the UI passes depends on the Gradio version
            (TODO confirm for the pinned release).
        model: Model identifier; required.
        submitter: Submitter name; required.
        agent_url: Optional agent / paper URL (currently unused).
        notes: Optional free-text notes (currently unused).
        agree: Whether the submitter agreed to publish the result.

    Returns:
        An ``**Error:** ...`` message for the first failed check, otherwise
        an acknowledgement naming the uploaded file, model and submitter.
    """
    # Text inputs may arrive as None; normalise once so the checks cannot
    # crash and the echoed values carry no stray whitespace.
    model = (model or "").strip()
    submitter = (submitter or "").strip()

    if zip_file is None:
        return "**Error:** please attach a submission zip."
    if not model:
        return "**Error:** please fill in the Model identifier."
    if not submitter:
        return "**Error:** please fill in your Submitter name."
    if not agree:
        return "**Error:** you must agree to publish before submitting."

    # Accept both a plain path string and a file-like object with ``.name``.
    upload_path = getattr(zip_file, "name", zip_file)
    name = Path(upload_path).name
    return (
        f"Received `{name}` for model `{model}` by `{submitter}`.\n\n"
        f"_Evaluation is not wired yet (Step 6 of the build plan). Once it "
        f"is, this submission will run the CPU eval inline and append a row "
        f"to `{HF_SUBMISSIONS_REPO}`._"
    )
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
ABOUT_MD = f"""## About
|
| 83 |
+
|
| 84 |
+
**CADGenBench** evaluates AI-driven CAD generation: how well a model can
|
| 85 |
+
turn a description of a mechanical part into a valid, geometrically
|
| 86 |
+
correct 3D model.
|
| 87 |
+
|
| 88 |
+
- Reference baseline: an iterative AI agent that writes build123d Python.
|
| 89 |
+
- Submission flow: upload a zip of per-fixture STEP files; the Space runs
|
| 90 |
+
the CPU eval and appends a row to the submissions dataset.
|
| 91 |
+
- Datasets: fixtures (inputs + ground truth) live in `{HF_DATA_REPO}`;
|
| 92 |
+
submissions and computed results live in `{HF_SUBMISSIONS_REPO}`.
|
| 93 |
+
|
| 94 |
+
### Status
|
| 95 |
+
|
| 96 |
+
This Space is in **active development** under `{HF_ORG}/AI4Engineering` and
|
| 97 |
+
will move to `science/cadgenbench-leaderboard` before going public. See
|
| 98 |
+
`space-setup/` in the source tree for the full build plan.
|
| 99 |
+
"""
|
| 100 |
+
|
| 101 |
+
# Top-level UI: a title banner followed by Leaderboard, Submit and About tabs.
with gr.Blocks(title="CADGenBench Leaderboard") as app:
    gr.Markdown(
        "# CADGenBench Leaderboard\n"
        "_Benchmarking AI-driven CAD generation._"
    )

    with gr.Tab("Leaderboard"):
        df_view = gr.Dataframe(
            # Pass the loader itself rather than its result: a callable value
            # is re-evaluated on every page load, so new visitors do not see
            # a table frozen at import time.
            value=load_leaderboard,
            interactive=False,
            wrap=True,
            label="Results (sorted by aggregate CAD score)",
        )
        # Manual reload without refreshing the whole page.
        refresh_btn = gr.Button("Refresh", size="sm")
        refresh_btn.click(fn=load_leaderboard, outputs=df_view)

    with gr.Tab("Submit"):
        # Doubled braces below are literal braces inside the f-string.
        gr.Markdown(
            f"""
**Submission format.** A single zip with:

- one folder per fixture in `{HF_DATA_REPO}`, each containing `output.step`;
- a top-level `meta.json`:

```json
{{
"submitter_name": "your name or team",
"model": "anthropic/claude-sonnet-4-6",
"agent_url": "https://github.com/... (optional)",
"notes": "free text, optional, max 500 chars, single line, plain text",
"agree_to_publish": true
}}
```

**Notes field.** Plain text only (no markdown / HTML). Capped at 500 chars
and stripped to a single line. Shown in the per-submission detail view,
not in the main leaderboard table.

The Space runs the CPU eval inline and appends a row to
`{HF_SUBMISSIONS_REPO}`. You can fill the fields below to override
`meta.json` for a quick test.
"""
        )
        zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
        with gr.Row():
            model_in = gr.Textbox(
                label="Model identifier",
                placeholder="e.g. anthropic/claude-sonnet-4-6",
            )
            submitter_in = gr.Textbox(label="Submitter name")
        with gr.Row():
            agent_url_in = gr.Textbox(
                label="Agent / paper URL (optional)",
                placeholder="https://github.com/...",
            )
            notes_in = gr.Textbox(label="Notes (optional)")
        agree_in = gr.Checkbox(
            label="I agree to publish this result on the public leaderboard."
        )
        submit_btn = gr.Button("Submit", variant="primary")
        submit_out = gr.Markdown()
        # Input order must match the positional parameters of handle_submit.
        submit_btn.click(
            fn=handle_submit,
            inputs=[
                zip_in,
                model_in,
                submitter_in,
                agent_url_in,
                notes_in,
                agree_in,
            ],
            outputs=submit_out,
        )

    with gr.Tab("About"):
        gr.Markdown(ABOUT_MD)


# Start the server only when this file is run as a script.
if __name__ == "__main__":
    app.launch(theme=gr.themes.Soft())
|
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
<!doctype html>
|
| 2 |
-
<html>
|
| 3 |
-
<head>
|
| 4 |
-
<meta charset="utf-8" />
|
| 5 |
-
<meta name="viewport" content="width=device-width" />
|
| 6 |
-
<title>My static Space</title>
|
| 7 |
-
<link rel="stylesheet" href="style.css" />
|
| 8 |
-
</head>
|
| 9 |
-
<body>
|
| 10 |
-
<div id="app"></div>
|
| 11 |
-
|
| 12 |
-
<hr> Hello </hr>
|
| 13 |
-
</body>
|
| 14 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==6.14.0
|
| 2 |
+
pandas>=2.0
|
| 3 |
+
huggingface_hub>=0.27.0
|
| 4 |
+
datasets>=3.0
|
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"submission_id": "demo-001", "submitter_name": "Reference (dev seed)", "model": "anthropic/claude-sonnet-4-6", "agent_url": "https://github.com/MichaelRabinovich/LeForge", "notes": "seed row for UI dev", "submitted_at": "2026-05-26T08:00:00Z", "cadgenbench_version": "0.0.0-dev", "cadgenbench_data_revision": "stub", "validity_rate": 1.0, "aggregate_score": 0.42, "per_fixture_scores": {"jig-01-single-hole-plate": 0.85, "jig-02-4hole-pattern-plate": 0.31}, "submission_blob_url": null}
|
| 2 |
+
{"submission_id": "demo-002", "submitter_name": "Reference (dev seed)", "model": "openai/gpt-5.3", "agent_url": null, "notes": "second seed row", "submitted_at": "2026-05-26T08:30:00Z", "cadgenbench_version": "0.0.0-dev", "cadgenbench_data_revision": "stub", "validity_rate": 0.5, "aggregate_score": 0.18, "per_fixture_scores": {"jig-01-single-hole-plate": 0.36, "jig-02-4hole-pattern-plate": 0.0}, "submission_blob_url": null}
|
|
@@ -1,28 +0,0 @@
|
|
| 1 |
-
body {
|
| 2 |
-
padding: 2rem;
|
| 3 |
-
font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
|
| 4 |
-
}
|
| 5 |
-
|
| 6 |
-
h1 {
|
| 7 |
-
font-size: 16px;
|
| 8 |
-
margin-top: 0;
|
| 9 |
-
}
|
| 10 |
-
|
| 11 |
-
p {
|
| 12 |
-
color: rgb(107, 114, 128);
|
| 13 |
-
font-size: 15px;
|
| 14 |
-
margin-bottom: 10px;
|
| 15 |
-
margin-top: 5px;
|
| 16 |
-
}
|
| 17 |
-
|
| 18 |
-
.card {
|
| 19 |
-
max-width: 620px;
|
| 20 |
-
margin: 0 auto;
|
| 21 |
-
padding: 16px;
|
| 22 |
-
border: 1px solid lightgray;
|
| 23 |
-
border-radius: 16px;
|
| 24 |
-
}
|
| 25 |
-
|
| 26 |
-
.card p:last-child {
|
| 27 |
-
margin-bottom: 0;
|
| 28 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|