Michael Rabinovich Cursor committed on
Commit
ba3eefb
·
1 Parent(s): 2893b22

leaderboard: backfill tool to grid-ify already-published reports

Browse files

Rewrites the summary view of published reports/<id>.html in the submissions
dataset from the old flat table to the thumbnail grid, in place and without
re-evaluating. Parses each report's existing rows (sample/status/CAD) + the
render-bucket base already embedded in the file, classifies editing by the
starting-shape renders (so invalid edits group correctly), and rebuilds the
grid via single_run's shared builders (byte-identical to a fresh report),
pointing editing cards at edit_diff.png and generation cards at output iso.
Injects the shared grid CSS/JS; detail cards, header and download button are
left untouched. Supports --files and --dataset (--dry-run); idempotent.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (1) hide show
  1. tools/backfill_report_grid.py +242 -0
tools/backfill_report_grid.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Backfill the thumbnail-grid summary view into already-published reports.
3
+
4
+ The report generator (``cadgenbench.eval.report.single_run``) now renders the
5
+ summary view as a grouped thumbnail grid instead of a flat table. Reports
6
+ produced before that change are static HTML files in the submissions dataset
7
+ (``reports/<id>.html``); changing the generator does nothing to them. This
8
+ one-time tool rewrites those published reports **in place, without re-evaluating
9
+ or regenerating from run dirs**:
10
+
11
+ - it reads each report's existing summary table (sample number, status, CAD
12
+ score) and detail cards (which fixtures are editing) plus the render-bucket
13
+ base URL already embedded in the file;
14
+ - rebuilds the summary view as the grid using the *shared* builders from
15
+ ``single_run`` (so a backfilled report is byte-identical to a freshly
16
+ generated one), pointing editing cards at the ``edit_diff.png`` still and
17
+ generation cards at the output ``iso.png`` — all assets that already exist;
18
+ - injects the shared grid CSS/JS; the detail cards, header, score text and
19
+ download button are left untouched.
20
+
21
+ Run on local files (writes alongside, good for eyeballing)::
22
+
23
+ python tools/backfill_report_grid.py --files /tmp/report.html -o /tmp/out.html
24
+
25
+ Rewrite every published report in the submissions dataset (needs a write token)::
26
+
27
+ HF_TOKEN=<write-token> python tools/backfill_report_grid.py --dataset
28
+ python tools/backfill_report_grid.py --dataset --dry-run # list only
29
+ """
30
+ from __future__ import annotations
31
+
32
+ import argparse
33
+ import os
34
+ import re
35
+ import sys
36
+ from pathlib import Path
37
+
38
+ from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download
39
+
40
+ # cadgenbench (for the shared grid builders) must be importable.
41
+ _REPO_ROOT = Path(__file__).resolve().parents[2]
42
+ _SRC = _REPO_ROOT / "cadgenbench" / "src"
43
+ if _SRC.is_dir():
44
+ sys.path.insert(0, str(_SRC))
45
+
46
+ from cadgenbench.eval.report.single_run import ( # noqa: E402
47
+ _GRID_CSS,
48
+ _GRID_JS,
49
+ _render_grid_controls,
50
+ grid_card_html,
51
+ render_grid_groups,
52
+ )
53
+
54
+ HF_ORG = os.getenv("HF_ORG", "HuggingAI4Engineering")
55
+ SUBMISSIONS_REPO = os.getenv("HF_SUBMISSIONS_REPO", f"{HF_ORG}/cadgenbench-submissions")
56
+ INPUT_PROXY_BASE = "/task-input"
57
+ EDIT_DIFF_STILL = "edit_diff.png"
58
+
59
+ # --- parsing the old flat-table report -------------------------------------
60
+ _RENDER_BASE_RE = re.compile(
61
+ r'(https?://[^\s"\']+?/resolve/renders/[^/"\']+)/[^/"\']+/[^"\']+\.(?:png|webp)'
62
+ )
63
+ _ROW_RE = re.compile(
64
+ r'<tr class="q-[a-z]+" onclick="showDetail\((\d+)\)"[^>]*>(.*?)</tr>', re.S
65
+ )
66
+ _NAME_RE = re.compile(r"<td>([^<]+)</td>")
67
+ _STATUS_RE = re.compile(r'status-pill status-\w+">([^<]+)<')
68
+ _CAD_RE = re.compile(r'<td data-v="([^"]+)"><b>')
69
+ _SUMMARY_VIEW_RE = re.compile(r'(<div id="summary-view">).*?(</div>)', re.S)
70
+ _GRID_HELP = (
71
+ '<p class="grid-help">Click a card to view details. '
72
+ '<span class="kbd">j</span>/<span class="kbd">k</span> to navigate, '
73
+ '<span class="kbd">Esc</span> to return. Each card shows the input and the '
74
+ "candidate output. Score tint: "
75
+ "<span class='gtint q-high'>&ge;0.90</span> "
76
+ "<span class='gtint q-mid'>&ge;0.60</span> "
77
+ "<span class='gtint q-low'>&lt;0.60</span> CAD score.</p>"
78
+ )
79
+
80
+
81
+ _INPUT_SHAPE_RE = re.compile(re.escape(INPUT_PROXY_BASE) + r"/[^\"']+/renders/")
82
+
83
+
84
def _editing_idxs(doc: str) -> set[int]:
    """Collect the ``data-idx`` of every detail card that is an editing task.

    A card counts as editing when its markup references the starting shape's
    renders (``/task-input/<fixture>/renders/...``). The edit-diff turntable and
    the ``(edit diff)`` heading are deliberately not used as the signal: the
    old generator laid out an *invalid* editing candidate like a generation
    sample (no diff), so those markers would miss invalid edits, while the
    starting-shape renders are present for every editing sample. This mirrors
    the new generator's ``wants_shape`` grouping so backfilled and freshly
    generated reports classify samples the same way.
    """
    # Everything before the first card opener is page preamble, not a card.
    _preamble, *card_chunks = doc.split('<div class="fixture-card"')
    # Pair each chunk with the ``data-idx`` attribute that must directly follow
    # the opener; chunks without one yield ``None`` and are filtered out below.
    tagged = (
        (re.match(r'\s*data-idx="(\d+)"', chunk), chunk) for chunk in card_chunks
    )
    return {
        int(idx_match.group(1))
        for idx_match, chunk in tagged
        if idx_match and _INPUT_SHAPE_RE.search(chunk)
    }
103
+
104
+
105
def _row_cad_score(cells: str) -> float | None:
    """Return the row's CAD score, or ``None`` if missing, unparseable or negative."""
    found = _CAD_RE.search(cells)
    if found is None:
        return None
    try:
        score = float(found.group(1))
    except ValueError:
        return None
    return score if score >= 0 else None


def rewrite_report_html(doc: str) -> str | None:
    """Swap a report's flat summary table for the thumbnail grid.

    Args:
        doc: Full HTML text of a published report.

    Returns:
        The rewritten HTML, or ``None`` when the report should be left alone:
        it already contains grid markup, it has no render-bucket URL to derive
        output thumbnails from, no summary rows could be parsed, or the
        ``summary-view`` container is missing.
    """
    already_grid = 'class="ggrid"' in doc or 'id="groups"' in doc
    if already_grid:
        return None  # already backfilled

    base_match = _RENDER_BASE_RE.search(doc)
    if base_match is None:
        return None  # not a hosted report (e.g. base64-inlined local report)
    render_base = base_match.group(1)
    editing = _editing_idxs(doc)

    generation_cards: list[str] = []
    editing_cards: list[str] = []
    for row in _ROW_RE.finditer(doc):
        cells = row.group(2)
        name_match = _NAME_RE.search(cells)
        if name_match is None:
            continue  # row without a plain-text name cell: nothing to build
        idx = int(row.group(1))
        name = name_match.group(1).strip()
        status_match = _STATUS_RE.search(cells)
        status = "?" if status_match is None else status_match.group(1).strip()

        is_editing = idx in editing
        if is_editing:
            # Editing: starting-shape render as input, edit-diff still as output.
            in_src = f"{INPUT_PROXY_BASE}/{name}/renders/iso.png"
            out_src = f"{render_base}/{name}/{EDIT_DIFF_STILL}"
            bucket = editing_cards
        else:
            # Generation: task input image as input, output iso render as output.
            in_src = f"{INPUT_PROXY_BASE}/{name}/input.png"
            out_src = f"{render_base}/{name}/iso.png"
            bucket = generation_cards
        bucket.append(
            grid_card_html(
                idx=idx,
                name=name,
                is_editing=is_editing,
                status=status,
                cad=_row_cad_score(cells),
                in_src=in_src,
                out_src=out_src,
            )
        )

    if not (generation_cards or editing_cards):
        return None

    grid_inner = _GRID_HELP + _render_grid_controls() + render_grid_groups(
        generation_cards, editing_cards,
    )
    # A callable replacement keeps backslashes in the generated markup from
    # being interpreted as regex escapes.
    doc, replaced = _SUMMARY_VIEW_RE.subn(
        lambda hit: hit.group(1) + grid_inner + "</div>", doc, count=1,
    )
    if not replaced:
        return None
    # Inject the shared grid styles + filtering behavior.
    doc = doc.replace("</style>", _GRID_CSS + "</style>", 1)
    doc = doc.replace("</body>", f"<script>{_GRID_JS}</script></body>", 1)
    return doc
166
+
167
+
168
def _run_files(files: list[Path], out: Path | None) -> int:
    """Rewrite local report files with the grid summary view.

    Each file is read, passed through ``rewrite_report_html`` and, when a
    rewrite is produced, written to ``out`` if given or back over the source
    file otherwise. Files for which no rewrite is produced are reported as
    skipped and left untouched.

    Args:
        files: Report HTML files to process.
        out: Optional destination path. Every rewritten file is written to
            this same path, so it is only meaningful with a single input
            (``main`` enforces that).

    Returns:
        Always ``0``.
    """
    for f in files:
        # Read and write as UTF-8 explicitly instead of relying on the locale
        # default, so local mode agrees with dataset mode (which uploads UTF-8
        # bytes). Assumes published reports are UTF-8 HTML — TODO confirm.
        doc = f.read_text(encoding="utf-8")
        new = rewrite_report_html(doc)
        if new is None:
            print(f"  SKIP {f} (already grid / not a hosted report)")
            continue
        dest = out or f
        dest.write_text(new, encoding="utf-8")
        print(f"  wrote {dest} ({len(new) // 1024} KB)")
    return 0
179
+
180
+
181
def _run_dataset(api: HfApi, token: str | None, dry_run: bool, limit: int | None) -> int:
    """Rewrite the ``reports/*.html`` files of the submissions dataset as grids.

    Lists the dataset's files, downloads each report, runs it through
    ``rewrite_report_html`` and, unless ``dry_run`` is set, pushes all
    rewritten reports in one commit.

    Args:
        api: Hub client used to list the repo files and create the commit.
        token: Token passed to the download call; a commit is refused
            without one.
        dry_run: When true, only print what would be rewritten; nothing is
            committed.
        limit: If not ``None``, only the first ``limit`` reports (sorted by
            path) are processed.

    Returns:
        Exit code: ``0`` on success, on a dry run, or when there is nothing to
        rewrite; ``2`` when there are rewrites to commit but no token.
    """
    files = sorted(
        f for f in api.list_repo_files(SUBMISSIONS_REPO, repo_type="dataset")
        if f.startswith("reports/") and f.endswith(".html")
    )
    if limit is not None:
        files = files[:limit]
    total = len(files)
    print(f"Found {total} report(s) in {SUBMISSIONS_REPO}.")

    ops: list[CommitOperationAdd] = []
    # Counted separately from ``ops`` because a dry run collects no operations
    # but must still report how many reports it would rewrite.
    rewritable = 0
    for i, rel in enumerate(files, start=1):
        local = hf_hub_download(
            repo_id=SUBMISSIONS_REPO, filename=rel, repo_type="dataset", token=token,
        )
        # Decode explicitly as UTF-8 to match the UTF-8 upload below, instead
        # of the locale default. Assumes reports are UTF-8 HTML — TODO confirm.
        new = rewrite_report_html(Path(local).read_text(encoding="utf-8"))
        if new is None:
            print(f"  [{i}/{total}] SKIP {rel} (already grid / not hosted)")
            continue
        rewritable += 1
        print(f"  [{i}/{total}] {rel} -> grid ({len(new) // 1024} KB)")
        if not dry_run:
            ops.append(
                CommitOperationAdd(path_in_repo=rel, path_or_fileobj=new.encode("utf-8"))
            )

    if dry_run:
        # Report only the reports that would actually change, not every file
        # listed (skipped ones are excluded).
        print(f"Dry run: would rewrite {rewritable} candidate(s).")
        return 0
    if not ops:
        print("Nothing to rewrite.")
        return 0
    if not token:
        print("HF_TOKEN required to commit.", file=sys.stderr)
        return 2
    api.create_commit(
        repo_id=SUBMISSIONS_REPO, repo_type="dataset", operations=ops,
        commit_message="reports: backfill thumbnail-grid summary view",
    )
    print(f"Committed {len(ops)} rewritten report(s) to {SUBMISSIONS_REPO}.")
    return 0
217
+
218
+
219
def main() -> int:
    """Parse the command line and run local-file or dataset mode.

    Exactly one of ``--files`` / ``--dataset`` must be given. ``-o/--output``
    is rejected when more than one file is passed. In dataset mode the token is
    read from the ``HF_TOKEN`` environment variable.

    Returns:
        The exit code of the selected mode.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument(
        "--files", nargs="+", type=Path, help="Local report HTML files.",
    )
    source.add_argument(
        "--dataset",
        action="store_true",
        help="Rewrite every reports/*.html in the submissions dataset.",
    )
    parser.add_argument(
        "-o", "--output", type=Path, help="Output path (single --files).",
    )
    parser.add_argument(
        "--dry-run", action="store_true", help="List only (dataset mode).",
    )
    parser.add_argument("--limit", type=int, default=None)
    opts = parser.parse_args()

    if not opts.files:
        # Dataset mode: the same token serves the client and the downloads.
        hf_token = os.environ.get("HF_TOKEN")
        return _run_dataset(HfApi(token=hf_token), hf_token, opts.dry_run, opts.limit)

    if opts.output and len(opts.files) != 1:
        parser.error("-o/--output only valid with a single --files argument.")
    return _run_files(opts.files, opts.output)


if __name__ == "__main__":
    raise SystemExit(main())