Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

Michael Rabinovich Cursor committed on Jun 5

Commit

ba3eefb

1 Parent(s): 2893b22

leaderboard: backfill tool to grid-ify already-published reports

Rewrites the summary view of published reports/<id>.html in the submissions
dataset from the old flat table to the thumbnail grid, in place and without
re-evaluating. Parses each report's existing rows (sample/status/CAD) + the
render-bucket base already embedded in the file, classifies editing by the
starting-shape renders (so invalid edits group correctly), and rebuilds the
grid via single_run's shared builders (byte-identical to a fresh report),
pointing editing cards at edit_diff.png and generation cards at output iso.
Injects the shared grid CSS/JS; detail cards, header and download button are
left untouched. Supports --files and --dataset (--dry-run); idempotent.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (1) hide show

tools/backfill_report_grid.py +242 -0

tools/backfill_report_grid.py ADDED Viewed

	@@ -0,0 +1,242 @@

+#!/usr/bin/env python3
+"""Backfill the thumbnail-grid summary view into already-published reports.
+The report generator (``cadgenbench.eval.report.single_run``) now renders the
+summary view as a grouped thumbnail grid instead of a flat table. Reports
+produced before that change are static HTML files in the submissions dataset
+(``reports/<id>.html``); changing the generator does nothing to them. This
+one-time tool rewrites those published reports **in place, without re-evaluating
+or regenerating from run dirs**:
+- it reads each report's existing summary table (sample number, status, CAD
+  score) and detail cards (which fixtures are editing) plus the render-bucket
+  base URL already embedded in the file;
+- rebuilds the summary view as the grid using the *shared* builders from
+  ``single_run`` (so a backfilled report is byte-identical to a freshly
+  generated one), pointing editing cards at the ``edit_diff.png`` still and
+  generation cards at the output ``iso.png`` — all assets that already exist;
+- injects the shared grid CSS/JS; the detail cards, header, score text and
+  download button are left untouched.
+Run on local files (writes alongside, good for eyeballing)::
+    python tools/backfill_report_grid.py --files /tmp/report.html -o /tmp/out.html
+Rewrite every published report in the submissions dataset (needs a write token)::
+    HF_TOKEN=<write-token> python tools/backfill_report_grid.py --dataset
+    python tools/backfill_report_grid.py --dataset --dry-run    # list only
+"""
+from __future__ import annotations
+import argparse
+import os
+import re
+import sys
+from pathlib import Path
+from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download
+# cadgenbench (for the shared grid builders) must be importable.
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+_SRC = _REPO_ROOT / "cadgenbench" / "src"
+if _SRC.is_dir():
+    sys.path.insert(0, str(_SRC))
+from cadgenbench.eval.report.single_run import (  # noqa: E402
+    _GRID_CSS,
+    _GRID_JS,
+    _render_grid_controls,
+    grid_card_html,
+    render_grid_groups,
+)
# --- configuration ----------------------------------------------------------
# Hub organisation and submissions dataset repo; both can be overridden through
# the HF_ORG / HF_SUBMISSIONS_REPO environment variables.
HF_ORG = os.environ.get("HF_ORG", "HuggingAI4Engineering")
SUBMISSIONS_REPO = os.environ.get(
    "HF_SUBMISSIONS_REPO",
    f"{HF_ORG}/cadgenbench-submissions",
)
# URL prefix the input thumbnails are built from.
INPUT_PROXY_BASE = "/task-input"
# File name of the still image used as the output thumbnail of editing cards.
EDIT_DIFF_STILL = "edit_diff.png"

# --- parsing the old flat-table report -------------------------------------
# Group 1: an image URL truncated right after the path segment that follows
# ``/resolve/renders/`` (i.e. with the last two path segments removed).
_RENDER_BASE_RE = re.compile(
    r'(https?://[^\s"\']+?/resolve/renders/[^/"\']+)'
    r'/[^/"\']+/[^"\']+\.(?:png|webp)'
)
# One summary-table row. Group 1: the index passed to ``showDetail``;
# group 2: the row's inner HTML (its cells).
_ROW_RE = re.compile(
    r'<tr class="q-[a-z]+" onclick="showDetail\((\d+)\)"[^>]*>(.*?)</tr>',
    re.DOTALL,
)
# First attribute-less ``<td>`` of a row: the sample name.
_NAME_RE = re.compile(r"<td>([^<]+)</td>")
# Text of the status pill.
_STATUS_RE = re.compile(r'status-pill status-\w+">([^<]+)<')
# ``data-v`` value of the cell whose content starts with ``<b>``: the CAD score.
_CAD_RE = re.compile(r'<td data-v="([^"]+)"><b>')
# Summary-view container, matched non-greedily up to the first ``</div>``.
_SUMMARY_VIEW_RE = re.compile(
    r'(<div id="summary-view">).*?(</div>)',
    re.DOTALL,
)

# Help paragraph placed at the top of the rebuilt summary view.
_GRID_HELP = "".join(
    (
        '<p class="grid-help">Click a card to view details. ',
        '<span class="kbd">j</span>/<span class="kbd">k</span> to navigate, ',
        '<span class="kbd">Esc</span> to return. Each card shows the input and the ',
        "candidate output. Score tint: ",
        "<span class='gtint q-high'>&ge;0.90</span> ",
        "<span class='gtint q-mid'>&ge;0.60</span> ",
        "<span class='gtint q-low'>&lt;0.60</span> CAD score.</p>",
    )
)

# Matches a ``<INPUT_PROXY_BASE>/<...>/renders/`` URL fragment.
_INPUT_SHAPE_RE = re.compile(
    re.escape(INPUT_PROXY_BASE) + r"/[^\"']+/renders/"
)
def _editing_idxs(doc: str) -> set[int]:
    """Return the ``data-idx`` of every detail card that is an editing task.

    A card counts as editing when its markup references the starting shape's
    renders (``/task-input/<fixture>/renders/...``). The edit-diff turntable /
    ``(edit diff)`` heading is deliberately *not* used as the signal: the old
    generator laid out an *invalid* editing candidate like a generation sample
    (no diff), so those markers would miss invalid edits, while the
    starting-shape renders are present for every editing sample. This mirrors
    the new generator's ``wants_shape`` grouping, so backfilled and freshly
    generated reports classify samples the same way.

    Only cards whose opening tag has ``data-idx="<digits>"`` directly after
    ``class="fixture-card"`` are considered.
    """
    idx_attr = re.compile(r'\s*data-idx="(\d+)"')
    # Everything before the first card opener is page preamble, hence ``[1:]``.
    # Each remaining chunk runs from one card opener up to the next one (or to
    # the end of the document for the last card).
    card_chunks = doc.split('<div class="fixture-card"')[1:]

    found: set[int] = set()
    for chunk in card_chunks:
        hit = idx_attr.match(chunk)
        if not hit:
            continue
        if not _INPUT_SHAPE_RE.search(chunk):
            continue
        found.add(int(hit.group(1)))
    return found
def _parse_cad_score(raw: str) -> float | None:
    """Convert a ``data-v`` attribute value into a CAD score.

    Returns ``None`` when ``raw`` is not a number, is negative, or is NaN.
    """
    try:
        value = float(raw)
    except ValueError:
        return None
    # ``nan >= 0`` is False, so NaN is rejected together with negative values
    # (negative presumably being the "no score" sentinel - TODO confirm).
    return value if value >= 0 else None


def rewrite_report_html(doc: str) -> str | None:
    """Return the report rewritten with the grid summary view, or ``None``.

    ``None`` means "leave the file unchanged". It is returned when:

    - the document already carries the grid markers (``class="ggrid"`` or
      ``id="groups"``), i.e. it was already backfilled;
    - no render URL matching ``_RENDER_BASE_RE`` is found, so there is nothing
      to point the output thumbnails at (e.g. a base64-inlined local report);
    - no summary-table row with a sample name could be parsed;
    - the ``<div id="summary-view">`` container is missing.

    Otherwise the summary view's content is replaced by the help paragraph,
    the grid controls and the grouped cards, and the shared grid CSS/JS are
    injected. Everything outside the summary view is left as it was.
    """
    if 'class="ggrid"' in doc or 'id="groups"' in doc:
        return None  # already backfilled

    base_m = _RENDER_BASE_RE.search(doc)
    if base_m is None:
        return None  # not a hosted report (e.g. base64-inlined local report)
    render_base = base_m.group(1)

    edit_idxs = _editing_idxs(doc)
    gen_cards: list[str] = []
    edit_cards: list[str] = []
    for row in _ROW_RE.finditer(doc):
        idx = int(row.group(1))
        cells = row.group(2)

        name_m = _NAME_RE.search(cells)
        if name_m is None:
            continue  # a row without a sample name cannot be turned into a card
        name = name_m.group(1).strip()

        status_m = _STATUS_RE.search(cells)
        status = status_m.group(1).strip() if status_m else "?"

        cad_m = _CAD_RE.search(cells)
        cad = _parse_cad_score(cad_m.group(1)) if cad_m else None

        is_editing = idx in edit_idxs
        if is_editing:
            # Editing: starting-shape iso render in, edit-diff still out.
            in_src = f"{INPUT_PROXY_BASE}/{name}/renders/iso.png"
            out_src = f"{render_base}/{name}/{EDIT_DIFF_STILL}"
        else:
            # Generation: task input image in, output iso render out.
            in_src = f"{INPUT_PROXY_BASE}/{name}/input.png"
            out_src = f"{render_base}/{name}/iso.png"

        card = grid_card_html(
            idx=idx, name=name, is_editing=is_editing, status=status,
            cad=cad, in_src=in_src, out_src=out_src,
        )
        (edit_cards if is_editing else gen_cards).append(card)

    if not gen_cards and not edit_cards:
        return None

    new_inner = _GRID_HELP + _render_grid_controls() + render_grid_groups(
        gen_cards, edit_cards,
    )

    # A callable replacement keeps backslashes in ``new_inner`` literal, and
    # ``subn`` reports whether the container was found in the same single scan.
    # NOTE(review): the pattern stops at the FIRST ``</div>`` after the opener;
    # this is only correct if the old summary view holds no nested <div> -
    # confirm against a real pre-grid report.
    doc, replaced = _SUMMARY_VIEW_RE.subn(
        lambda mm: mm.group(1) + new_inner + "</div>", doc, count=1,
    )
    if not replaced:
        return None

    # Inject the shared grid styles + filtering behavior. ``str.replace`` is a
    # silent no-op when the anchor tag is missing, which would leave the grid
    # unstyled / inert, so fall back to standalone elements in that case.
    if "</style>" in doc:
        doc = doc.replace("</style>", _GRID_CSS + "</style>", 1)
    else:
        doc = doc.replace(
            '<div id="summary-view">',
            f'<style>{_GRID_CSS}</style><div id="summary-view">',
            1,
        )

    script = f"<script>{_GRID_JS}</script>"
    if "</body>" in doc:
        doc = doc.replace("</body>", script + "</body>", 1)
    else:
        doc += script
    return doc
def _run_files(files: list[Path], out: Path | None) -> int:
    """Rewrite local report files and return the process exit code (always 0).

    Each file is read, passed through ``rewrite_report_html`` and written to
    ``out`` when given, otherwise back over the input file. Files for which
    the rewrite returns ``None`` are reported as skipped and left untouched.

    Reports are read and written as UTF-8 explicitly rather than with the
    locale's default encoding, matching the dataset path, which uploads the
    rewritten document UTF-8 encoded.

    Args:
        files: Report HTML files to process.
        out: Optional destination; it is used for every file, so callers
            should only pass it together with a single input file.
    """
    for f in files:
        doc = f.read_text(encoding="utf-8")
        new = rewrite_report_html(doc)
        if new is None:
            print(f"  SKIP {f} (already grid / not a hosted report)")
            continue
        dest = out or f
        dest.write_text(new, encoding="utf-8")
        print(f"  wrote {dest} ({len(new) // 1024} KB)")
    return 0
def _run_dataset(api: HfApi, token: str | None, dry_run: bool, limit: int | None) -> int:
    """Rewrite every ``reports/*.html`` in the submissions dataset.

    Lists the dataset's files, downloads each report, runs it through
    ``rewrite_report_html`` and, unless ``dry_run`` is set, pushes all rewritten
    reports in one commit.

    Args:
        api: Hub client used for listing files and creating the commit.
        token: Token passed to the downloads; required for committing.
        dry_run: When true, only print what would be rewritten.
        limit: If not ``None``, process only the first ``limit`` reports
            (in sorted path order).

    Returns:
        ``0`` on success, after a dry run, or when nothing needs rewriting;
        ``2`` when there are rewrites to commit but no token was supplied.
    """
    files = sorted(
        f for f in api.list_repo_files(SUBMISSIONS_REPO, repo_type="dataset")
        if f.startswith("reports/") and f.endswith(".html")
    )
    if limit is not None:
        files = files[:limit]
    total = len(files)
    print(f"Found {total} report(s) in {SUBMISSIONS_REPO}.")

    ops: list[CommitOperationAdd] = []
    # Counted separately from ``ops`` because a dry run never fills ``ops``.
    rewritable = 0
    for i, rel in enumerate(files, start=1):
        local = hf_hub_download(
            repo_id=SUBMISSIONS_REPO, filename=rel, repo_type="dataset", token=token,
        )
        # Decode explicitly as UTF-8: the rewritten document is uploaded UTF-8
        # encoded below, so reading with a locale default could corrupt text.
        new = rewrite_report_html(Path(local).read_text(encoding="utf-8"))
        if new is None:
            print(f"  [{i}/{total}] SKIP {rel} (already grid / not hosted)")
            continue
        rewritable += 1
        print(f"  [{i}/{total}] {rel} -> grid ({len(new) // 1024} KB)")
        if not dry_run:
            ops.append(
                CommitOperationAdd(path_in_repo=rel, path_or_fileobj=new.encode("utf-8"))
            )

    if dry_run:
        # Report only the reports that would actually change, not every file seen.
        print(f"Dry run: would rewrite {rewritable} candidate(s).")
        return 0
    if not ops:
        print("Nothing to rewrite.")
        return 0
    if not token:
        print("HF_TOKEN required to commit.", file=sys.stderr)
        return 2
    api.create_commit(
        repo_id=SUBMISSIONS_REPO, repo_type="dataset", operations=ops,
        commit_message="reports: backfill thumbnail-grid summary view",
    )
    print(f"Committed {len(ops)} rewritten report(s) to {SUBMISSIONS_REPO}.")
    return 0
def main() -> int:
    """Parse the command line and run the backfill; return the exit code.

    Exactly one of ``--files`` (local HTML files) or ``--dataset`` (every
    ``reports/*.html`` in the submissions dataset) must be given. ``-o`` is
    only accepted together with a single ``--files`` argument. In dataset mode
    the token is read from the ``HF_TOKEN`` environment variable.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains a bullet list and usage examples; the
        # default formatter would re-wrap them into one unreadable paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    src = parser.add_mutually_exclusive_group(required=True)
    src.add_argument("--files", nargs="+", type=Path, help="Local report HTML files.")
    src.add_argument(
        "--dataset", action="store_true",
        help="Rewrite every reports/*.html in the submissions dataset.",
    )
    parser.add_argument("-o", "--output", type=Path, help="Output path (single --files).")
    parser.add_argument("--dry-run", action="store_true", help="List only (dataset mode).")
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Process at most this many reports (dataset mode).",
    )
    args = parser.parse_args()

    if args.files:
        if args.output and len(args.files) != 1:
            parser.error("-o/--output only valid with a single --files argument.")
        return _run_files(args.files, args.output)

    # A negative limit would slice ``files[:-n]`` and silently drop reports
    # from the end instead of capping the count.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer.")
    token = os.environ.get("HF_TOKEN")
    return _run_dataset(HfApi(token=token), token, args.dry_run, args.limit)


if __name__ == "__main__":
    raise SystemExit(main())