#!/usr/bin/env python3 """Backfill the GT "answer key" edit diff into existing reports' GT column. The report generator now renders, for **editing** fixtures, the Ground Truth column as the GT answer-key edit-diff turntable (blue = the correct change vs the input) instead of the four static orthographic views -- see ``cadgenbench`` ``eval/report/single_run.py`` (``_render_gt_edit_diff``). The webp is a property of the data revision and already lives in the GT dataset at ``/renders/edit_diff_gt.webp`` (rendered once by ``tools/generate_gt_edit_diff.py``), served through the existing ``/gt`` proxy. Reports published before that change still show the four GT views. Rather than re-running ``evaluate`` (the metrics are unchanged), this one-off tool patches the already-stored ``reports/.html`` files in place: for every editing fixture card (identified by the following "Output vs ground truth (edit diff)" column) it swaps the GT column's heading + body for the answer-key turntable, then re-uploads. Idempotent: a patched card's heading is "Ground truth (correct change)", so the ``

Ground Truth

`` anchor is gone and re-running is a no-op. Generation fixtures (no edit-diff column following) are left untouched. Usage (dry-run lists what would change; nothing is written):: python tools/backfill_report_gt_edit_diff.py # actually patch + re-upload (needs a write-scoped HF_TOKEN): python tools/backfill_report_gt_edit_diff.py --apply """ from __future__ import annotations import argparse import logging import re import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download # noqa: E402 from leaderboard import ( # noqa: E402 HF_SUBMISSIONS_REPO, _load_rows_from_hub, _report_relative_url, ) from submit import REPORTS_DIR # noqa: E402 logger = logging.getLogger(__name__) # Blue answer-key legend, byte-for-byte what _legend_html(_GT_EDIT_DIFF_LEGEND) # emits in single_run.py (verified against the live generator in main() when # importable) so a patched report is indistinguishable from a fresh one. _GT_LEGEND_HTML = ( '' '' "correct change (ground truth)" "" ) # An editing fixture's GT column: the "
" wrapping the # "Ground Truth" heading and its four view tiles, up to the column-closing # "
" that sits immediately before the "Output vs ground truth (edit diff)" # column. The lookahead is what restricts the swap to editing fixtures (a # generation fixture's GT column is followed by "Output (aligned)" instead). # # The body is a *tempered* match: it may contain the nested ``
`` # / ``
`` tiles but must NOT cross another ``

`` or # ``
`` -- otherwise the lazy ``.*?`` would, on a *generation* # fixture, run forward across many cards until it found the next editing # fixture's edit-diff column and swallow every GT column in between. _GT_COL_RE = re.compile( r'
\s*

Ground Truth

' r'(?P(?:(?!

|
).)*?)
' r'(?=\s*
\s*

Output vs ground truth \(edit diff\))', re.DOTALL, ) _FIXTURE_RE = re.compile(r"/gt/([^/\"]+)/renders/") def _new_gt_col(fixture: str) -> str: """The replacement GT column, matching single_run._render_gt_edit_diff.""" tile = ( '
' '
' f'' "correct change
" "
" ) return ( '
\n' f"

Ground truth (correct change) {_GT_LEGEND_HTML}

\n" f"{tile}\n" "
def patch_html(doc: str) -> tuple[str | None, int]:
    """Swap the GT column of every editing fixture in one report document.

    Each ``_GT_COL_RE`` match is replaced by ``_new_gt_col(<fixture id>)``,
    where the fixture id is taken from the first ``/gt/<id>/renders/`` URL
    found in the matched column body.  A match without such a URL is kept
    verbatim and is not counted.

    Args:
        doc: Full HTML text of a stored report.

    Returns:
        ``(patched_html, n_columns_swapped)``, or ``(None, 0)`` when no column
        was swapped or the result is identical to *doc* -- the signal for the
        caller to skip the re-upload.
    """
    swapped_fixtures: list[str] = []

    def _swap_column(match: re.Match) -> str:
        fixture_ref = _FIXTURE_RE.search(match.group("body"))
        if fixture_ref is None:
            # No GT render URL to read the fixture id from: keep the column.
            return match.group(0)
        fixture_id = fixture_ref.group(1)
        swapped_fixtures.append(fixture_id)
        return _new_gt_col(fixture_id)

    rewritten = _GT_COL_RE.sub(_swap_column, doc)
    if not swapped_fixtures or rewritten == doc:
        return None, 0
    return rewritten, len(swapped_fixtures)


def _self_check() -> None:
    """Compare this tool's hard-coded markup with the live report generator.

    The check is skipped (with an info log line) when ``cadgenbench`` cannot
    be imported.  Otherwise the legend string and the answer-key tile markup
    are compared against what ``single_run`` produces.

    Raises:
        SystemExit: if the live legend differs from ``_GT_LEGEND_HTML``, or the
            live tile is not a substring of the column built by
            ``_new_gt_col``.
    """
    try:
        from cadgenbench.eval.report import single_run as sr  # noqa: PLC0415
    except Exception as import_err:  # noqa: BLE001 - generator not importable here
        logger.info(
            " (skipped self-check: cadgenbench not importable: %s)", import_err
        )
        return

    # Legend first: a mismatch here aborts before the tile is rendered.
    legend_now = sr._legend_html(sr._GT_EDIT_DIFF_LEGEND)
    if legend_now != _GT_LEGEND_HTML:
        legend_msg = (
            "GT legend drift: backfill string no longer matches single_run.\n"
            f" live: {legend_now}\n here: {_GT_LEGEND_HTML}"
        )
        raise SystemExit(legend_msg)

    # Then the tile, rendered for a sample fixture id; it only has to appear
    # somewhere inside the full replacement column.
    tile_now = sr._render_gt_edit_diff(None, base_url="/gt/999")
    column_here = _new_gt_col("999")
    if tile_now not in column_here:
        tile_msg = (
            "GT tile drift: backfill markup no longer matches single_run.\n"
            f" live: {tile_now}\n here: {column_here}"
        )
        raise SystemExit(tile_msg)

    logger.info(" self-check OK (markup matches live generator)")


def main() -> int:
    """Patch stored reports and, with ``--apply``, upload them in one commit.

    Runs the markup self-check, collects the submissions that have a report
    URL, downloads each report, patches it with ``patch_html`` and queues the
    changed ones.  Without ``--apply`` nothing is uploaded.

    Returns:
        ``0`` on every non-raising path; used as the process exit status.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--apply",
        action="store_true",
        help="Re-upload patched reports (default is a dry run).",
    )
    options = cli.parse_args()

    _self_check()

    # Keep only rows that carry a submission id and resolve to a report URL.
    report_ids = []
    for row in _load_rows_from_hub():
        submission_id = row.get("submission_id")
        if not submission_id:
            continue
        has_report = _report_relative_url(
            submission_id,
            row.get("status"),
            row.get("submission_sha256"),
        )
        if has_report:
            report_ids.append(submission_id)
    logger.info("Found %d report(s) to consider.", len(report_ids))

    uploads: list[CommitOperationAdd] = []
    untouched = 0
    for sid in report_ids:
        repo_path = f"{REPORTS_DIR}/{sid}.html"
        try:
            cached_file = hf_hub_download(
                repo_id=HF_SUBMISSIONS_REPO,
                repo_type="dataset",
                filename=repo_path,
            )
        except Exception as fetch_err:  # noqa: BLE001
            logger.warning(" skip %s: could not fetch report (%s)", sid, fetch_err)
            untouched += 1
            continue

        html_text = Path(cached_file).read_text(encoding="utf-8")
        patched, swapped = patch_html(html_text)
        if patched is None:
            logger.info(" unchanged %s", sid)
            untouched += 1
            continue

        logger.info(" patched %s (%d editing GT column(s))", sid, swapped)
        uploads.append(
            CommitOperationAdd(
                path_in_repo=repo_path,
                path_or_fileobj=patched.encode("utf-8"),
            )
        )

    logger.info("%d to patch, %d unchanged/skipped.", len(uploads), untouched)

    if not uploads:
        logger.info("Nothing to do.")
        return 0
    if not options.apply:
        logger.info("Dry run -- re-run with --apply to upload.")
        return 0

    # All patched reports go up together in a single commit.
    HfApi().create_commit(
        repo_id=HF_SUBMISSIONS_REPO,
        repo_type="dataset",
        operations=uploads,
        commit_message="reports: show GT answer-key edit diff for editing fixtures",
    )
    logger.info("Uploaded %d patched report(s).", len(uploads))
    return 0


if __name__ == "__main__":
    sys.exit(main())