CADGenBench / tools /backfill_report_gt_edit_diff.py
Michael Rabinovich
Bump CADGENBENCH_SHA to 037eade; add GT edit-diff report backfill tool
b885669
#!/usr/bin/env python3
"""Backfill the GT "answer key" edit diff into existing reports' GT column.

The report generator now renders, for **editing** fixtures, the Ground Truth
column as the GT answer-key edit-diff turntable (blue = the correct change vs
the input) instead of the four static orthographic views -- see ``cadgenbench``
``eval/report/single_run.py`` (``_render_gt_edit_diff``). The webp is a property
of the data revision and already lives in the GT dataset at
``<fixture>/renders/edit_diff_gt.webp`` (rendered once by
``tools/generate_gt_edit_diff.py``), served through the existing ``/gt`` proxy.

Reports published before that change still show the four GT views. Rather than
re-running ``evaluate`` (the metrics are unchanged), this one-off tool patches
the already-stored ``reports/<id>.html`` files in place: for every editing
fixture card (identified by the following "Output vs ground truth (edit diff)"
column) it swaps the GT column's heading + body for the answer-key turntable,
then re-uploads.

Idempotent: a patched card's heading is "Ground truth (correct change)", so the
``<h3>Ground Truth</h3>`` anchor is gone and re-running is a no-op. Generation
fixtures (no edit-diff column following) are left untouched.

Usage (dry-run lists what would change; nothing is written)::

    python tools/backfill_report_gt_edit_diff.py

    # actually patch + re-upload (needs a write-scoped HF_TOKEN):
    python tools/backfill_report_gt_edit_diff.py --apply
"""
from __future__ import annotations
import argparse
import logging
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download # noqa: E402
from leaderboard import ( # noqa: E402
HF_SUBMISSIONS_REPO,
_load_rows_from_hub,
_report_relative_url,
)
from submit import REPORTS_DIR # noqa: E402
logger = logging.getLogger(__name__)
# HTML for the blue answer-key legend chip. It must stay byte-for-byte equal to
# what _legend_html(_GT_EDIT_DIFF_LEGEND) emits in single_run.py so a patched
# report is indistinguishable from a freshly generated one; the self-check run
# from main() compares it against the live generator whenever that is
# importable.
_GT_LEGEND_HTML = "".join(
    (
        '<span class="legend">',
        '<span class="legend-chip" style="background:#2173f5"></span>',
        "correct change (ground truth)",
        "</span>",
    )
)
# Matches the GT column of an *editing* fixture card, in three pieces:
#
#   1. the opening ``<div class="col">`` plus the "Ground Truth" heading;
#   2. the column body (named group ``body``) up to the column-closing
#      ``</div>``;
#   3. a lookahead requiring that the very next column is the
#      "Output vs ground truth (edit diff)" one.
#
# Piece 3 is what limits the swap to editing fixtures: a generation fixture's
# GT column is followed by "Output (aligned)" instead, so it never matches.
#
# The body is a *tempered* match. It is allowed to contain the nested
# ``<div class="images">`` / ``<div class="view">`` tiles, but it may not step
# over another ``<h3>`` or ``<div class="col">``. Without that guard the lazy
# ``.*?`` would, starting from a *generation* fixture, keep running forward
# across many cards until it reached the next editing fixture's edit-diff
# column, swallowing every GT column in between.
_GT_COL_RE = re.compile(
    "".join(
        (
            r'<div class="col">\s*<h3>Ground Truth</h3>',
            r'(?P<body>(?:(?!<h3>|<div class="col">).)*?)</div>',
            r'(?=\s*<div class="col">\s*<h3>Output vs ground truth \(edit diff\))',
        )
    ),
    flags=re.DOTALL,
)

# Captures the fixture id from a ``/gt/<fixture>/renders/`` URL found in the
# matched column body.
_FIXTURE_RE = re.compile(r"/gt/([^/\"]+)/renders/")
def _new_gt_col(fixture: str) -> str:
    """Build the replacement GT column markup for one editing fixture.

    The column consists of a "Ground truth (correct change)" heading carrying
    the legend chip, followed by a single image tile whose source is
    ``/gt/<fixture>/renders/edit_diff_gt.webp``. The markup is intended to
    match what ``single_run._render_gt_edit_diff`` produces.

    Args:
        fixture: Fixture id, interpolated verbatim into the image URL.

    Returns:
        The complete ``<div class="col">...</div>`` HTML fragment.
    """
    image_tag = (
        f'<img src="/gt/{fixture}/renders/edit_diff_gt.webp" '
        'alt="ground truth (edit diff)" class="zoomable" loading="lazy">'
    )
    view_tile = "".join(
        (
            '<div class="images">',
            '<div class="view"><span class="imgwrap">',
            image_tag,
            "</span><span>correct change</span></div>",
            "</div>",
        )
    )
    heading = f"<h3>Ground truth (correct change) {_GT_LEGEND_HTML}</h3>"
    # One element per output line: opening tag, heading, tile, closing tag.
    return "\n".join(('<div class="col">', heading, view_tile, "</div>"))
def patch_html(doc: str) -> tuple[str | None, int]:
    """Swap every editing fixture's GT column for the answer-key turntable.

    Args:
        doc: Full HTML text of one stored report.

    Returns:
        ``(patched_html, n_columns_swapped)`` when at least one column was
        replaced, otherwise ``(None, 0)`` -- the report was already patched or
        has no editing fixtures -- so the caller can skip the re-upload.
    """
    swapped = 0

    def _swap(match: re.Match) -> str:
        """Return the new column for ``match``, or its text unchanged."""
        nonlocal swapped
        fixture_hit = _FIXTURE_RE.search(match.group("body"))
        if fixture_hit is None:
            # No GT render URL in the body to read the fixture id from, so
            # this column is left exactly as it was.
            return match.group(0)
        swapped += 1
        return _new_gt_col(fixture_hit.group(1))

    patched = _GT_COL_RE.sub(_swap, doc)
    if swapped and patched != doc:
        return patched, swapped
    return None, 0
def _self_check() -> None:
    """Abort if the backfill markup no longer matches the live generator.

    Compares the hard-coded legend and the column built by ``_new_gt_col``
    against ``cadgenbench``'s ``single_run`` module. When that module cannot be
    imported the check is skipped with an informational log line.

    Raises:
        SystemExit: If the legend differs from the generator's legend, or if
            the generator's tile markup is not contained in the backfill
            column.
    """
    try:
        from cadgenbench.eval.report import single_run  # noqa: PLC0415
    except Exception as exc:  # noqa: BLE001 - generator not importable here
        logger.info(" (skipped self-check: cadgenbench not importable: %s)", exc)
        return

    generator_legend = single_run._legend_html(single_run._GT_EDIT_DIFF_LEGEND)
    if generator_legend != _GT_LEGEND_HTML:
        legend_report = (
            "GT legend drift: backfill string no longer matches single_run.\n"
            f" live: {generator_legend}\n here: {_GT_LEGEND_HTML}"
        )
        raise SystemExit(legend_report)

    # The tile markup has to agree as well; render one for a sample fixture id
    # on both sides and require the generator's output to appear in ours.
    # NOTE(review): a substring test passes vacuously if the generator returns
    # an empty string for these arguments -- confirm against single_run.
    generator_tile = single_run._render_gt_edit_diff(None, base_url="/gt/999")
    backfill_col = _new_gt_col("999")
    if generator_tile not in backfill_col:
        tile_report = (
            "GT tile drift: backfill markup no longer matches single_run.\n"
            f" live: {generator_tile}\n here: {backfill_col}"
        )
        raise SystemExit(tile_report)

    logger.info(" self-check OK (markup matches live generator)")
def main() -> int:
    """Patch stored editing-fixture reports and optionally re-upload them.

    Runs the markup self-check, loads the leaderboard rows from the hub, and
    keeps the rows that have a ``submission_id`` and for which
    ``_report_relative_url`` returns something truthy. Each such report is
    downloaded from ``HF_SUBMISSIONS_REPO`` and passed through
    :func:`patch_html`; reports that changed are queued as commit operations.
    The queue is pushed in a single commit only when ``--apply`` is given;
    without it the run is a dry run that just logs what would change.

    Returns:
        ``0`` on normal completion, including the dry-run and nothing-to-do
        cases.

    Raises:
        SystemExit: From the self-check when the backfill markup has drifted
            from the live generator, or from argparse (bad arguments,
            ``--help``).
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring has several paragraphs and a literal usage
        # block; the default formatter would reflow all of it into a single
        # paragraph in ``--help``, so print it as written instead.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--apply", action="store_true",
        help="Re-upload patched reports (default is a dry run).",
    )
    args = parser.parse_args()

    _self_check()

    rows = _load_rows_from_hub()
    # Only rows that have an id and resolve to a report URL are candidates.
    targets = [
        r for r in rows
        if r.get("submission_id")
        and _report_relative_url(
            r.get("submission_id"), r.get("status"), r.get("submission_sha256"),
        )
    ]
    logger.info("Found %d report(s) to consider.", len(targets))

    ops: list[CommitOperationAdd] = []
    skipped = 0  # counts both fetch failures and reports needing no change
    for row in targets:
        sid = row["submission_id"]
        try:
            local = hf_hub_download(
                repo_id=HF_SUBMISSIONS_REPO,
                repo_type="dataset",
                filename=f"{REPORTS_DIR}/{sid}.html",
            )
        except Exception as e:  # noqa: BLE001
            # Best effort: one unreachable report must not abort the backfill.
            logger.warning(" skip %s: could not fetch report (%s)", sid, e)
            skipped += 1
            continue

        doc = Path(local).read_text(encoding="utf-8")
        patched, n = patch_html(doc)
        if patched is None:
            logger.info(" unchanged %s", sid)
            skipped += 1
            continue

        logger.info(" patched %s (%d editing GT column(s))", sid, n)
        ops.append(
            CommitOperationAdd(
                path_in_repo=f"{REPORTS_DIR}/{sid}.html",
                path_or_fileobj=patched.encode("utf-8"),
            )
        )

    logger.info("%d to patch, %d unchanged/skipped.", len(ops), skipped)
    if not ops:
        logger.info("Nothing to do.")
        return 0
    if not args.apply:
        logger.info("Dry run -- re-run with --apply to upload.")
        return 0

    # All patched reports go up in one commit.
    HfApi().create_commit(
        repo_id=HF_SUBMISSIONS_REPO,
        repo_type="dataset",
        operations=ops,
        commit_message="reports: show GT answer-key edit diff for editing fixtures",
    )
    logger.info("Uploaded %d patched report(s).", len(ops))
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())