Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

CADGenBench / tools /backfill_report_gt_edit_diff.py

Michael Rabinovich

Bump CADGENBENCH_SHA to 037eade; add GT edit-diff report backfill tool

b885669 2 days ago

7.95 kB

	#!/usr/bin/env python3
	"""Backfill the GT "answer key" edit diff into existing reports' GT column.

	The report generator now renders, for editing fixtures, the Ground Truth
	column as the GT answer-key edit-diff turntable (blue = the correct change vs
	the input) instead of the four static orthographic views -- see ``cadgenbench``
	``eval/report/single_run.py`` (``_render_gt_edit_diff``). The webp is a property
	of the data revision and already lives in the GT dataset at
	``<fixture>/renders/edit_diff_gt.webp`` (rendered once by
	``tools/generate_gt_edit_diff.py``), served through the existing ``/gt`` proxy.

	Reports published before that change still show the four GT views. Rather than
	re-running ``evaluate`` (the metrics are unchanged), this one-off tool patches
	the already-stored ``reports/<id>.html`` files in place: for every editing
	fixture card (identified by the following "Output vs ground truth (edit diff)"
	column) it swaps the GT column's heading + body for the answer-key turntable,
	then re-uploads.

	Idempotent: a patched card's heading is "Ground truth (correct change)", so the
	``<h3>Ground Truth</h3>`` anchor is gone and re-running is a no-op. Generation
	fixtures (no edit-diff column following) are left untouched.

	Usage (dry-run lists what would change; nothing is written)::

	    python tools/backfill_report_gt_edit_diff.py

	    # actually patch + re-upload (needs a write-scoped HF_TOKEN):
	    python tools/backfill_report_gt_edit_diff.py --apply
	"""
	from __future__ import annotations

	import argparse
	import logging
	import re
	import sys
	from pathlib import Path

	sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

	from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download # noqa: E402

	from leaderboard import ( # noqa: E402
	HF_SUBMISSIONS_REPO,
	_load_rows_from_hub,
	_report_relative_url,
	)
	from submit import REPORTS_DIR # noqa: E402

	logger = logging.getLogger(__name__)

	# Blue answer-key legend, byte-for-byte what _legend_html(_GT_EDIT_DIFF_LEGEND)
	# emits in single_run.py (verified against the live generator in main() when
	# importable) so a patched report is indistinguishable from a fresh one.
	_GT_LEGEND_HTML = (
	'<span class="legend">'
	'<span class="legend-chip" style="background:#2173f5"></span>'
	"correct change (ground truth)"
	"</span>"
	)

	# An editing fixture's GT column: the "<div class='col'>" wrapping the
	# "Ground Truth" heading and its four view tiles, up to the column-closing
	# "</div>" that sits immediately before the "Output vs ground truth (edit diff)"
	# column. The lookahead is what restricts the swap to editing fixtures (a
	# generation fixture's GT column is followed by "Output (aligned)" instead).
	#
	# The body is a tempered match: it may contain the nested ``<div class="images">``
	# / ``<div class="view">`` tiles but must NOT cross another ``<h3>`` or
	# ``<div class="col">`` -- otherwise the lazy ``.?`` would, on a generation*
	# fixture, run forward across many cards until it found the next editing
	# fixture's edit-diff column and swallow every GT column in between.
	_GT_COL_RE = re.compile(
	r'<div class="col">\s*<h3>Ground Truth</h3>'
	r'(?P<body>(?:(?!<h3>\|<div class="col">).)*?)</div>'
	r'(?=\s<div class="col">\s<h3>Output vs ground truth \(edit diff\))',
	re.DOTALL,
	)
	_FIXTURE_RE = re.compile(r"/gt/([^/\"]+)/renders/")


	def _new_gt_col(fixture: str) -> str:
	    """Build the replacement GT column markup for one editing fixture.

	    The column holds a heading (with the blue answer-key legend) and a single
	    image tile pointing at ``/gt/<fixture>/renders/edit_diff_gt.webp``. The
	    markup is intended to mirror ``single_run._render_gt_edit_diff``.

	    Args:
	        fixture: Fixture id, interpolated verbatim into the image URL.

	    Returns:
	        The ``<div class="col">...</div>`` HTML fragment, newline-separated.
	    """
	    image = (
	        f'<img src="/gt/{fixture}/renders/edit_diff_gt.webp" '
	        'alt="ground truth (edit diff)" class="zoomable" loading="lazy">'
	    )
	    view = (
	        '<div class="view"><span class="imgwrap">'
	        f"{image}"
	        "</span><span>correct change</span></div>"
	    )
	    tile = f'<div class="images">{view}</div>'
	    heading = f"<h3>Ground truth (correct change) {_GT_LEGEND_HTML}</h3>"
	    return "\n".join(('<div class="col">', heading, tile, "</div>"))


	def patch_html(doc: str) -> tuple[str | None, int]:
	    """Return ``(patched_html_or_None, n_columns_swapped)``.

	    Swaps every editing fixture's GT column (as matched by ``_GT_COL_RE``) for
	    the answer-key turntable column built by ``_new_gt_col``.

	    Args:
	        doc: Full HTML text of a stored report.

	    Returns:
	        ``(new_html, count)`` when at least one column was swapped, otherwise
	        ``(None, 0)`` (already patched, or no editing fixtures) so the caller
	        can skip the re-upload.
	    """
	    count = 0

	    def repl(m: re.Match[str]) -> str:
	        nonlocal count
	        fx = _FIXTURE_RE.search(m.group("body"))
	        if not fx:
	            # No GT render URL to read the fixture id from; leave the
	            # matched column exactly as it was.
	            return m.group(0)
	        count += 1
	        return _new_gt_col(fx.group(1))

	    new = _GT_COL_RE.sub(repl, doc)
	    if count == 0 or new == doc:
	        return None, 0
	    return new, count


	def _self_check() -> None:
	    """Abort if the backfill markup no longer matches the live generator.

	    Imports ``cadgenbench.eval.report.single_run`` and compares its legend
	    and GT edit-diff tile output with the strings hard-coded in this tool.
	    If the import fails for any reason the check is skipped with a log line.

	    Raises:
	        SystemExit: When the live legend differs from ``_GT_LEGEND_HTML`` or
	            the live tile is not contained in ``_new_gt_col``'s output.
	    """
	    try:
	        from cadgenbench.eval.report import single_run as generator  # noqa: PLC0415
	    except Exception as exc:  # noqa: BLE001 - generator not importable here
	        logger.info(" (skipped self-check: cadgenbench not importable: %s)", exc)
	        return

	    generated_legend = generator._legend_html(generator._GT_EDIT_DIFF_LEGEND)
	    if generated_legend != _GT_LEGEND_HTML:
	        raise SystemExit(
	            "GT legend drift: backfill string no longer matches single_run.\n"
	            f" live: {generated_legend}\n here: {_GT_LEGEND_HTML}"
	        )

	    # The tile markup must match too: render one for a sample fixture id and
	    # require it to appear inside the column this tool would write.
	    generated_tile = generator._render_gt_edit_diff(None, base_url="/gt/999")
	    backfill_col = _new_gt_col("999")
	    if generated_tile in backfill_col:
	        logger.info(" self-check OK (markup matches live generator)")
	        return
	    raise SystemExit(
	        "GT tile drift: backfill markup no longer matches single_run.\n"
	        f" live: {generated_tile}\n here: {backfill_col}"
	    )


	def main() -> int:
	    """Patch stored reports and, with ``--apply``, upload them in one commit.

	    Steps: configure logging, parse ``--apply``, run ``_self_check``, load the
	    leaderboard rows, download each candidate report, run ``patch_html`` on
	    it, and collect the changed files. Without ``--apply`` nothing is
	    uploaded.

	    Returns:
	        ``0`` on every path that returns normally.
	    """
	    logging.basicConfig(level=logging.INFO, format="%(message)s")
	    cli = argparse.ArgumentParser(description=__doc__)
	    cli.add_argument(
	        "--apply",
	        action="store_true",
	        help="Re-upload patched reports (default is a dry run).",
	    )
	    options = cli.parse_args()

	    _self_check()

	    # Keep only rows that have a submission id and for which
	    # _report_relative_url returns something truthy.
	    candidates = []
	    for entry in _load_rows_from_hub():
	        submission_id = entry.get("submission_id")
	        if not submission_id:
	            continue
	        if _report_relative_url(
	            submission_id, entry.get("status"), entry.get("submission_sha256"),
	        ):
	            candidates.append(entry)
	    logger.info("Found %d report(s) to consider.", len(candidates))

	    pending: list[CommitOperationAdd] = []
	    untouched = 0  # counts both fetch failures and reports needing no change
	    for entry in candidates:
	        sid = entry["submission_id"]
	        try:
	            cached = hf_hub_download(
	                repo_id=HF_SUBMISSIONS_REPO,
	                repo_type="dataset",
	                filename=f"{REPORTS_DIR}/{sid}.html",
	            )
	        except Exception as exc:  # noqa: BLE001
	            logger.warning(" skip %s: could not fetch report (%s)", sid, exc)
	            untouched += 1
	            continue

	        html = Path(cached).read_text(encoding="utf-8")
	        patched, swapped = patch_html(html)
	        if patched is None:
	            logger.info(" unchanged %s", sid)
	            untouched += 1
	            continue

	        logger.info(" patched %s (%d editing GT column(s))", sid, swapped)
	        upload = CommitOperationAdd(
	            path_in_repo=f"{REPORTS_DIR}/{sid}.html",
	            path_or_fileobj=patched.encode("utf-8"),
	        )
	        pending.append(upload)

	    logger.info("%d to patch, %d unchanged/skipped.", len(pending), untouched)
	    if not pending:
	        logger.info("Nothing to do.")
	    elif not options.apply:
	        logger.info("Dry run -- re-run with --apply to upload.")
	    else:
	        # All patched reports go up together in a single commit.
	        HfApi().create_commit(
	            repo_id=HF_SUBMISSIONS_REPO,
	            repo_type="dataset",
	            operations=pending,
	            commit_message="reports: show GT answer-key edit diff for editing fixtures",
	        )
	        logger.info("Uploaded %d patched report(s).", len(pending))
	    return 0


	# Script entry point: exit the process with main()'s return code.
	if __name__ == "__main__":
	    raise SystemExit(main())