#!/usr/bin/env python3
"""Backfill the GT "answer key" edit diff into existing reports' GT column.
The report generator now renders, for **editing** fixtures, the Ground Truth
column as the GT answer-key edit-diff turntable (blue = the correct change vs
the input) instead of the four static orthographic views -- see ``cadgenbench``
``eval/report/single_run.py`` (``_render_gt_edit_diff``). The webp is a property
of the data revision and already lives in the GT dataset at
``<fixture>/renders/edit_diff_gt.webp`` (rendered once by
``tools/generate_gt_edit_diff.py``), served through the existing ``/gt`` proxy.
Reports published before that change still show the four GT views. Rather than
re-running ``evaluate`` (the metrics are unchanged), this one-off tool patches
the already-stored ``reports/<id>.html`` files in place: for every editing
fixture card (identified by the following "Output vs ground truth (edit diff)"
column) it swaps the GT column's heading + body for the answer-key turntable,
then re-uploads.
Idempotent: a patched card's heading is "Ground truth (correct change)", so the
``<h3>Ground Truth</h3>`` anchor is gone and re-running is a no-op. Generation
fixtures (no edit-diff column following) are left untouched.
Usage (dry-run lists what would change; nothing is written)::
python tools/backfill_report_gt_edit_diff.py
# actually patch + re-upload (needs a write-scoped HF_TOKEN):
python tools/backfill_report_gt_edit_diff.py --apply
"""
from __future__ import annotations
import argparse
import logging
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download # noqa: E402
from leaderboard import ( # noqa: E402
HF_SUBMISSIONS_REPO,
_load_rows_from_hub,
_report_relative_url,
)
from submit import REPORTS_DIR # noqa: E402
logger = logging.getLogger(__name__)
# Blue answer-key legend, byte-for-byte what _legend_html(_GT_EDIT_DIFF_LEGEND)
# emits in single_run.py, so a patched report is indistinguishable from a
# fresh one. _self_check() (run first thing in main()) compares this string
# against the live generator whenever cadgenbench is importable and aborts on
# any difference, so do not reformat the markup below.
_GT_LEGEND_HTML = (
    '<span class="legend">'
    # The colour chip; #2173f5 is the blue used for the answer-key change.
    '<span class="legend-chip" style="background:#2173f5"></span>'
    "correct change (ground truth)"
    "</span>"
)
# An editing fixture's GT column: the "<div class='col'>" wrapping the
# "Ground Truth" heading and its four view tiles, up to the column-closing
# "</div>" that sits immediately before the "Output vs ground truth (edit diff)"
# column. The lookahead is what restricts the swap to editing fixtures (a
# generation fixture's GT column is followed by "Output (aligned)" instead).
#
# The body is a *tempered* match: it may contain the nested ``<div class="images">``
# / ``<div class="view">`` tiles but must NOT cross another ``<h3>`` or
# ``<div class="col">`` -- otherwise the lazy ``.*?`` would, on a *generation*
# fixture, run forward across many cards until it found the next editing
# fixture's edit-diff column and swallow every GT column in between.
#
# Because the old heading text is part of the pattern, a column that has
# already been rewritten to "Ground truth (correct change)" no longer matches.
_GT_COL_RE = re.compile(
    # Column opener followed by the exact pre-patch heading.
    r'<div class="col">\s*<h3>Ground Truth</h3>'
    # Tempered, lazy body (named group "body"), then the column's closing tag.
    r'(?P<body>(?:(?!<h3>|<div class="col">).)*?)</div>'
    # Zero-width check: the next column must be the edit-diff column. Being a
    # lookahead, it is not consumed and therefore not replaced.
    r'(?=\s*<div class="col">\s*<h3>Output vs ground truth \(edit diff\))',
    # DOTALL lets "." span the newlines inside the column markup.
    re.DOTALL,
)
# Captures the fixture id (group 1) from a GT render URL of the form
# "/gt/<fixture>/renders/..."; the id may not contain "/" or a double quote.
# patch_html() searches the matched column body with it to learn which
# fixture the column belongs to.
_FIXTURE_RE = re.compile(r"/gt/([^/\"]+)/renders/")
def _new_gt_col(fixture: str) -> str:
    """Build the replacement GT column markup for one editing fixture.

    The column consists of the "Ground truth (correct change)" heading
    (including the blue legend) and a single image tile that points at
    ``/gt/<fixture>/renders/edit_diff_gt.webp``. The layout is meant to match
    what ``single_run._render_gt_edit_diff`` produces.

    Args:
        fixture: Fixture id, inserted verbatim into the image URL.

    Returns:
        The complete ``<div class="col">...</div>`` HTML fragment.
    """
    image_tag = (
        f'<img src="/gt/{fixture}/renders/edit_diff_gt.webp" '
        'alt="ground truth (edit diff)" class="zoomable" loading="lazy">'
    )
    view_tile = (
        '<div class="view"><span class="imgwrap">'
        + image_tag
        + "</span><span>correct change</span></div>"
    )
    # One entry per output line; the column is emitted as exactly four lines.
    column_lines = [
        '<div class="col">',
        f"<h3>Ground truth (correct change) {_GT_LEGEND_HTML}</h3>",
        '<div class="images">' + view_tile + "</div>",
        "</div>",
    ]
    return "\n".join(column_lines)
def patch_html(doc: str) -> tuple[str | None, int]:
    """Swap every editing fixture's GT column for the answer-key turntable.

    Args:
        doc: Full HTML text of one stored report.

    Returns:
        ``(patched_html, n_columns_swapped)`` when at least one column was
        replaced, otherwise ``(None, 0)`` -- the report is already patched or
        has no editing fixtures -- so the caller can skip the re-upload.
    """
    swapped_fixtures: list[str] = []

    def _swap_column(match: re.Match) -> str:
        # The fixture id is only recoverable from a GT render URL inside the
        # old column body; without one the column is returned unchanged.
        url_hit = _FIXTURE_RE.search(match.group("body"))
        if url_hit is None:
            return match.group(0)
        fixture_id = url_hit.group(1)
        swapped_fixtures.append(fixture_id)
        return _new_gt_col(fixture_id)

    rewritten = _GT_COL_RE.sub(_swap_column, doc)
    nothing_changed = not swapped_fixtures or rewritten == doc
    if nothing_changed:
        return None, 0
    return rewritten, len(swapped_fixtures)
def _self_check() -> None:
    """Abort if the hard-coded answer-key markup differs from the generator's.

    Imports ``cadgenbench.eval.report.single_run`` when available and compares
    two things against it: the legend string ``_GT_LEGEND_HTML``, and the tile
    rendered for a sample fixture, which must appear verbatim inside the
    column built by ``_new_gt_col``. If the import fails for any reason the
    check is skipped with an info log instead of failing.

    Raises:
        SystemExit: If the legend or the tile markup has drifted.
    """
    try:
        from cadgenbench.eval.report import single_run as generator  # noqa: PLC0415
    except Exception as exc:  # noqa: BLE001 - generator not importable here
        logger.info(" (skipped self-check: cadgenbench not importable: %s)", exc)
        return

    # Legend first: it must be exactly equal to the hard-coded copy.
    legend_now = generator._legend_html(generator._GT_EDIT_DIFF_LEGEND)
    if legend_now != _GT_LEGEND_HTML:
        raise SystemExit(
            "GT legend drift: backfill string no longer matches single_run.\n"
            f" live: {legend_now}\n here: {_GT_LEGEND_HTML}"
        )

    # The tile markup must match too (build one for a sample fixture id).
    tile_now = generator._render_gt_edit_diff(None, base_url="/gt/999")
    column_here = _new_gt_col("999")
    if tile_now in column_here:
        logger.info(" self-check OK (markup matches live generator)")
        return
    raise SystemExit(
        "GT tile drift: backfill markup no longer matches single_run.\n"
        f" live: {tile_now}\n here: {column_here}"
    )
def main() -> int:
    """Patch the GT column of stored reports and optionally re-upload them.

    Steps:
      1. Run ``_self_check()`` so drifted markup aborts before any download.
      2. Load the leaderboard rows and keep those that have a submission id
         and for which ``_report_relative_url`` returns a truthy value.
      3. Download each ``<REPORTS_DIR>/<id>.html``, run ``patch_html`` on it
         and queue a ``CommitOperationAdd`` for every report that changed.
      4. With ``--apply``, push all queued reports in a single commit;
         without it, only log what would change.

    Returns:
        Process exit code. Always ``0``: reports that cannot be fetched are
        logged and counted as skipped rather than treated as fatal.

    Raises:
        SystemExit: From ``_self_check()`` on markup drift, or from argparse
            on invalid arguments / ``--help``.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains a literal usage block; the default
        # formatter would re-wrap it into a single paragraph in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--apply", action="store_true",
        help="Re-upload patched reports (default is a dry run).",
    )
    args = parser.parse_args()

    _self_check()

    rows = _load_rows_from_hub()
    targets = [
        r for r in rows
        if r.get("submission_id")
        and _report_relative_url(
            r.get("submission_id"), r.get("status"), r.get("submission_sha256"),
        )
    ]
    logger.info("Found %d report(s) to consider.", len(targets))

    ops: list[CommitOperationAdd] = []
    skipped = 0
    for row in targets:
        sid = row["submission_id"]
        report_path = f"{REPORTS_DIR}/{sid}.html"
        try:
            local = hf_hub_download(
                repo_id=HF_SUBMISSIONS_REPO,
                repo_type="dataset",
                filename=report_path,
            )
        except Exception as e:  # noqa: BLE001
            # Best effort: one unreachable report must not block the others.
            logger.warning(" skip %s: could not fetch report (%s)", sid, e)
            skipped += 1
            continue
        doc = Path(local).read_text(encoding="utf-8")
        patched, n = patch_html(doc)
        if patched is None:
            # Already patched, or the report has no editing fixtures.
            logger.info(" unchanged %s", sid)
            skipped += 1
            continue
        logger.info(" patched %s (%d editing GT column(s))", sid, n)
        ops.append(
            CommitOperationAdd(
                path_in_repo=report_path,
                path_or_fileobj=patched.encode("utf-8"),
            )
        )

    logger.info("%d to patch, %d unchanged/skipped.", len(ops), skipped)
    if not ops:
        logger.info("Nothing to do.")
        return 0
    if not args.apply:
        logger.info("Dry run -- re-run with --apply to upload.")
        return 0

    # All patched reports go up together in one commit.
    HfApi().create_commit(
        repo_id=HF_SUBMISSIONS_REPO,
        repo_type="dataset",
        operations=ops,
        commit_message="reports: show GT answer-key edit diff for editing fixtures",
    )
    logger.info("Uploaded %d patched report(s).", len(ops))
    return 0
# Script entry point: run the tool and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|