| |
| """Backfill the GT "answer key" edit diff into existing reports' GT column. |
| |
| The report generator now renders, for **editing** fixtures, the Ground Truth |
| column as the GT answer-key edit-diff turntable (blue = the correct change vs |
| the input) instead of the four static orthographic views -- see ``cadgenbench`` |
| ``eval/report/single_run.py`` (``_render_gt_edit_diff``). The webp is a property |
| of the data revision and already lives in the GT dataset at |
| ``<fixture>/renders/edit_diff_gt.webp`` (rendered once by |
| ``tools/generate_gt_edit_diff.py``), served through the existing ``/gt`` proxy. |
| |
| Reports published before that change still show the four GT views. Rather than |
| re-running ``evaluate`` (the metrics are unchanged), this one-off tool patches |
| the already-stored ``reports/<id>.html`` files in place: for every editing |
| fixture card (identified by the following "Output vs ground truth (edit diff)" |
| column) it swaps the GT column's heading + body for the answer-key turntable, |
| then re-uploads. |
| |
| Idempotent: a patched card's heading is "Ground truth (correct change)", so the |
| ``<h3>Ground Truth</h3>`` anchor is gone and re-running is a no-op. Generation |
| fixtures (no edit-diff column following) are left untouched. |
| |
| Usage (dry-run lists what would change; nothing is written):: |
| |
| python tools/backfill_report_gt_edit_diff.py |
| |
| # actually patch + re-upload (needs a write-scoped HF_TOKEN): |
| python tools/backfill_report_gt_edit_diff.py --apply |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import logging |
| import re |
| import sys |
| from pathlib import Path |
|
|
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) |
|
|
| from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download |
|
|
| from leaderboard import ( |
| HF_SUBMISSIONS_REPO, |
| _load_rows_from_hub, |
| _report_relative_url, |
| ) |
| from submit import REPORTS_DIR |
|
|
| logger = logging.getLogger(__name__) |
|
|
| |
| |
| |
| _GT_LEGEND_HTML = ( |
| '<span class="legend">' |
| '<span class="legend-chip" style="background:#2173f5"></span>' |
| "correct change (ground truth)" |
| "</span>" |
| ) |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| _GT_COL_RE = re.compile( |
| r'<div class="col">\s*<h3>Ground Truth</h3>' |
| r'(?P<body>(?:(?!<h3>|<div class="col">).)*?)</div>' |
| r'(?=\s*<div class="col">\s*<h3>Output vs ground truth \(edit diff\))', |
| re.DOTALL, |
| ) |
| _FIXTURE_RE = re.compile(r"/gt/([^/\"]+)/renders/") |
|
|
|
|
def _new_gt_col(fixture: str) -> str:
    """Build the replacement GT column markup for one editing fixture.

    The column consists of the "Ground truth (correct change)" heading (with
    the legend chip) and a single image tile pointing at the fixture's
    ``edit_diff_gt.webp`` under ``/gt/<fixture>/renders/``.

    Args:
        fixture: Fixture path segment, inserted verbatim into the image URL.

    Returns:
        The complete ``<div class="col">...</div>`` HTML fragment.
    """
    image = (
        f'<img src="/gt/{fixture}/renders/edit_diff_gt.webp" '
        'alt="ground truth (edit diff)" class="zoomable" loading="lazy">'
    )
    view = (
        '<div class="view"><span class="imgwrap">'
        + image
        + "</span><span>correct change</span></div>"
    )
    column_lines = [
        '<div class="col">',
        f"<h3>Ground truth (correct change) {_GT_LEGEND_HTML}</h3>",
        '<div class="images">' + view + "</div>",
        "</div>",
    ]
    # The three inner newlines are part of the emitted markup.
    return "\n".join(column_lines)
|
|
|
|
def patch_html(doc: str) -> tuple[str | None, int]:
    """Swap each editing fixture's GT column for the answer-key turntable.

    Args:
        doc: Full HTML text of one stored report.

    Returns:
        ``(patched_html, n_columns_swapped)`` when at least one column was
        rewritten, otherwise ``(None, 0)`` so the caller can skip the upload.
        Nothing matches when the report was already patched or contains no
        editing fixtures.
    """
    swapped_fixtures: list[str] = []

    def swap_column(match: re.Match) -> str:
        fixture_hit = _FIXTURE_RE.search(match.group("body"))
        if fixture_hit is None:
            # No /gt/<fixture>/renders/ URL in the body: the fixture cannot
            # be determined, so the column is emitted unchanged.
            return match.group(0)
        swapped_fixtures.append(fixture_hit.group(1))
        return _new_gt_col(swapped_fixtures[-1])

    patched = _GT_COL_RE.sub(swap_column, doc)
    if swapped_fixtures and patched != doc:
        return patched, len(swapped_fixtures)
    return None, 0
|
|
|
|
def _self_check() -> None:
    """Abort if the hard-coded markup differs from the live report generator.

    Compares ``_GT_LEGEND_HTML`` with the legend produced by ``single_run`` and
    checks that the tile rendered by ``single_run._render_gt_edit_diff`` is
    contained in the column built by ``_new_gt_col``. When ``cadgenbench``
    cannot be imported the check is skipped with an info log instead.

    Raises:
        SystemExit: If the legend or the tile markup has drifted.
    """

    def fail(headline: str, live: str, here: str) -> None:
        raise SystemExit(f"{headline}\n live: {live}\n here: {here}")

    try:
        from cadgenbench.eval.report import single_run
    except Exception as exc:
        logger.info(" (skipped self-check: cadgenbench not importable: %s)", exc)
        return

    # The legend is compared first; the tile is only rendered if it matches.
    generator_legend = single_run._legend_html(single_run._GT_EDIT_DIFF_LEGEND)
    if generator_legend != _GT_LEGEND_HTML:
        fail(
            "GT legend drift: backfill string no longer matches single_run.",
            generator_legend,
            _GT_LEGEND_HTML,
        )

    generator_tile = single_run._render_gt_edit_diff(None, base_url="/gt/999")
    backfill_column = _new_gt_col("999")
    # Containment, not equality: the backfill column wraps the tile.
    if generator_tile not in backfill_column:
        fail(
            "GT tile drift: backfill markup no longer matches single_run.",
            generator_tile,
            backfill_column,
        )
    logger.info(" self-check OK (markup matches live generator)")
|
|
|
|
def main() -> int:
    """Run the backfill CLI and return the process exit code.

    Runs ``_self_check``, then for every leaderboard row that has a
    ``submission_id`` and a report URL downloads ``reports/<id>.html``, patches
    it in memory with ``patch_html`` and queues it for upload. The queued files
    are pushed in one commit, and only when ``--apply`` is given; otherwise the
    run is a dry run that just logs what would change.

    Returns:
        ``0`` on every path that does not raise (nothing to do, dry run, or
        upload completed).
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains a pre-formatted usage block; the
        # default formatter would re-wrap it into one paragraph in ``--help``.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Re-upload patched reports (default is a dry run).",
    )
    args = parser.parse_args()

    _self_check()

    rows = _load_rows_from_hub()
    # Keep only rows for which a report URL can be derived.
    targets = [
        r
        for r in rows
        if r.get("submission_id")
        and _report_relative_url(
            r.get("submission_id"),
            r.get("status"),
            r.get("submission_sha256"),
        )
    ]
    logger.info("Found %d report(s) to consider.", len(targets))

    ops: list[CommitOperationAdd] = []
    skipped = 0  # counts both fetch failures and reports needing no change
    for row in targets:
        sid = row["submission_id"]
        report_path = f"{REPORTS_DIR}/{sid}.html"
        try:
            local = hf_hub_download(
                repo_id=HF_SUBMISSIONS_REPO,
                repo_type="dataset",
                filename=report_path,
            )
        except Exception as e:
            # Best effort: one unreadable report must not abort the whole run.
            logger.warning(" skip %s: could not fetch report (%s)", sid, e)
            skipped += 1
            continue

        doc = Path(local).read_text(encoding="utf-8")
        patched, n = patch_html(doc)
        if patched is None:
            logger.info(" unchanged %s", sid)
            skipped += 1
            continue

        logger.info(" patched %s (%d editing GT column(s))", sid, n)
        ops.append(
            CommitOperationAdd(
                path_in_repo=report_path,
                path_or_fileobj=patched.encode("utf-8"),
            )
        )

    logger.info("%d to patch, %d unchanged/skipped.", len(ops), skipped)
    if not ops:
        logger.info("Nothing to do.")
        return 0
    if not args.apply:
        logger.info("Dry run -- re-run with --apply to upload.")
        return 0

    # All patched reports go up in a single commit.
    HfApi().create_commit(
        repo_id=HF_SUBMISSIONS_REPO,
        repo_type="dataset",
        operations=ops,
        commit_message="reports: show GT answer-key edit diff for editing fixtures",
    )
    logger.info("Uploaded %d patched report(s).", len(ops))
    return 0
|
|
|
|
# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|