Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

File size: 7,946 Bytes

b885669

#!/usr/bin/env python3
"""Backfill the GT "answer key" edit diff into existing reports' GT column.

The report generator now renders, for **editing** fixtures, the Ground Truth
column as the GT answer-key edit-diff turntable (blue = the correct change vs
the input) instead of the four static orthographic views -- see ``cadgenbench``
``eval/report/single_run.py`` (``_render_gt_edit_diff``). The webp is a property
of the data revision and already lives in the GT dataset at
``<fixture>/renders/edit_diff_gt.webp`` (rendered once by
``tools/generate_gt_edit_diff.py``), served through the existing ``/gt`` proxy.

Reports published before that change still show the four GT views. Rather than
re-running ``evaluate`` (the metrics are unchanged), this one-off tool patches
the already-stored ``reports/<id>.html`` files in place: for every editing
fixture card (identified by the following "Output vs ground truth (edit diff)"
column) it swaps the GT column's heading + body for the answer-key turntable,
then re-uploads.

Idempotent: a patched card's heading is "Ground truth (correct change)", so the
``<h3>Ground Truth</h3>`` anchor is gone and re-running is a no-op. Generation
fixtures (no edit-diff column following) are left untouched.

Usage (dry-run lists what would change; nothing is written)::

    python tools/backfill_report_gt_edit_diff.py

    # actually patch + re-upload (needs a write-scoped HF_TOKEN):
    python tools/backfill_report_gt_edit_diff.py --apply
"""
from __future__ import annotations

import argparse
import logging
import re
import sys
from pathlib import Path

# Put the parent of this file's directory at the front of sys.path so the
# top-level ``leaderboard`` and ``submit`` imports below resolve when this file
# is run directly as a script (presumably that directory is the repo root --
# the usage examples invoke it as ``tools/backfill_report_gt_edit_diff.py``).
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download  # noqa: E402

from leaderboard import (  # noqa: E402
    HF_SUBMISSIONS_REPO,
    _load_rows_from_hub,
    _report_relative_url,
)
from submit import REPORTS_DIR  # noqa: E402

# Module-level logger; the logging level and format are configured in main().
logger = logging.getLogger(__name__)

# HTML for the blue answer-key legend chip shown in the patched GT heading.
# It has to equal, character for character, what
# _legend_html(_GT_EDIT_DIFF_LEGEND) emits in single_run.py; _self_check()
# compares the two whenever the generator is importable, so that a backfilled
# report cannot be told apart from a freshly generated one.
_GT_LEGEND_HTML = "".join(
    (
        '<span class="legend">',
        '<span class="legend-chip" style="background:#2173f5"></span>',
        "correct change (ground truth)",
        "</span>",
    )
)

# An editing fixture's GT column: the "<div class='col'>" wrapping the
# "Ground Truth" heading and its four view tiles, up to the column-closing
# "</div>" that sits immediately before the "Output vs ground truth (edit diff)"
# column. The lookahead is what restricts the swap to editing fixtures (a
# generation fixture's GT column is followed by "Output (aligned)" instead).
#
# The body is a *tempered* match: it may contain the nested ``<div class="images">``
# / ``<div class="view">`` tiles but must NOT cross another ``<h3>`` or
# ``<div class="col">`` -- otherwise the lazy ``.*?`` would, on a *generation*
# fixture, run forward across many cards until it found the next editing
# fixture's edit-diff column and swallow every GT column in between.
_GT_COL_RE = re.compile(
    # Column opener plus the exact pre-patch heading. A patched column carries
    # a different heading, so it no longer matches here (idempotency anchor).
    r'<div class="col">\s*<h3>Ground Truth</h3>'
    # Tempered body, captured as the named group ``body``, followed by the
    # column-closing </div>.
    r'(?P<body>(?:(?!<h3>|<div class="col">).)*?)</div>'
    # Zero-width lookahead: the next column must be the edit-diff column. It is
    # not consumed, so the replacement leaves that column untouched.
    r'(?=\s*<div class="col">\s*<h3>Output vs ground truth \(edit diff\))',
    re.DOTALL,  # let "." also match the newlines inside the column markup
)
# Captures the fixture id (group 1) from a "/gt/<fixture>/renders/" URL.
# patch_html() searches the matched column body with it to learn which fixture
# the column belongs to; the id may not contain "/" or a double quote.
_FIXTURE_RE = re.compile(r"/gt/([^/\"]+)/renders/")


def _new_gt_col(fixture: str) -> str:
    """Build the replacement GT column markup for one editing fixture.

    The column consists of a "Ground truth (correct change)" heading carrying
    the blue legend, and a single image tile whose source is
    ``/gt/<fixture>/renders/edit_diff_gt.webp``. The markup is intended to
    mirror what ``single_run._render_gt_edit_diff`` produces.

    Args:
        fixture: Fixture id; interpolated into the image URL verbatim (no
            HTML escaping is applied here).

    Returns:
        The complete ``<div class="col">...</div>`` fragment, with newlines
        separating the opening tag, heading, tile and closing tag.
    """
    webp_src = f"/gt/{fixture}/renders/edit_diff_gt.webp"
    image = (
        f'<img src="{webp_src}" '
        'alt="ground truth (edit diff)" class="zoomable" loading="lazy">'
    )
    view = (
        f'<div class="view"><span class="imgwrap">{image}</span>'
        "<span>correct change</span></div>"
    )
    heading = f"<h3>Ground truth (correct change) {_GT_LEGEND_HTML}</h3>"
    rows = (
        '<div class="col">',
        heading,
        f'<div class="images">{view}</div>',
        "</div>",
    )
    return "\n".join(rows)


def patch_html(doc: str) -> tuple[str | None, int]:
    """Swap each editing fixture's GT column for the answer-key turntable.

    Every ``_GT_COL_RE`` match in ``doc`` is inspected. When a fixture id can
    be read from a ``/gt/<fixture>/renders/`` URL inside the matched column
    body, the whole column is replaced by ``_new_gt_col(<fixture>)``. A match
    with no such URL is kept exactly as it was.

    Args:
        doc: Full HTML text of one stored report.

    Returns:
        ``(patched_html, n_columns_swapped)`` when at least one column was
        replaced. ``(None, 0)`` when nothing changed (report already patched,
        or no editing fixtures), so the caller can skip the re-upload.
    """
    pieces: list[str] = []
    swapped = 0
    cursor = 0  # end of the last replaced column within ``doc``

    for col in _GT_COL_RE.finditer(doc):
        fixture_hit = _FIXTURE_RE.search(col.group("body"))
        if fixture_hit is None:
            # No GT render URL to read the fixture id from. Leaving the
            # cursor where it is keeps this column's original text in the
            # next slice that gets copied over.
            continue
        pieces.append(doc[cursor:col.start()])
        pieces.append(_new_gt_col(fixture_hit.group(1)))
        cursor = col.end()
        swapped += 1

    if swapped == 0:
        return None, 0

    pieces.append(doc[cursor:])
    rebuilt = "".join(pieces)
    if rebuilt == doc:
        return None, 0
    return rebuilt, swapped


def _self_check() -> None:
    """Abort if this tool's hard-coded markup differs from the live generator.

    Imports ``cadgenbench.eval.report.single_run`` and compares its legend
    and GT edit-diff tile output with ``_GT_LEGEND_HTML`` and
    ``_new_gt_col``. If the import fails for any reason, the check is skipped
    with an info log and the function returns normally.

    Raises:
        SystemExit: With a message showing both variants, when the legend
            differs or the generator's tile markup is not contained in the
            column built here.
    """
    try:
        from cadgenbench.eval.report import single_run as sr  # noqa: PLC0415
    except Exception as e:  # noqa: BLE001 - generator not importable here
        logger.info("  (skipped self-check: cadgenbench not importable: %s)", e)
        return

    def drift(headline: str, live: object, here: object) -> SystemExit:
        # Both drift reports share one layout: headline, live value, ours.
        return SystemExit(f"{headline}\n  live: {live}\n  here: {here}")

    # 1) The legend must be exactly equal.
    generated_legend = sr._legend_html(sr._GT_EDIT_DIFF_LEGEND)
    if generated_legend != _GT_LEGEND_HTML:
        raise drift(
            "GT legend drift: backfill string no longer matches single_run.",
            generated_legend,
            _GT_LEGEND_HTML,
        )

    # 2) The tile markup must appear inside our column, built for the same
    #    sample fixture id ("999").
    # NOTE(review): a substring test passes vacuously if the generator returns
    # an empty string -- confirm it never does for a ``None`` first argument.
    generated_tile = sr._render_gt_edit_diff(None, base_url="/gt/999")
    local_column = _new_gt_col("999")
    if generated_tile not in local_column:
        raise drift(
            "GT tile drift: backfill markup no longer matches single_run.",
            generated_tile,
            local_column,
        )

    logger.info("  self-check OK (markup matches live generator)")


def main() -> int:
    """Command-line entry point: find, patch and optionally re-upload reports.

    Steps:
      1. Run ``_self_check()`` (may raise ``SystemExit`` on markup drift).
      2. Load the leaderboard rows and keep those that have a submission id
         and for which ``_report_relative_url`` yields a truthy value.
      3. Download each report, run ``patch_html`` on it and queue a
         ``CommitOperationAdd`` for every report that actually changed.
         Reports that cannot be fetched are logged and skipped.
      4. Without ``--apply`` stop after listing the changes (dry run); with
         ``--apply`` upload all patched reports in a single commit.

    Returns:
        ``0`` on every path that returns normally (nothing to do, dry run,
        or successful upload).
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring has several paragraphs and an indented usage
        # block; the default formatter would collapse all of it into one
        # re-wrapped paragraph in ``--help``. Keep its layout as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--apply", action="store_true",
        help="Re-upload patched reports (default is a dry run).",
    )
    args = parser.parse_args()

    _self_check()

    rows = _load_rows_from_hub()
    # Only rows that have a report to link to are worth downloading.
    targets = [
        r for r in rows
        if r.get("submission_id")
        and _report_relative_url(
            r.get("submission_id"), r.get("status"), r.get("submission_sha256"),
        )
    ]
    logger.info("Found %d report(s) to consider.", len(targets))

    ops: list[CommitOperationAdd] = []
    skipped = 0  # counts both fetch failures and reports needing no change
    for row in targets:
        sid = row["submission_id"]
        try:
            local = hf_hub_download(
                repo_id=HF_SUBMISSIONS_REPO,
                repo_type="dataset",
                filename=f"{REPORTS_DIR}/{sid}.html",
            )
        except Exception as e:  # noqa: BLE001
            # Best effort: one unreachable report must not abort the backfill.
            logger.warning("  skip %s: could not fetch report (%s)", sid, e)
            skipped += 1
            continue
        doc = Path(local).read_text(encoding="utf-8")
        patched, n = patch_html(doc)
        if patched is None:
            logger.info("  unchanged %s", sid)
            skipped += 1
            continue
        logger.info("  patched   %s (%d editing GT column(s))", sid, n)
        ops.append(
            CommitOperationAdd(
                path_in_repo=f"{REPORTS_DIR}/{sid}.html",
                path_or_fileobj=patched.encode("utf-8"),
            )
        )

    logger.info("%d to patch, %d unchanged/skipped.", len(ops), skipped)
    if not ops:
        logger.info("Nothing to do.")
        return 0
    if not args.apply:
        logger.info("Dry run -- re-run with --apply to upload.")
        return 0

    # All patched reports go up together in one commit.
    HfApi().create_commit(
        repo_id=HF_SUBMISSIONS_REPO,
        repo_type="dataset",
        operations=ops,
        commit_message="reports: show GT answer-key edit diff for editing fixtures",
    )
    logger.info("Uploaded %d patched report(s).", len(ops))
    return 0


if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    raise SystemExit(main())