File size: 7,946 Bytes
b885669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#!/usr/bin/env python3
"""Backfill the GT "answer key" edit diff into existing reports' GT column.

The report generator now renders, for **editing** fixtures, the Ground Truth
column as the GT answer-key edit-diff turntable (blue = the correct change vs
the input) instead of the four static orthographic views -- see ``cadgenbench``
``eval/report/single_run.py`` (``_render_gt_edit_diff``). The webp is a property
of the data revision and already lives in the GT dataset at
``<fixture>/renders/edit_diff_gt.webp`` (rendered once by
``tools/generate_gt_edit_diff.py``), served through the existing ``/gt`` proxy.

Reports published before that change still show the four GT views. Rather than
re-running ``evaluate`` (the metrics are unchanged), this one-off tool patches
the already-stored ``reports/<id>.html`` files in place: for every editing
fixture card (identified by the following "Output vs ground truth (edit diff)"
column) it swaps the GT column's heading + body for the answer-key turntable,
then re-uploads.

Idempotent: a patched card's heading is "Ground truth (correct change)", so the
``<h3>Ground Truth</h3>`` anchor is gone and re-running is a no-op. Generation
fixtures (no edit-diff column following) are left untouched.

Usage (dry-run lists what would change; nothing is written)::

    python tools/backfill_report_gt_edit_diff.py

    # actually patch + re-upload (needs a write-scoped HF_TOKEN):
    python tools/backfill_report_gt_edit_diff.py --apply
"""
from __future__ import annotations

import argparse
import logging
import re
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download  # noqa: E402

from leaderboard import (  # noqa: E402
    HF_SUBMISSIONS_REPO,
    _load_rows_from_hub,
    _report_relative_url,
)
from submit import REPORTS_DIR  # noqa: E402

logger = logging.getLogger(__name__)

# Blue answer-key legend, byte-for-byte what _legend_html(_GT_EDIT_DIFF_LEGEND)
# emits in single_run.py (verified against the live generator in main() when
# importable) so a patched report is indistinguishable from a fresh one.
_GT_LEGEND_HTML = (
    '<span class="legend">'
    '<span class="legend-chip" style="background:#2173f5"></span>'
    "correct change (ground truth)"
    "</span>"
)

# An editing fixture's GT column: the "<div class='col'>" wrapping the
# "Ground Truth" heading and its four view tiles, up to the column-closing
# "</div>" that sits immediately before the "Output vs ground truth (edit diff)"
# column. The lookahead is what restricts the swap to editing fixtures (a
# generation fixture's GT column is followed by "Output (aligned)" instead).
#
# The body is a *tempered* match: it may contain the nested ``<div class="images">``
# / ``<div class="view">`` tiles but must NOT cross another ``<h3>`` or
# ``<div class="col">`` -- otherwise the lazy ``.*?`` would, on a *generation*
# fixture, run forward across many cards until it found the next editing
# fixture's edit-diff column and swallow every GT column in between.
_GT_COL_RE = re.compile(
    r'<div class="col">\s*<h3>Ground Truth</h3>'
    r'(?P<body>(?:(?!<h3>|<div class="col">).)*?)</div>'
    r'(?=\s*<div class="col">\s*<h3>Output vs ground truth \(edit diff\))',
    re.DOTALL,
)
_FIXTURE_RE = re.compile(r"/gt/([^/\"]+)/renders/")


def _new_gt_col(fixture: str) -> str:
    """The replacement GT column, matching single_run._render_gt_edit_diff."""
    tile = (
        '<div class="images">'
        '<div class="view"><span class="imgwrap">'
        f'<img src="/gt/{fixture}/renders/edit_diff_gt.webp" '
        'alt="ground truth (edit diff)" class="zoomable" loading="lazy">'
        "</span><span>correct change</span></div>"
        "</div>"
    )
    return (
        '<div class="col">\n'
        f"<h3>Ground truth (correct change) {_GT_LEGEND_HTML}</h3>\n"
        f"{tile}\n"
        "</div>"
    )


def patch_html(doc: str) -> tuple[str | None, int]:
    """Return ``(patched_html_or_None, n_columns_swapped)``.

    Swaps every editing fixture's GT column for the answer-key turntable.
    Returns ``None`` when nothing matched (already patched, or no editing
    fixtures), so the caller skips the re-upload.
    """
    count = 0

    def repl(m: re.Match) -> str:
        nonlocal count
        fx = _FIXTURE_RE.search(m.group("body"))
        if not fx:
            return m.group(0)  # no GT render to read the fixture id from; leave
        count += 1
        return _new_gt_col(fx.group(1))

    new = _GT_COL_RE.sub(repl, doc)
    if count == 0 or new == doc:
        return None, 0
    return new, count


def _self_check() -> None:
    """Abort if this tool's hard-coded markup no longer matches the generator.

    Imports ``cadgenbench``'s ``single_run`` module when available and
    compares its legend and GT edit-diff tile against the strings built here.
    If the import fails for any reason the check is skipped with an info log,
    so the tool still runs where the generator is not installed.

    Raises:
        SystemExit: The live legend differs from ``_GT_LEGEND_HTML``, or the
            live tile is not contained in the column ``_new_gt_col`` builds.
    """
    try:
        from cadgenbench.eval.report import single_run as generator  # noqa: PLC0415
    except Exception as exc:  # noqa: BLE001 - generator not importable here
        logger.info("  (skipped self-check: cadgenbench not importable: %s)", exc)
        return

    legend_now = generator._legend_html(generator._GT_EDIT_DIFF_LEGEND)
    legend_matches = legend_now == _GT_LEGEND_HTML
    if not legend_matches:
        raise SystemExit(
            "GT legend drift: backfill string no longer matches single_run.\n"
            f"  live: {legend_now}\n  here: {_GT_LEGEND_HTML}"
        )

    # Render one tile for a sample fixture id on both sides; the generator's
    # output must appear verbatim inside the column built by this tool.
    tile_now = generator._render_gt_edit_diff(None, base_url="/gt/999")
    column_here = _new_gt_col("999")
    if tile_now not in column_here:
        raise SystemExit(
            "GT tile drift: backfill markup no longer matches single_run.\n"
            f"  live: {tile_now}\n  here: {column_here}"
        )

    logger.info("  self-check OK (markup matches live generator)")


def main() -> int:
    """Patch stored reports' editing GT columns and optionally re-upload them.

    Steps:

    1. Parse ``--apply`` (without it the run is a dry run).
    2. Run ``_self_check()`` so drifted markup aborts before anything is
       fetched.
    3. Load the leaderboard rows and keep those that have a submission id and
       for which ``_report_relative_url`` returns a truthy value (presumably
       the rows with a published report -- confirm against ``leaderboard``).
    4. Download each ``<REPORTS_DIR>/<id>.html``, run ``patch_html`` on it and
       queue every changed document as a commit operation.
    5. With ``--apply``, push all queued documents in one commit.

    Download failures are logged and counted as skipped; they do not stop the
    run.

    Returns:
        ``0`` in every non-aborting case: nothing to do, dry run, or upload
        completed.

    Raises:
        SystemExit: Propagated from ``_self_check`` on markup drift, or from
            ``argparse`` on bad arguments / ``--help``.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring has paragraphs and an indented usage example;
        # the default formatter would re-wrap all of it into one blob in
        # ``--help``. The raw formatter prints it as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--apply", action="store_true",
        help="Re-upload patched reports (default is a dry run).",
    )
    args = parser.parse_args()

    _self_check()

    rows = _load_rows_from_hub()
    targets = [
        r for r in rows
        if r.get("submission_id")
        and _report_relative_url(
            r.get("submission_id"), r.get("status"), r.get("submission_sha256"),
        )
    ]
    logger.info("Found %d report(s) to consider.", len(targets))

    ops: list[CommitOperationAdd] = []
    skipped = 0  # counts both fetch failures and reports needing no change
    for row in targets:
        sid = row["submission_id"]
        report_path = f"{REPORTS_DIR}/{sid}.html"
        try:
            local = hf_hub_download(
                repo_id=HF_SUBMISSIONS_REPO,
                repo_type="dataset",
                filename=report_path,
            )
        except Exception as e:  # noqa: BLE001
            # Best effort: a report that cannot be fetched is reported and
            # skipped so the remaining ones are still processed.
            logger.warning("  skip %s: could not fetch report (%s)", sid, e)
            skipped += 1
            continue
        doc = Path(local).read_text(encoding="utf-8")
        patched, n = patch_html(doc)
        if patched is None:
            # Already patched, or the report has no editing fixtures.
            logger.info("  unchanged %s", sid)
            skipped += 1
            continue
        logger.info("  patched   %s (%d editing GT column(s))", sid, n)
        ops.append(
            CommitOperationAdd(
                path_in_repo=report_path,
                path_or_fileobj=patched.encode("utf-8"),
            )
        )

    logger.info("%d to patch, %d unchanged/skipped.", len(ops), skipped)
    if not ops:
        logger.info("Nothing to do.")
        return 0
    if not args.apply:
        logger.info("Dry run -- re-run with --apply to upload.")
        return 0

    # All patched reports go up together in a single commit.
    HfApi().create_commit(
        repo_id=HF_SUBMISSIONS_REPO,
        repo_type="dataset",
        operations=ops,
        commit_message="reports: show GT answer-key edit diff for editing fixtures",
    )
    logger.info("Uploaded %d patched report(s).", len(ops))
    return 0


# Script entry point: run the tool and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())