Michael Rabinovich Cursor commited on
Commit
b885669
·
1 Parent(s): 49e27be

Bump CADGENBENCH_SHA to 037eade; add GT edit-diff report backfill tool

Browse files

Pins the leaderboard image to the cadgenbench commit that renders the GT
answer-key edit-diff report column. Adds tools/backfill_report_gt_edit_diff.py,
the one-off (idempotent) patcher that swaps existing reports' editing GT
columns to the answer-key turntable without re-evaluating.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (2) hide show
  1. Dockerfile +1 -1
  2. tools/backfill_report_gt_edit_diff.py +214 -0
Dockerfile CHANGED
@@ -41,7 +41,7 @@ RUN pip install --no-cache-dir -r /tmp/requirements.txt \
41
  # image rebuild picks up the latest code (pre-v1: always-updated). Lock
42
  # to a specific commit SHA at the v1 release so published scores are
43
  # reproducible (see space-setup/post-gt-swap.md Stage F).
44
- ARG CADGENBENCH_SHA=ee8b257
45
  # Cache-bust the install below whenever the tracked ref moves: the
46
  # GitHub commits endpoint's response changes with each new commit on
47
  # `main`, so BuildKit re-fetches and invalidates the cached pip layer.
 
41
  # image rebuild picks up the latest code (pre-v1: always-updated). Lock
42
  # to a specific commit SHA at the v1 release so published scores are
43
  # reproducible (see space-setup/post-gt-swap.md Stage F).
44
+ ARG CADGENBENCH_SHA=037eade
45
  # Cache-bust the install below whenever the tracked ref moves: the
46
  # GitHub commits endpoint's response changes with each new commit on
47
  # `main`, so BuildKit re-fetches and invalidates the cached pip layer.
tools/backfill_report_gt_edit_diff.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Backfill the GT "answer key" edit diff into existing reports' GT column.
3
+
4
+ The report generator now renders, for **editing** fixtures, the Ground Truth
5
+ column as the GT answer-key edit-diff turntable (blue = the correct change vs
6
+ the input) instead of the four static orthographic views -- see ``cadgenbench``
7
+ ``eval/report/single_run.py`` (``_render_gt_edit_diff``). The webp is a property
8
+ of the data revision and already lives in the GT dataset at
9
+ ``<fixture>/renders/edit_diff_gt.webp`` (rendered once by
10
+ ``tools/generate_gt_edit_diff.py``), served through the existing ``/gt`` proxy.
11
+
12
+ Reports published before that change still show the four GT views. Rather than
13
+ re-running ``evaluate`` (the metrics are unchanged), this one-off tool patches
14
+ the already-stored ``reports/<id>.html`` files in place: for every editing
15
+ fixture card (identified by the following "Output vs ground truth (edit diff)"
16
+ column) it swaps the GT column's heading + body for the answer-key turntable,
17
+ then re-uploads.
18
+
19
+ Idempotent: a patched card's heading is "Ground truth (correct change)", so the
20
+ ``<h3>Ground Truth</h3>`` anchor is gone and re-running is a no-op. Generation
21
+ fixtures (no edit-diff column following) are left untouched.
22
+
23
+ Usage (dry-run lists what would change; nothing is written)::
24
+
25
+ python tools/backfill_report_gt_edit_diff.py
26
+
27
+ # actually patch + re-upload (needs a write-scoped HF_TOKEN):
28
+ python tools/backfill_report_gt_edit_diff.py --apply
29
+ """
30
+ from __future__ import annotations
31
+
32
+ import argparse
33
+ import logging
34
+ import re
35
+ import sys
36
+ from pathlib import Path
37
+
38
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
39
+
40
+ from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download # noqa: E402
41
+
42
+ from leaderboard import ( # noqa: E402
43
+ HF_SUBMISSIONS_REPO,
44
+ _load_rows_from_hub,
45
+ _report_relative_url,
46
+ )
47
+ from submit import REPORTS_DIR # noqa: E402
48
+
49
+ logger = logging.getLogger(__name__)
50
+
51
+ # Blue answer-key legend, byte-for-byte what _legend_html(_GT_EDIT_DIFF_LEGEND)
52
+ # emits in single_run.py (verified against the live generator in main() when
53
+ # importable) so a patched report is indistinguishable from a fresh one.
54
+ _GT_LEGEND_HTML = (
55
+ '<span class="legend">'
56
+ '<span class="legend-chip" style="background:#2173f5"></span>'
57
+ "correct change (ground truth)"
58
+ "</span>"
59
+ )
60
+
61
+ # An editing fixture's GT column: the "<div class='col'>" wrapping the
62
+ # "Ground Truth" heading and its four view tiles, up to the column-closing
63
+ # "</div>" that sits immediately before the "Output vs ground truth (edit diff)"
64
+ # column. The lookahead is what restricts the swap to editing fixtures (a
65
+ # generation fixture's GT column is followed by "Output (aligned)" instead).
66
+ #
67
+ # The body is a *tempered* match: it may contain the nested ``<div class="images">``
68
+ # / ``<div class="view">`` tiles but must NOT cross another ``<h3>`` or
69
+ # ``<div class="col">`` -- otherwise the lazy ``.*?`` would, on a *generation*
70
+ # fixture, run forward across many cards until it found the next editing
71
+ # fixture's edit-diff column and swallow every GT column in between.
72
+ _GT_COL_RE = re.compile(
73
+ r'<div class="col">\s*<h3>Ground Truth</h3>'
74
+ r'(?P<body>(?:(?!<h3>|<div class="col">).)*?)</div>'
75
+ r'(?=\s*<div class="col">\s*<h3>Output vs ground truth \(edit diff\))',
76
+ re.DOTALL,
77
+ )
78
+ _FIXTURE_RE = re.compile(r"/gt/([^/\"]+)/renders/")
79
+
80
+
81
+ def _new_gt_col(fixture: str) -> str:
82
+ """The replacement GT column, matching single_run._render_gt_edit_diff."""
83
+ tile = (
84
+ '<div class="images">'
85
+ '<div class="view"><span class="imgwrap">'
86
+ f'<img src="/gt/{fixture}/renders/edit_diff_gt.webp" '
87
+ 'alt="ground truth (edit diff)" class="zoomable" loading="lazy">'
88
+ "</span><span>correct change</span></div>"
89
+ "</div>"
90
+ )
91
+ return (
92
+ '<div class="col">\n'
93
+ f"<h3>Ground truth (correct change) {_GT_LEGEND_HTML}</h3>\n"
94
+ f"{tile}\n"
95
+ "</div>"
96
+ )
97
+
98
+
99
+ def patch_html(doc: str) -> tuple[str | None, int]:
100
+ """Return ``(patched_html_or_None, n_columns_swapped)``.
101
+
102
+ Swaps every editing fixture's GT column for the answer-key turntable.
103
+ Returns ``None`` when nothing matched (already patched, or no editing
104
+ fixtures), so the caller skips the re-upload.
105
+ """
106
+ count = 0
107
+
108
+ def repl(m: re.Match) -> str:
109
+ nonlocal count
110
+ fx = _FIXTURE_RE.search(m.group("body"))
111
+ if not fx:
112
+ return m.group(0) # no GT render to read the fixture id from; leave
113
+ count += 1
114
+ return _new_gt_col(fx.group(1))
115
+
116
+ new = _GT_COL_RE.sub(repl, doc)
117
+ if count == 0 or new == doc:
118
+ return None, 0
119
+ return new, count
120
+
121
+
122
+ def _self_check() -> None:
123
+ """Fail loudly if the live generator's GT answer-key markup has drifted."""
124
+ try:
125
+ from cadgenbench.eval.report import single_run as sr # noqa: PLC0415
126
+ except Exception as e: # noqa: BLE001 - generator not importable here
127
+ logger.info(" (skipped self-check: cadgenbench not importable: %s)", e)
128
+ return
129
+ live_legend = sr._legend_html(sr._GT_EDIT_DIFF_LEGEND)
130
+ if live_legend != _GT_LEGEND_HTML:
131
+ raise SystemExit(
132
+ "GT legend drift: backfill string no longer matches single_run.\n"
133
+ f" live: {live_legend}\n here: {_GT_LEGEND_HTML}"
134
+ )
135
+ # The tile markup must match too (build one for a sample fixture id).
136
+ live_tile = sr._render_gt_edit_diff(None, base_url="/gt/999")
137
+ expected = _new_gt_col("999")
138
+ if live_tile not in expected:
139
+ raise SystemExit(
140
+ "GT tile drift: backfill markup no longer matches single_run.\n"
141
+ f" live: {live_tile}\n here: {expected}"
142
+ )
143
+ logger.info(" self-check OK (markup matches live generator)")
144
+
145
+
146
+ def main() -> int:
147
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
148
+ parser = argparse.ArgumentParser(description=__doc__)
149
+ parser.add_argument(
150
+ "--apply", action="store_true",
151
+ help="Re-upload patched reports (default is a dry run).",
152
+ )
153
+ args = parser.parse_args()
154
+
155
+ _self_check()
156
+
157
+ rows = _load_rows_from_hub()
158
+ targets = [
159
+ r for r in rows
160
+ if r.get("submission_id")
161
+ and _report_relative_url(
162
+ r.get("submission_id"), r.get("status"), r.get("submission_sha256"),
163
+ )
164
+ ]
165
+ logger.info("Found %d report(s) to consider.", len(targets))
166
+
167
+ ops: list[CommitOperationAdd] = []
168
+ skipped = 0
169
+ for row in targets:
170
+ sid = row["submission_id"]
171
+ try:
172
+ local = hf_hub_download(
173
+ repo_id=HF_SUBMISSIONS_REPO,
174
+ repo_type="dataset",
175
+ filename=f"{REPORTS_DIR}/{sid}.html",
176
+ )
177
+ except Exception as e: # noqa: BLE001
178
+ logger.warning(" skip %s: could not fetch report (%s)", sid, e)
179
+ skipped += 1
180
+ continue
181
+ doc = Path(local).read_text(encoding="utf-8")
182
+ patched, n = patch_html(doc)
183
+ if patched is None:
184
+ logger.info(" unchanged %s", sid)
185
+ skipped += 1
186
+ continue
187
+ logger.info(" patched %s (%d editing GT column(s))", sid, n)
188
+ ops.append(
189
+ CommitOperationAdd(
190
+ path_in_repo=f"{REPORTS_DIR}/{sid}.html",
191
+ path_or_fileobj=patched.encode("utf-8"),
192
+ )
193
+ )
194
+
195
+ logger.info("%d to patch, %d unchanged/skipped.", len(ops), skipped)
196
+ if not ops:
197
+ logger.info("Nothing to do.")
198
+ return 0
199
+ if not args.apply:
200
+ logger.info("Dry run -- re-run with --apply to upload.")
201
+ return 0
202
+
203
+ HfApi().create_commit(
204
+ repo_id=HF_SUBMISSIONS_REPO,
205
+ repo_type="dataset",
206
+ operations=ops,
207
+ commit_message="reports: show GT answer-key edit diff for editing fixtures",
208
+ )
209
+ logger.info("Uploaded %d patched report(s).", len(ops))
210
+ return 0
211
+
212
+
213
+ if __name__ == "__main__":
214
+ sys.exit(main())