Michael Rabinovich Cursor committed on
Commit
1a24d4a
·
1 Parent(s): 95f3ee8

leaderboard: bump cadgenbench to 1010043 + legend backfill tool

Browse files

Pin to the report generator with color-chip interface/edit-diff legends, and
add tools/backfill_report_legends.py to retro-patch the corrected legends into
already-published reports/<id>.html in place (no re-eval).

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (2) hide show
  1. Dockerfile +1 -1
  2. tools/backfill_report_legends.py +172 -0
Dockerfile CHANGED
@@ -41,7 +41,7 @@ RUN pip install --no-cache-dir -r /tmp/requirements.txt \
41
  # image rebuild picks up the latest code (pre-v1: always-updated). Lock
42
  # to a specific commit SHA at the v1 release so published scores are
43
  # reproducible (see space-setup/post-gt-swap.md Stage F).
44
- ARG CADGENBENCH_SHA=0c7690e
45
  # Cache-bust the install below whenever the tracked ref moves: the
46
  # GitHub commits endpoint's response changes with each new commit on
47
  # `main`, so BuildKit re-fetches and invalidates the cached pip layer.
 
41
  # image rebuild picks up the latest code (pre-v1: always-updated). Lock
42
  # to a specific commit SHA at the v1 release so published scores are
43
  # reproducible (see space-setup/post-gt-swap.md Stage F).
44
+ ARG CADGENBENCH_SHA=1010043
45
  # Cache-bust the install below whenever the tracked ref moves: the
46
  # GitHub commits endpoint's response changes with each new commit on
47
  # `main`, so BuildKit re-fetches and invalidates the cached pip layer.
tools/backfill_report_legends.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Backfill the corrected interface/edit-diff legends into existing reports.
3
+
4
+ The report generator now renders color-chip legends that match the render
5
+ palettes (see ``cadgenbench`` ``eval/report/single_run.py``):
6
+
7
+ - **Interface overlay**: the old legend omitted the dominant blue (the part
8
+ itself) and used vague "free/filled sub-volumes" wording. The new legend is
9
+ ``your part / keep-out (must stay empty) / keep-in (must be filled) /
10
+ disagreement`` with matching chips.
11
+ - **Edit diff** (editing fixtures): previously had no legend; the new one is
12
+ ``your output / extra material vs GT / missing material vs GT``.
13
+
14
+ Reports published before that change still carry the old/absent legends. Rather
15
+ than re-running ``evaluate``, this one-off tool patches the already-stored
16
+ ``reports/<id>.html`` files in place: it swaps the old interface legend, adds
17
+ the edit-diff legend, and injects the chip CSS, then re-uploads.
18
+
19
+ Idempotent: a report already carrying the new chips (``.legend-chip``) has its
20
+ interface/edit anchors absent, so re-running is a no-op.
21
+
22
+ Usage (dry-run lists what would change; nothing is written)::
23
+
24
+ python tools/backfill_report_legends.py
25
+
26
+ # actually patch + re-upload (needs a write-scoped HF_TOKEN):
27
+ python tools/backfill_report_legends.py --apply
28
+ """
29
+ from __future__ import annotations
30
+
31
+ import argparse
32
+ import logging
33
+ import re
34
+ import sys
35
+ from pathlib import Path
36
+
37
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
38
+
39
+ from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download # noqa: E402
40
+
41
+ from leaderboard import ( # noqa: E402
42
+ HF_SUBMISSIONS_REPO,
43
+ _load_rows_from_hub,
44
+ _report_relative_url,
45
+ )
46
+ from submit import REPORTS_DIR # noqa: E402
47
+
48
+ logger = logging.getLogger(__name__)
49
+
50
+ # These three strings MUST stay byte-for-byte identical to what
51
+ # single_run.py emits (_legend_html(_IFACE_LEGEND) / _legend_html(
52
+ # _EDIT_DIFF_LEGEND) and the .legend/.legend-chip CSS) so a patched report is
53
+ # indistinguishable from a freshly-generated one. Nothing in this script
54
+ # verifies that automatically, so re-check them when the generator changes.
55
# Replacement legend for the interface overlay: one color chip per entry,
# each chip immediately followed by its label.
_IFACE_LEGEND_HTML = "".join((
    '<span class="legend">',
    '<span class="legend-chip" style="background:#2e73db"></span>your part',
    '<span class="legend-chip" style="background:#e64d4d"></span>',
    "keep-out (must stay empty)",
    '<span class="legend-chip" style="background:#33b34d"></span>',
    "keep-in (must be filled)",
    '<span class="legend-chip" style="background:#ffd900"></span>disagreement',
    "</span>",
))

# Legend appended to the edit-diff heading (that heading had none before).
_EDIT_LEGEND_HTML = "".join((
    '<span class="legend">',
    '<span class="legend-chip" style="background:#bdc4d1"></span>your output',
    '<span class="legend-chip" style="background:#2173f5"></span>',
    "extra material vs GT",
    '<span class="legend-chip" style="background:#e62929"></span>',
    "missing material vs GT",
    "</span>",
))

# CSS rules for the legend wrapper and its chips; inserted just before the
# first ``</style>`` of a patched report.
_CSS_BLOCK = "".join((
    "\n",
    ".legend { color: #6b7785; font-size: 0.78em; font-weight: 400; ",
    "text-transform: none; letter-spacing: normal; line-height: 1.6; }",
    "\n",
    ".legend-chip { display: inline-block; width: 11px; height: 11px; ",
    "border-radius: 3px; vertical-align: middle; ",
    "margin: 0 5px 0 14px; border: 1px solid rgba(0,0,0,0.18); }",
    "\n",
))

# Matches every old interface legend span, up to its first closing tag
# (DOTALL so a legend that spans several lines still matches).
_OLD_IFACE_RE = re.compile(
    r"<span class='iface-overlay-legend'>.*?</span>",
    flags=re.DOTALL,
)

# Edit-diff heading as published without a legend, and its replacement.
_BARE_EDIT_H3 = "<h3>Output vs ground truth (edit diff)</h3>"
_NEW_EDIT_H3 = (
    "<h3>Output vs ground truth (edit diff) " + _EDIT_LEGEND_HTML + "</h3>"
)
90
+
91
+
92
def patch_html(doc: str) -> str | None:
    """Apply the legend backfill to one report's HTML.

    Every old interface legend span is replaced with the chip legend and the
    bare edit-diff heading gains its legend. When either edit changed the
    document and no ``.legend-chip`` rule is present yet, the chip CSS is
    inserted ahead of the first ``</style>``.

    Returns the rewritten document, or ``None`` when neither legend edit
    changed anything (which is what makes a second run a no-op).
    """
    # A callable replacement keeps ``re`` from interpreting the legend markup
    # as a substitution template.
    with_iface = _OLD_IFACE_RE.sub(lambda _match: _IFACE_LEGEND_HTML, doc)
    patched = with_iface.replace(_BARE_EDIT_H3, _NEW_EDIT_H3)
    if patched == doc:
        return None

    css_missing = ".legend-chip" not in patched
    if css_missing and "</style>" in patched:
        # Split at the first closing tag only, so the rules are added once.
        head, closing_tag, tail = patched.partition("</style>")
        patched = head + _CSS_BLOCK + closing_tag + tail
    return patched
104
+
105
+
106
def main() -> int:
    """Patch legends in published reports; upload only with ``--apply``.

    Loads the leaderboard rows, keeps one row per ``submission_id`` for which
    ``_report_relative_url`` yields a value, downloads each
    ``<REPORTS_DIR>/<id>.html`` from the submissions dataset, and runs
    ``patch_html`` on it. Reports that cannot be fetched are logged and
    skipped. Without ``--apply`` this is a dry run that only logs what would
    change; with it, all patched reports are pushed in a single commit.

    Returns:
        The process exit code; always ``0``.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's bullets and usage examples intact in
        # ``--help``; the default formatter would re-wrap them into one blob.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--apply", action="store_true",
        help="Re-upload patched reports (default is a dry run).",
    )
    args = parser.parse_args()

    rows = _load_rows_from_hub()
    # De-duplicate by submission id: several rows sharing an id would
    # otherwise download the same report repeatedly and queue more than one
    # commit operation for the same path.
    seen_ids = set()
    targets = []
    for r in rows:
        sid = r.get("submission_id")
        if not sid or sid in seen_ids:
            continue
        if not _report_relative_url(
            sid, r.get("status"), r.get("submission_sha256"),
        ):
            continue
        seen_ids.add(sid)
        targets.append(r)
    logger.info("Found %d report(s) to consider.", len(targets))

    ops: list[CommitOperationAdd] = []
    skipped = 0
    for row in targets:
        sid = row["submission_id"]
        report_path = f"{REPORTS_DIR}/{sid}.html"
        try:
            local = hf_hub_download(
                repo_id=HF_SUBMISSIONS_REPO,
                repo_type="dataset",
                filename=report_path,
            )
        except Exception as e:  # noqa: BLE001
            # Best effort: one unfetchable report must not abort the batch.
            logger.warning("  skip %s: could not fetch report (%s)", sid, e)
            skipped += 1
            continue
        doc = Path(local).read_text(encoding="utf-8")
        patched = patch_html(doc)
        if patched is None:
            logger.info("  unchanged %s", sid)
            skipped += 1
            continue
        logger.info("  patched %s", sid)
        ops.append(
            CommitOperationAdd(
                path_in_repo=report_path,
                path_or_fileobj=patched.encode("utf-8"),
            )
        )

    logger.info("%d to patch, %d unchanged/skipped.", len(ops), skipped)
    if not ops:
        logger.info("Nothing to do.")
        return 0
    if not args.apply:
        logger.info("Dry run -- re-run with --apply to upload.")
        return 0

    # One commit for the whole batch.
    HfApi().create_commit(
        repo_id=HF_SUBMISSIONS_REPO,
        repo_type="dataset",
        operations=ops,
        commit_message="reports: backfill corrected interface + edit-diff legends",
    )
    logger.info("Uploaded %d patched report(s).", len(ops))
    return 0
169
+
170
+
171
+ if __name__ == "__main__":
172
+ sys.exit(main())