CADGenBench / tools /backfill_report_legends.py
Michael Rabinovich
leaderboard: bump cadgenbench to 1010043 + legend backfill tool
1a24d4a
#!/usr/bin/env python3
"""Backfill the corrected interface/edit-diff legends into existing reports.
The report generator now renders color-chip legends that match the render
palettes (see ``cadgenbench`` ``eval/report/single_run.py``):
- **Interface overlay**: the old legend omitted the dominant blue (the part
itself) and used vague "free/filled sub-volumes" wording. The new legend is
``your part / keep-out (must stay empty) / keep-in (must be filled) /
disagreement`` with matching chips.
- **Edit diff** (editing fixtures): previously had no legend; the new one is
``your output / extra material vs GT / missing material vs GT``.
Reports published before that change still carry the old/absent legends. Rather
than re-running ``evaluate``, this one-off tool patches the already-stored
``reports/<id>.html`` files in place: it swaps the old interface legend, adds
the edit-diff legend, and injects the chip CSS, then re-uploads.
Idempotent: a report already carrying the new chips (``.legend-chip``) has its
interface/edit anchors absent, so re-running is a no-op.
Usage (dry-run lists what would change; nothing is written)::
python tools/backfill_report_legends.py
# actually patch + re-upload (needs a write-scoped HF_TOKEN):
python tools/backfill_report_legends.py --apply
"""
from __future__ import annotations
import argparse
import logging
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download # noqa: E402
from leaderboard import ( # noqa: E402
HF_SUBMISSIONS_REPO,
_load_rows_from_hub,
_report_relative_url,
)
from submit import REPORTS_DIR # noqa: E402
# Module-level logger; main() installs the handler/format via basicConfig.
logger = logging.getLogger(__name__)
# These three strings MUST stay byte-for-byte identical to what
# single_run.py emits (_legend_html(_IFACE_LEGEND) / _legend_html(
# _EDIT_DIFF_LEGEND) and the .legend/.legend-chip CSS) so a patched report is
# indistinguishable from a freshly-generated one.
# NOTE(review): main() in this file does not compare these literals against
# the live generator, so they have to be kept in sync by hand -- TODO add
# that self-check (guarded on the generator being importable).

# Replacement interface-overlay legend: one colour chip per entry, in the
# order your part / keep-out / keep-in / disagreement.
_IFACE_LEGEND_HTML = (
    '<span class="legend">'
    '<span class="legend-chip" style="background:#2e73db"></span>your part'
    '<span class="legend-chip" style="background:#e64d4d"></span>'
    "keep-out (must stay empty)"
    '<span class="legend-chip" style="background:#33b34d"></span>'
    "keep-in (must be filled)"
    '<span class="legend-chip" style="background:#ffd900"></span>disagreement'
    "</span>"
)
# Edit-diff legend: your output / extra material vs GT / missing material
# vs GT. It is embedded into the edit-diff heading (see _NEW_EDIT_H3).
_EDIT_LEGEND_HTML = (
    '<span class="legend">'
    '<span class="legend-chip" style="background:#bdc4d1"></span>your output'
    '<span class="legend-chip" style="background:#2173f5"></span>'
    "extra material vs GT"
    '<span class="legend-chip" style="background:#e62929"></span>'
    "missing material vs GT"
    "</span>"
)
# CSS rules for ``.legend`` and ``.legend-chip``. Starts with a newline so it
# can be placed directly in front of a closing ``</style>`` tag.
_CSS_BLOCK = (
    "\n.legend { color: #6b7785; font-size: 0.78em; font-weight: 400; "
    "text-transform: none; letter-spacing: normal; line-height: 1.6; }\n"
    ".legend-chip { display: inline-block; width: 11px; height: 11px; "
    "border-radius: 3px; vertical-align: middle; "
    "margin: 0 5px 0 14px; border: 1px solid rgba(0,0,0,0.18); }\n"
)
# Old interface legend span (any per-fixture occurrence). The match is
# non-greedy and ends at the first ``</span>``; DOTALL lets the legend text
# span several lines. Note the single-quoted class attribute: the new legend
# uses double quotes, so it is never re-matched.
_OLD_IFACE_RE = re.compile(
    r"<span class='iface-overlay-legend'>.*?</span>",
    re.DOTALL,
)
# Bare edit-diff heading (no legend yet).
_BARE_EDIT_H3 = "<h3>Output vs ground truth (edit diff)</h3>"
# Same heading with the edit-diff legend appended inside the <h3>; it no
# longer contains _BARE_EDIT_H3, so the replacement cannot apply twice.
_NEW_EDIT_H3 = f"<h3>Output vs ground truth (edit diff) {_EDIT_LEGEND_HTML}</h3>"
def patch_html(doc: str) -> str | None:
    """Apply the legend backfill to a single report document.

    Every old interface-overlay legend span is replaced by the new chip
    legend, and every bare edit-diff heading is replaced by the heading that
    carries its legend. If either replacement altered the text, the chip CSS
    is inserted in front of the first ``</style>``, unless the document
    already contains ``.legend-chip`` or has no ``</style>`` at all.

    Args:
        doc: Full HTML text of one stored report.

    Returns:
        The rewritten HTML, or ``None`` when neither anchor was found. Once a
        report is patched its anchors are gone, so a second pass returns
        ``None``.
    """
    # A callable replacement keeps re.sub from interpreting backslashes in
    # the legend markup as group references.
    with_iface = _OLD_IFACE_RE.sub(lambda _match: _IFACE_LEGEND_HTML, doc)
    result = with_iface.replace(_BARE_EDIT_H3, _NEW_EDIT_H3)

    if result == doc:
        return None

    # The leading dot only occurs in a CSS selector, never in the injected
    # class="legend-chip" attributes, so this detects an existing rule.
    css_missing = ".legend-chip" not in result
    if css_missing and "</style>" in result:
        before, closing_tag, after = result.partition("</style>")
        result = before + _CSS_BLOCK + closing_tag + after
    return result
def main() -> int:
    """Run the backfill from the command line.

    Loads the leaderboard rows from the hub, keeps those that have a
    submission id and a report URL, downloads each stored report, and runs
    it through ``patch_html``. Reports that cannot be fetched or need no
    change are counted as skipped. Without ``--apply`` the run stops after
    listing what would change; with it, all patched reports are pushed in
    one commit to the submissions dataset repo.

    Returns:
        Always ``0``.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--apply",
        action="store_true",
        help="Re-upload patched reports (default is a dry run).",
    )
    args = cli.parse_args()

    # Only rows with an id and a resolvable report URL are worth fetching.
    targets = []
    for candidate in _load_rows_from_hub():
        if not candidate.get("submission_id"):
            continue
        report_url = _report_relative_url(
            candidate.get("submission_id"),
            candidate.get("status"),
            candidate.get("submission_sha256"),
        )
        if report_url:
            targets.append(candidate)
    logger.info("Found %d report(s) to consider.", len(targets))

    ops: list[CommitOperationAdd] = []
    skipped = 0
    for row in targets:
        sid = row["submission_id"]
        report_path = f"{REPORTS_DIR}/{sid}.html"

        # Best effort: one unreachable report must not abort the whole run.
        try:
            cached_copy = hf_hub_download(
                repo_id=HF_SUBMISSIONS_REPO,
                repo_type="dataset",
                filename=report_path,
            )
        except Exception as exc:  # noqa: BLE001
            logger.warning(" skip %s: could not fetch report (%s)", sid, exc)
            skipped += 1
            continue

        original_html = Path(cached_copy).read_text(encoding="utf-8")
        patched = patch_html(original_html)
        if patched is None:
            logger.info(" unchanged %s", sid)
            skipped += 1
            continue

        logger.info(" patched %s", sid)
        upload = CommitOperationAdd(
            path_in_repo=report_path,
            path_or_fileobj=patched.encode("utf-8"),
        )
        ops.append(upload)

    logger.info("%d to patch, %d unchanged/skipped.", len(ops), skipped)

    if not ops:
        logger.info("Nothing to do.")
    elif not args.apply:
        logger.info("Dry run -- re-run with --apply to upload.")
    else:
        # A single commit carries every patched report.
        HfApi().create_commit(
            repo_id=HF_SUBMISSIONS_REPO,
            repo_type="dataset",
            operations=ops,
            commit_message="reports: backfill corrected interface + edit-diff legends",
        )
        logger.info("Uploaded %d patched report(s).", len(ops))
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())