CADGenBench / tools /backfill_edit_diff_still.py
Michael Rabinovich
leaderboard: backfill tool for edit_diff.png stills in render bucket
c7f83a5
#!/usr/bin/env python3
"""Backfill the static edit-diff still (``edit_diff.png``) into the render bucket.
Editing samples ship an *animated* ``edit_diff.webp`` turntable but no static
frame. The grid thumbnail needs a still (an animated WebP can't be frozen to one
angle in HTML), so the eval pipeline now also writes ``edit_diff.png`` (frame 0)
beside the clip — but submissions evaluated *before* that change only have the
WebP in the bucket.
This one-time tool closes that gap **without re-evaluating or re-rendering**: it
lists the public render bucket, finds every ``.../<fixture>/edit_diff.webp`` that
has no sibling ``edit_diff.png``, downloads the WebP, extracts frame 0 via the
shared :func:`cadgenbench.common.imaging.first_frame_png` (the exact frame the
forward pipeline now saves), and uploads ``edit_diff.png`` next to it in the same
bucket prefix — so it serves by the same anonymous render URL as every other
render.
Run (needs a write-scoped ``HF_TOKEN`` for the bucket)::

    HF_TOKEN=<write-token> python tools/backfill_edit_diff_still.py  # all submissions
    HF_TOKEN=<write-token> python tools/backfill_edit_diff_still.py --submission <id>
    python tools/backfill_edit_diff_still.py --dry-run  # list only, no token needed
"""
from __future__ import annotations
import argparse
import os
import sys
import tempfile
import urllib.request
from pathlib import Path
from huggingface_hub import HfApi
# The shared frame extractor is imported from the cadgenbench package. To let
# this script run from a checkout where that package is not installed, look for
# a ``cadgenbench/src`` tree two directory levels above this file's folder and,
# if it exists, make it importable.
_REPO_ROOT = Path(__file__).resolve().parents[2]
_SRC = _REPO_ROOT.joinpath("cadgenbench", "src")
if _SRC.is_dir():
    # Index 0 so this tree is searched before the rest of sys.path.
    sys.path.insert(0, str(_SRC))
from cadgenbench.common.imaging import first_frame_png # noqa: E402
# Hub endpoint and owning organisation; both can be overridden through the
# environment. Trailing slashes are stripped from the endpoint so URL pieces
# can be joined with a single "/".
HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co").rstrip("/")
HF_ORG = os.environ.get("HF_ORG", "HuggingAI4Engineering")

# Bucket that holds the renders. The default is meant to match leaderboard.py's
# so this tool writes where the eval job uploads and the report/gallery read.
RENDER_BUCKET = os.environ.get("HF_RENDER_BUCKET", HF_ORG + "/cadgenbench-eval-staging")

# Top-level folder inside the bucket that contains one sub-folder per submission.
RENDER_PREFIX = "renders"

# Object names of the animated clip and of the static still written beside it.
WEBP_NAME = "edit_diff.webp"
PNG_NAME = "edit_diff.png"

# Number of files sent per upload call: small enough to stay rate-limit
# friendly, large enough to amortise the per-request overhead.
UPLOAD_CHUNK = 50
def _resolve_url(path: str) -> str:
    """Anonymous bucket resolve URL for a bucket-relative object path.

    The path is percent-encoded, keeping ``/`` as the separator, so object
    names containing spaces, ``#``, ``?`` or non-ASCII characters still yield a
    valid URL that addresses the full object path. Paths made only of
    unreserved characters and ``/`` come out unchanged.

    Args:
        path: Object path relative to the bucket root.

    Returns:
        The absolute URL of the object under ``HF_ENDPOINT``.
    """
    # Imported here so the function does not depend on ``urllib.parse`` having
    # been loaded as a side effect of ``import urllib.request``.
    from urllib.parse import quote

    return f"{HF_ENDPOINT}/buckets/{RENDER_BUCKET}/resolve/{quote(path)}"
def _download(path: str, token: str | None) -> bytes:
    """Fetch one bucket object and return its raw bytes.

    An ``Authorization: Bearer <token>`` header is sent when *token* is truthy;
    otherwise the request carries no credentials. The request uses a 60-second
    socket timeout, and errors raised by ``urllib`` propagate to the caller.
    """
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    request = urllib.request.Request(_resolve_url(path), headers=headers)
    with urllib.request.urlopen(request, timeout=60) as response:
        return response.read()
def _list_entries(api: HfApi, prefix: str, token: str | None) -> list[str]:
    """List the bucket-relative paths of every file below *prefix*.

    The render bucket is walked recursively. An entry is dropped when it has no
    (or an empty) ``path``, when its ``is_folder`` attribute is truthy, or when
    its path ends with ``/``.
    """
    listing = api.list_bucket_tree(
        RENDER_BUCKET,
        prefix=prefix,
        recursive=True,
        token=token,
    )
    file_paths: list[str] = []
    for entry in listing:
        entry_path = getattr(entry, "path", None)
        if not entry_path:
            continue
        if getattr(entry, "is_folder", False) or entry_path.endswith("/"):
            continue
        file_paths.append(entry_path)
    return file_paths
def _missing_stills(paths: list[str]) -> list[str]:
    """Return, sorted, every clip path that has no still beside it.

    A clip is any entry of *paths* ending in ``/edit_diff.webp``. Its still is
    the same path with the file name replaced by ``edit_diff.png``, and it
    counts as missing when that path does not occur in *paths*.
    """
    known = frozenset(paths)
    clip_suffix = "/" + WEBP_NAME
    name_len = len(WEBP_NAME)
    return sorted(
        clip
        for clip in paths
        if clip.endswith(clip_suffix)
        and clip[:-name_len] + PNG_NAME not in known
    )
def main() -> int:
    """Run the backfill from the command line and return the exit status.

    Steps: parse the arguments, list the render bucket under the selected
    prefix, determine which ``edit_diff.webp`` clips lack an ``edit_diff.png``
    sibling and then, unless ``--dry-run`` is given, download each clip,
    extract its first frame and upload the PNG next to it in batches of
    ``UPLOAD_CHUNK`` files.

    A clip whose download or frame extraction fails is reported and skipped;
    the run continues with the next one. An upload failure is not caught and
    aborts the run.

    Returns:
        ``0`` when the run completes, including when there was nothing to
        backfill or some clips were skipped. Invalid arguments and a missing
        ``HF_TOKEN`` for a real run exit through ``parser.error``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--submission",
        help="Limit to one submission id (the prefix is renders/<id>). "
        "Omit to scan every submission in the bucket.",
    )
    parser.add_argument("--limit", type=int, default=None,
                        help="Process at most N stills (after listing).")
    parser.add_argument(
        "--dry-run", action="store_true",
        help="List what would be created; download/upload nothing.",
    )
    args = parser.parse_args()
    if args.limit is not None and args.limit < 0:
        # A negative N would be used as a slice end below and silently drop
        # the last |N| stills instead of capping the run.
        parser.error("--limit must be zero or a positive integer.")

    token = os.environ.get("HF_TOKEN")
    api = HfApi(token=token)
    prefix = (
        f"{RENDER_PREFIX}/{args.submission}" if args.submission else RENDER_PREFIX
    )
    print(f"Scanning bucket {RENDER_BUCKET} under {prefix}/ …", flush=True)
    paths = _list_entries(api, prefix, token)

    missing = _missing_stills(paths)
    todo = missing if args.limit is None else missing[: args.limit]
    n_webp = sum(1 for p in paths if p.endswith("/" + WEBP_NAME))
    # Report the real number of missing stills, not the --limit-capped one.
    print(
        f"Found {n_webp} edit_diff.webp; {len(missing)} missing a still.",
        flush=True,
    )
    if len(todo) < len(missing):
        print(f"Limiting this run to the first {len(todo)}.", flush=True)
    if not todo:
        print("Nothing to backfill.")
        return 0
    if args.dry_run:
        for p in todo:
            print(" would create", p[: -len(WEBP_NAME)] + PNG_NAME)
        return 0
    if not token:
        parser.error("HF_TOKEN required to upload (or pass --dry-run).")

    def upload(batch: list[tuple[str, str]]) -> int:
        """Upload one batch, drop its temp files, and return the batch size."""
        api.batch_bucket_files(RENDER_BUCKET, add=batch, token=token)
        for local_file, _dest in batch:
            try:
                # Free disk space as we go rather than holding every PNG
                # until the end of the run.
                os.unlink(local_file)
            except OSError:
                # Best effort: the temporary directory is removed on exit.
                pass
        return len(batch)

    created = 0
    skipped = 0
    with tempfile.TemporaryDirectory(prefix="edit-diff-still-") as tmp:
        tmp_dir = Path(tmp)
        add: list[tuple[str, str]] = []
        for i, webp_path in enumerate(todo, start=1):
            dest = webp_path[: -len(WEBP_NAME)] + PNG_NAME
            try:
                png = first_frame_png(_download(webp_path, token))
            except Exception as e:  # noqa: BLE001 - skip a bad clip, keep going
                skipped += 1
                print(
                    f" [{i}/{len(todo)}] SKIP {webp_path} ({type(e).__name__}: {e})",
                    flush=True,
                )
                continue
            local = tmp_dir / f"{i}.png"
            local.write_bytes(png)
            add.append((str(local), dest))
            print(f" [{i}/{len(todo)}] {dest} ({len(png) // 1024} KB)", flush=True)
            if len(add) >= UPLOAD_CHUNK:
                created += upload(add)
                add = []
        if add:
            # Final, partially filled batch.
            created += upload(add)
    print(f"Done. Uploaded {created} edit_diff.png still(s).", flush=True)
    if skipped:
        print(
            f"Skipped {skipped} clip(s) that could not be downloaded or decoded.",
            flush=True,
        )
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())