Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

File size: 6,307 Bytes

c7f83a5

#!/usr/bin/env python3
"""Backfill the static edit-diff still (``edit_diff.png``) into the render bucket.

Editing samples ship an *animated* ``edit_diff.webp`` turntable but no static
frame. The grid thumbnail needs a still (an animated WebP can't be frozen to one
angle in HTML), so the eval pipeline now also writes ``edit_diff.png`` (frame 0)
beside the clip — but submissions evaluated *before* that change only have the
WebP in the bucket.

This one-time tool closes that gap **without re-evaluating or re-rendering**: it
lists the public render bucket, finds every ``.../<fixture>/edit_diff.webp`` that
has no sibling ``edit_diff.png``, downloads the WebP, extracts frame 0 via the
shared :func:`cadgenbench.common.imaging.first_frame_png` (the exact frame the
forward pipeline now saves), and uploads ``edit_diff.png`` next to it in the same
bucket prefix — so it serves by the same anonymous render URL as every other
render.

Run (needs a write-scoped ``HF_TOKEN`` for the bucket)::

    HF_TOKEN=<write-token> python tools/backfill_edit_diff_still.py            # all submissions
    HF_TOKEN=<write-token> python tools/backfill_edit_diff_still.py --submission <id>
    python tools/backfill_edit_diff_still.py --dry-run                          # list only, no token needed
"""
from __future__ import annotations

import argparse
import os
import sys
import tempfile
import urllib.request
from pathlib import Path

from huggingface_hub import HfApi

# cadgenbench (for the shared frame extractor) must be importable; allow running
# straight from the repo without installing the leaderboard package.
# The source tree is looked up relative to this file: two directory levels above
# the folder that contains this script, then ``cadgenbench/src``.
_REPO_ROOT = Path(__file__).resolve().parents[2]
_SRC = _REPO_ROOT / "cadgenbench" / "src"
if _SRC.is_dir():
    # Prepend (index 0) so this source tree is searched before other entries.
    # When the directory is absent nothing is added and the import below
    # relies on cadgenbench already being on ``sys.path``.
    sys.path.insert(0, str(_SRC))

# Deliberately placed after the sys.path tweak above (hence the E402 waiver).
from cadgenbench.common.imaging import first_frame_png  # noqa: E402

# Hub base URL (overridable via the environment); trailing slashes are stripped
# so URLs can be assembled with a single "/" separator.
HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co").rstrip("/")
# Organisation name, used only to build the default bucket id below.
HF_ORG = os.getenv("HF_ORG", "HuggingAI4Engineering")
# Mirror leaderboard.py's defaults so the tool targets the same bucket the eval
# job uploads to and the report/gallery read from.
RENDER_BUCKET = os.getenv("HF_RENDER_BUCKET", f"{HF_ORG}/cadgenbench-eval-staging")
# Bucket-relative root that is scanned; one submission lives under renders/<id>.
RENDER_PREFIX = "renders"
# File name of the existing animated clip, and of the still written beside it.
WEBP_NAME = "edit_diff.webp"
PNG_NAME = "edit_diff.png"
# One upload call per this many files: keeps an individual batch small and
# rate-limit friendly while still amortising the request overhead.
UPLOAD_CHUNK = 50


def _resolve_url(path: str) -> str:
    """Anonymous bucket resolve URL for a bucket-relative object path.

    The path is percent-encoded (``/`` is kept as the segment separator) so
    that object names containing spaces, ``#``, ``?``, ``%`` or non-ASCII
    characters still produce a valid URL that addresses the intended object.
    Paths made only of unreserved characters are returned unchanged.

    Args:
        path: Object path relative to the bucket root.

    Returns:
        The absolute URL under ``HF_ENDPOINT`` for that object.
    """
    # Local import: the file only imports ``urllib.request`` at module level.
    from urllib.parse import quote

    return f"{HF_ENDPOINT}/buckets/{RENDER_BUCKET}/resolve/{quote(path, safe='/')}"


def _download(path: str, token: str | None) -> bytes:
    """Fetch one bucket object and return its raw bytes.

    Args:
        path: Object path relative to the bucket root.
        token: Optional bearer token; when falsy the request is sent
            without an ``Authorization`` header.

    Returns:
        The complete response body.
    """
    auth_headers = {"Authorization": f"Bearer {token}"} if token else {}
    request = urllib.request.Request(_resolve_url(path), headers=auth_headers)
    with urllib.request.urlopen(request, timeout=60) as response:
        payload = response.read()
    return payload


def _list_entries(api: HfApi, prefix: str, token: str | None) -> list[str]:
    """List the file paths found under *prefix* in the render bucket.

    The bucket tree is walked recursively. Entries are dropped when they have
    no (or an empty) ``path``, are flagged ``is_folder``, or have a path
    ending in ``/``; everything else is returned in listing order.

    Args:
        api: Hub client used for the listing call.
        prefix: Bucket-relative prefix to list.
        token: Optional token forwarded to the listing call.

    Returns:
        Bucket-relative paths of the remaining (file) entries.
    """
    tree = api.list_bucket_tree(
        RENDER_BUCKET, prefix=prefix, recursive=True, token=token,
    )
    files: list[str] = []
    for entry in tree:
        entry_path = getattr(entry, "path", None)
        if not entry_path:
            continue
        if getattr(entry, "is_folder", False):
            continue
        if entry_path.endswith("/"):
            continue
        files.append(entry_path)
    return files


def _missing_stills(paths: list[str]) -> list[str]:
    """Return, sorted, the clip paths that have no still next to them.

    A clip is any path ending in ``/<WEBP_NAME>``; its still is the same path
    with the file name swapped for ``PNG_NAME``. A clip is reported when that
    still path does not occur anywhere in *paths*.
    """
    known = frozenset(paths)
    clip_suffix = "/" + WEBP_NAME
    name_len = len(WEBP_NAME)
    return sorted(
        clip
        for clip in paths
        if clip.endswith(clip_suffix)
        and clip[:-name_len] + PNG_NAME not in known
    )


def _upload_batch(api: HfApi, batch: list[tuple[str, str]], token: str | None) -> int:
    """Upload one batch of ``(local_path, bucket_path)`` pairs, then free the temp files.

    The local PNGs are deleted once the upload call has returned, so disk use
    stays bounded by one batch instead of growing for the whole run.

    Returns:
        The number of files in the uploaded batch.
    """
    api.batch_bucket_files(RENDER_BUCKET, add=batch, token=token)
    for local_path, _ in batch:
        Path(local_path).unlink(missing_ok=True)
    return len(batch)


def main() -> int:
    """Command-line entry point: find clips without a still and backfill them.

    Steps: parse arguments, list the bucket under the chosen prefix, work out
    which ``edit_diff.webp`` files lack a sibling ``edit_diff.png``, then
    (unless ``--dry-run``) download each clip, extract its first frame and
    upload the PNGs in batches of ``UPLOAD_CHUNK``.

    Returns:
        ``0`` on completion. Invalid arguments or a missing ``HF_TOKEN`` for a
        real run exit through ``parser.error``. Clips that fail to download or
        decode are skipped and reported, not fatal.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's layout (usage examples) intact in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--submission",
        help="Limit to one submission id (the prefix is renders/<id>). "
             "Omit to scan every submission in the bucket.",
    )
    parser.add_argument("--limit", type=int, default=None,
                        help="Process at most N stills (after listing).")
    parser.add_argument(
        "--dry-run", action="store_true",
        help="List what would be created; download/upload nothing.",
    )
    args = parser.parse_args()
    # A negative value would otherwise slice from the end (e.g. -1 silently
    # drops the last item), which is never what "at most N" means.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer.")

    token = os.environ.get("HF_TOKEN")
    api = HfApi(token=token)
    prefix = (
        f"{RENDER_PREFIX}/{args.submission}" if args.submission else RENDER_PREFIX
    )

    print(f"Scanning bucket {RENDER_BUCKET} under {prefix}/ …", flush=True)
    paths = _list_entries(api, prefix, token)
    todo = _missing_stills(paths)
    # Remember the true count before --limit truncates the work list, so the
    # summary below reports what is actually missing in the bucket.
    n_missing = len(todo)
    if args.limit is not None:
        todo = todo[: args.limit]

    n_webp = sum(1 for p in paths if p.endswith("/" + WEBP_NAME))
    print(
        f"Found {n_webp} edit_diff.webp; {n_missing} missing a still.",
        flush=True,
    )
    if len(todo) < n_missing:
        print(f"Limiting this run to the first {len(todo)} (--limit).", flush=True)
    if not todo:
        print("Nothing to backfill.")
        return 0
    if args.dry_run:
        for p in todo:
            print("  would create", p[: -len(WEBP_NAME)] + PNG_NAME)
        return 0
    # Listing and dry-run work without a token; only uploading needs one.
    if not token:
        parser.error("HF_TOKEN required to upload (or pass --dry-run).")

    created = 0
    skipped = 0
    with tempfile.TemporaryDirectory(prefix="edit-diff-still-") as tmp:
        tmp_dir = Path(tmp)
        add: list[tuple[str, str]] = []
        for i, webp_path in enumerate(todo, start=1):
            dest = webp_path[: -len(WEBP_NAME)] + PNG_NAME
            try:
                png = first_frame_png(_download(webp_path, token))
            except Exception as e:  # noqa: BLE001 - skip a bad clip, keep going
                print(f"  [{i}/{len(todo)}] SKIP {webp_path} ({type(e).__name__}: {e})")
                skipped += 1
                continue
            # The 1-based index is unique per clip, so names never collide.
            local = tmp_dir / f"{i}.png"
            local.write_bytes(png)
            add.append((str(local), dest))
            print(f"  [{i}/{len(todo)}] {dest} ({len(png) // 1024} KB)", flush=True)
            if len(add) >= UPLOAD_CHUNK:
                created += _upload_batch(api, add, token)
                add = []
        # Flush the final, partially filled batch.
        if add:
            created += _upload_batch(api, add, token)

    print(f"Done. Uploaded {created} edit_diff.png still(s).", flush=True)
    if skipped:
        print(
            f"Skipped {skipped} clip(s) that failed to download or decode.",
            flush=True,
        )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())