#!/usr/bin/env python3
"""Backfill the static edit-diff still (``edit_diff.png``) into the render bucket.
Editing samples ship an *animated* ``edit_diff.webp`` turntable but no static
frame. The grid thumbnail needs a still (an animated WebP can't be frozen to one
angle in HTML), so the eval pipeline now also writes ``edit_diff.png`` (frame 0)
beside the clip — but submissions evaluated *before* that change only have the
WebP in the bucket.
This one-time tool closes that gap **without re-evaluating or re-rendering**: it
lists the public render bucket, finds every ``.../<fixture>/edit_diff.webp`` that
has no sibling ``edit_diff.png``, downloads the WebP, extracts frame 0 via the
shared :func:`cadgenbench.common.imaging.first_frame_png` (the exact frame the
forward pipeline now saves), and uploads ``edit_diff.png`` next to it in the same
bucket prefix — so it serves by the same anonymous render URL as every other
render.
Run (needs a write-scoped ``HF_TOKEN`` for the bucket)::

    HF_TOKEN=<write-token> python tools/backfill_edit_diff_still.py                    # all submissions
    HF_TOKEN=<write-token> python tools/backfill_edit_diff_still.py --submission <id>
    python tools/backfill_edit_diff_still.py --dry-run                                 # list only, no token needed
"""
from __future__ import annotations
import argparse
import os
import sys
import tempfile
import urllib.request
from pathlib import Path
from huggingface_hub import HfApi
# The shared frame extractor is imported from cadgenbench. When a
# ``cadgenbench/src`` checkout exists two directory levels above this file, put
# it at the front of ``sys.path`` so the tool can run straight from the repo
# without installing the leaderboard package.
_REPO_ROOT = Path(__file__).resolve().parents[2]
_SRC = _REPO_ROOT.joinpath("cadgenbench", "src")
if _SRC.is_dir():
    sys.path.insert(0, os.fspath(_SRC))
from cadgenbench.common.imaging import first_frame_png # noqa: E402
# Hub base URL (trailing slashes stripped) and owning organisation; both can be
# overridden through the environment.
HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co").rstrip("/")
HF_ORG = os.environ.get("HF_ORG", "HuggingAI4Engineering")

# Mirror leaderboard.py's defaults so the tool targets the same bucket the eval
# job uploads to and the report/gallery read from.
RENDER_BUCKET = os.environ.get("HF_RENDER_BUCKET", f"{HF_ORG}/cadgenbench-eval-staging")
RENDER_PREFIX = "renders"

# File names of the animated clip and of the static still written beside it.
WEBP_NAME = "edit_diff.webp"
PNG_NAME = "edit_diff.png"

# One upload call per this many files: keeps an individual batch small and
# rate-limit friendly while still amortising the request overhead.
UPLOAD_CHUNK = 50
def _resolve_url(path: str) -> str:
    """Anonymous bucket resolve URL for a bucket-relative object path.

    The path is percent-encoded (``/`` is kept as the separator) so that object
    names containing spaces, ``#``, ``?`` or non-ASCII characters still yield a
    valid URL. Paths made only of unreserved characters are returned exactly as
    before.

    Args:
        path: Object path relative to the bucket root, e.g.
            ``renders/<id>/<fixture>/edit_diff.webp``.

    Returns:
        The ``{HF_ENDPOINT}/buckets/{RENDER_BUCKET}/resolve/...`` URL.
    """
    # Local import keeps this block self-contained; ``urllib.parse`` is stdlib.
    from urllib.parse import quote

    return f"{HF_ENDPOINT}/buckets/{RENDER_BUCKET}/resolve/{quote(path)}"
def _download(path: str, token: str | None) -> bytes:
    """Fetch one bucket object and return its raw bytes.

    Args:
        path: Bucket-relative object path; turned into a URL by ``_resolve_url``.
        token: Optional bearer token. When truthy it is sent in an
            ``Authorization`` header; otherwise the request is anonymous.

    Returns:
        The full response body.
    """
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    request = urllib.request.Request(_resolve_url(path), headers=headers)
    # 60 s timeout so one stalled transfer cannot hang the whole run.
    with urllib.request.urlopen(request, timeout=60) as response:
        return response.read()
def _list_entries(api: HfApi, prefix: str, token: str | None) -> list[str]:
    """Bucket-relative file paths under *prefix* (folders filtered out).

    Args:
        api: Hub client used for the recursive ``list_bucket_tree`` call.
        prefix: Bucket-relative prefix to list beneath.
        token: Optional token forwarded to the listing call.

    Returns:
        The ``path`` of every listed entry that has a non-empty path, is not
        flagged ``is_folder`` and does not end in ``/``, in listing order.
    """
    listing = api.list_bucket_tree(
        RENDER_BUCKET, prefix=prefix, recursive=True, token=token,
    )
    files: list[str] = []
    for entry in listing:
        entry_path = getattr(entry, "path", None)
        if not entry_path:
            continue  # entry carries no usable path
        if getattr(entry, "is_folder", False):
            continue  # explicit folder entry
        if entry_path.endswith("/"):
            continue  # folder-style placeholder
        files.append(entry_path)
    return files
def _missing_stills(paths: list[str]) -> list[str]:
    """WebP paths whose sibling ``edit_diff.png`` is absent from the bucket.

    Args:
        paths: Every bucket-relative file path found by the listing.

    Returns:
        Sorted paths ending in ``/edit_diff.webp`` for which the same directory
        holds no ``edit_diff.png`` entry in *paths*.
    """
    known = frozenset(paths)
    clip_suffix = "/" + WEBP_NAME
    # Negative slice bound that strips the clip file name but keeps the
    # trailing "/" of its directory.
    dir_end = -len(WEBP_NAME)
    return sorted(
        candidate
        for candidate in paths
        if candidate.endswith(clip_suffix)
        and candidate[:dir_end] + PNG_NAME not in known
    )
def _upload_batch(api: HfApi, batch: list[tuple[str, str]], token: str | None) -> int:
    """Upload one batch of ``(local_file, bucket_path)`` pairs; return its size."""
    api.batch_bucket_files(RENDER_BUCKET, add=batch, token=token)
    return len(batch)


def main() -> int:
    """Command-line entry point: backfill missing ``edit_diff.png`` stills.

    Lists the render bucket (optionally restricted to one submission), works
    out which ``edit_diff.webp`` clips lack a sibling ``edit_diff.png`` and,
    unless ``--dry-run`` is given, downloads each clip, converts it with
    ``first_frame_png`` and uploads the PNG beside it in batches of
    ``UPLOAD_CHUNK`` files. A clip that fails to download or convert is logged
    and skipped so that one bad object does not abort the run.

    Returns:
        ``0`` on completion, including when there was nothing to do or when
        individual clips were skipped.

    Raises:
        SystemExit: Via ``parser.error`` when ``--limit`` is negative, or when
            an upload is needed but ``HF_TOKEN`` is not set.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--submission",
        help="Limit to one submission id (the prefix is renders/<id>). "
        "Omit to scan every submission in the bucket.",
    )
    parser.add_argument("--limit", type=int, default=None,
                        help="Process at most N stills (after listing).")
    parser.add_argument(
        "--dry-run", action="store_true",
        help="List what would be created; download/upload nothing.",
    )
    args = parser.parse_args()
    # A negative value would otherwise slice from the end of the work list
    # (e.g. -1 silently drops only the last item) instead of capping it.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be >= 0.")

    token = os.environ.get("HF_TOKEN")
    api = HfApi(token=token)
    prefix = (
        f"{RENDER_PREFIX}/{args.submission}" if args.submission else RENDER_PREFIX
    )
    print(f"Scanning bucket {RENDER_BUCKET} under {prefix}/ …", flush=True)
    paths = _list_entries(api, prefix, token)
    todo = _missing_stills(paths)
    # Capture the real gap before --limit truncates the work list, so the
    # summary below reports what is missing rather than what this run handles.
    n_missing = len(todo)
    if args.limit is not None:
        todo = todo[: args.limit]
    n_webp = sum(1 for p in paths if p.endswith("/" + WEBP_NAME))
    print(
        f"Found {n_webp} edit_diff.webp; {n_missing} missing a still.",
        flush=True,
    )
    if len(todo) < n_missing:
        print(f"Limiting this run to {len(todo)} still(s) (--limit).", flush=True)
    if not todo:
        print("Nothing to backfill.")
        return 0
    if args.dry_run:
        for p in todo:
            print(" would create", p[: -len(WEBP_NAME)] + PNG_NAME)
        return 0
    # Listing and --dry-run run without a token; only the upload needs one.
    if not token:
        parser.error("HF_TOKEN required to upload (or pass --dry-run).")

    created = 0
    skipped = 0
    with tempfile.TemporaryDirectory(prefix="edit-diff-still-") as tmp:
        tmp_dir = Path(tmp)
        add: list[tuple[str, str]] = []
        for i, webp_path in enumerate(todo, start=1):
            dest = webp_path[: -len(WEBP_NAME)] + PNG_NAME
            try:
                png = first_frame_png(_download(webp_path, token))
            except Exception as e:  # noqa: BLE001 - skip a bad clip, keep going
                skipped += 1
                print(
                    f" [{i}/{len(todo)}] SKIP {webp_path} ({type(e).__name__}: {e})",
                    flush=True,
                )
                continue
            # Index-based name: unique within this run's temporary directory.
            local = tmp_dir / f"{i}.png"
            local.write_bytes(png)
            add.append((str(local), dest))
            print(f" [{i}/{len(todo)}] {dest} ({len(png) // 1024} KB)", flush=True)
            if len(add) >= UPLOAD_CHUNK:
                created += _upload_batch(api, add, token)
                add = []
        # Flush the final, partially filled batch.
        if add:
            created += _upload_batch(api, add, token)
    if skipped:
        print(
            f"Skipped {skipped} clip(s) that could not be downloaded or converted.",
            flush=True,
        )
    print(f"Done. Uploaded {created} edit_diff.png still(s).", flush=True)
    return 0
# Script entry point: run the backfill and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|