Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

CADGenBench / tools /backfill_edit_diff_still.py

Michael Rabinovich

leaderboard: backfill tool for edit_diff.png stills in render bucket

c7f83a5 5 days ago

6.31 kB

	#!/usr/bin/env python3
	"""Backfill the static edit-diff still (``edit_diff.png``) into the render bucket.

	Editing samples ship an animated ``edit_diff.webp`` turntable but no static
	frame. The grid thumbnail needs a still (an animated WebP can't be frozen to one
	angle in HTML), so the eval pipeline now also writes ``edit_diff.png`` (frame 0)
	beside the clip — but submissions evaluated before that change only have the
	WebP in the bucket.

	This one-time tool closes that gap without re-evaluating or re-rendering: it
	lists the public render bucket, finds every ``.../<fixture>/edit_diff.webp`` that
	has no sibling ``edit_diff.png``, downloads the WebP, extracts frame 0 via the
	shared :func:`cadgenbench.common.viewer.first_frame_png` (the exact frame the
	forward pipeline now saves), and uploads ``edit_diff.png`` next to it in the same
	bucket prefix — so it serves by the same anonymous render URL as every other
	render.

	Run (needs a write-scoped ``HF_TOKEN`` for the bucket)::

	    HF_TOKEN=<write-token> python tools/backfill_edit_diff_still.py  # all submissions
	    HF_TOKEN=<write-token> python tools/backfill_edit_diff_still.py --submission <id>
	    python tools/backfill_edit_diff_still.py --dry-run  # list only, no token needed
	"""
	from __future__ import annotations

	import argparse
	import os
	import sys
	import tempfile
	import urllib.request
	from pathlib import Path

	from huggingface_hub import HfApi

	# cadgenbench (for the shared frame extractor) must be importable; allow running
	# straight from the repo without installing the leaderboard package.
	# ``parents[2]`` is the grandparent of the directory that holds this script.
	# NOTE(review): this assumes that grandparent also contains ``cadgenbench/src``
	# — confirm against the checkout layout. When the directory does not exist,
	# sys.path is left untouched and the import below relies on an installed
	# ``cadgenbench``.
	_REPO_ROOT = Path(__file__).resolve().parents[2]
	_SRC = _REPO_ROOT / "cadgenbench" / "src"
	if _SRC.is_dir():
	    # Prepend so the in-repo sources are found before any installed copy.
	    sys.path.insert(0, str(_SRC))

	# Must stay below the sys.path tweak above, hence the E402 suppression.
	# NOTE(review): the module docstring cites
	# ``cadgenbench.common.viewer.first_frame_png`` while this line imports from
	# ``cadgenbench.common.imaging`` — confirm which module exports the helper.
	from cadgenbench.common.imaging import first_frame_png # noqa: E402

	# Hub base URL, overridable through ``HF_ENDPOINT``; any trailing slash is
	# dropped so it can be joined with "/..." paths directly.
	HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co").rstrip("/")
	# Organisation that owns the bucket, overridable through ``HF_ORG``.
	HF_ORG = os.environ.get("HF_ORG", "HuggingAI4Engineering")
	# Same defaults as leaderboard.py, so this tool writes to the very bucket the
	# eval job uploads to and the report/gallery read from.
	RENDER_BUCKET = os.environ.get(
	    "HF_RENDER_BUCKET", HF_ORG + "/cadgenbench-eval-staging"
	)
	# Top-level folder of the bucket that holds every submission's renders.
	RENDER_PREFIX = "renders"
	# File names of the animated clip and of the static still derived from it.
	WEBP_NAME = "edit_diff.webp"
	PNG_NAME = "edit_diff.png"
	# Number of files sent per upload call: small enough to stay rate-limit
	# friendly, large enough to amortise the per-request overhead.
	UPLOAD_CHUNK = 50


	def _resolve_url(path: str) -> str:
	    """Anonymous bucket resolve URL for a bucket-relative object path.

	    The path is percent-encoded segment by segment (``/`` separators are kept),
	    so object names containing spaces, ``#``, ``?`` or non-ASCII characters
	    still yield a valid URL instead of a truncated or rejected one.
	    """
	    # Local import: the file only imports ``urllib.request`` at module level.
	    from urllib.parse import quote

	    encoded = quote(path, safe="/")
	    return f"{HF_ENDPOINT}/buckets/{RENDER_BUCKET}/resolve/{encoded}"


	def _download(path: str, token: str | None) -> bytes:
	    """Fetch one bucket object over HTTP and return its complete body.

	    ``path`` is bucket-relative and turned into a URL by :func:`_resolve_url`.
	    When ``token`` is set it is sent as a ``Bearer`` ``Authorization`` header;
	    otherwise the request is anonymous. The request uses a 60-second timeout,
	    and any error raised by ``urllib.request.urlopen`` propagates to the caller.
	    """
	    headers = {"Authorization": f"Bearer {token}"} if token else {}
	    request = urllib.request.Request(_resolve_url(path), headers=headers)
	    with urllib.request.urlopen(request, timeout=60) as response:
	        return response.read()


	def _list_entries(api: HfApi, prefix: str, token: str | None) -> list[str]:
	    """Bucket-relative file paths under ``prefix`` (folders filtered out).

	    Lists ``RENDER_BUCKET`` recursively through ``api.list_bucket_tree`` and
	    keeps only entries that look like files: a non-empty ``path``, no truthy
	    ``is_folder`` attribute, and no trailing ``/``. Attributes are read with
	    ``getattr`` defaults so entries lacking them are tolerated.
	    """
	    return [
	        e.path
	        for e in api.list_bucket_tree(
	            RENDER_BUCKET, prefix=prefix, recursive=True, token=token,
	        )
	        if getattr(e, "path", None)
	        and not getattr(e, "is_folder", False)
	        and not e.path.endswith("/")
	    ]


	def _missing_stills(paths: list[str]) -> list[str]:
	    """Sorted ``edit_diff.webp`` paths that have no ``edit_diff.png`` next to them.

	    Only paths ending in ``/edit_diff.webp`` are considered; a clip counts as
	    covered when the same directory's ``edit_diff.png`` is itself in ``paths``.
	    """
	    known = frozenset(paths)
	    clip_suffix = "/" + WEBP_NAME
	    name_len = len(WEBP_NAME)
	    return sorted(
	        clip
	        for clip in paths
	        if clip.endswith(clip_suffix)
	        and clip[:-name_len] + PNG_NAME not in known
	    )


	def _png_path_for(webp_path: str) -> str:
	    """Bucket path of the ``edit_diff.png`` sibling of an ``edit_diff.webp`` path."""
	    return webp_path[: -len(WEBP_NAME)] + PNG_NAME


	def main() -> int:
	    """Command-line entry point: scan the bucket and backfill missing stills.

	    Lists the render bucket (optionally restricted to one submission), works out
	    which ``edit_diff.webp`` clips lack an ``edit_diff.png``, and, unless
	    ``--dry-run`` is given, downloads each clip, extracts its first frame and
	    uploads the still in batches of ``UPLOAD_CHUNK``. A clip that fails to
	    download or decode is reported and skipped; the run continues.

	    Returns the process exit status, which is always ``0``. Invalid usage (a
	    negative ``--limit``, or a missing ``HF_TOKEN`` when an upload is needed)
	    exits through ``parser.error``.
	    """
	    parser = argparse.ArgumentParser(description=__doc__)
	    parser.add_argument(
	        "--submission",
	        help="Limit to one submission id (the prefix is renders/<id>). "
	        "Omit to scan every submission in the bucket.",
	    )
	    parser.add_argument("--limit", type=int, default=None,
	                        help="Process at most N stills (after listing).")
	    parser.add_argument(
	        "--dry-run", action="store_true",
	        help="List what would be created; download/upload nothing.",
	    )
	    args = parser.parse_args()
	    # A negative limit would slice from the END of the list (dropping the last
	    # N stills rather than capping the run), so reject it before any work.
	    if args.limit is not None and args.limit < 0:
	        parser.error("--limit must be >= 0.")

	    token = os.environ.get("HF_TOKEN")
	    api = HfApi(token=token)
	    prefix = (
	        f"{RENDER_PREFIX}/{args.submission}" if args.submission else RENDER_PREFIX
	    )

	    print(f"Scanning bucket {RENDER_BUCKET} under {prefix}/ …", flush=True)
	    paths = _list_entries(api, prefix, token)
	    missing = _missing_stills(paths)
	    todo = missing if args.limit is None else missing[: args.limit]

	    n_webp = sum(1 for p in paths if p.endswith("/" + WEBP_NAME))
	    # Report the real size of the gap, not the --limit-truncated figure.
	    print(
	        f"Found {n_webp} edit_diff.webp; {len(missing)} missing a still.",
	        flush=True,
	    )
	    if len(todo) < len(missing):
	        print(f"Limiting this run to the first {len(todo)} (--limit).", flush=True)
	    if not todo:
	        print("Nothing to backfill.")
	        return 0
	    if args.dry_run:
	        for p in todo:
	            print(" would create", _png_path_for(p))
	        return 0
	    # Listing and dry-run work without a token; only the upload path needs one.
	    if not token:
	        parser.error("HF_TOKEN required to upload (or pass --dry-run).")

	    created = 0
	    skipped = 0
	    with tempfile.TemporaryDirectory(prefix="edit-diff-still-") as tmp:
	        tmp_dir = Path(tmp)
	        # Pending (local file, bucket path) pairs for the next upload call.
	        add: list[tuple[str, str]] = []
	        for i, webp_path in enumerate(todo, start=1):
	            dest = _png_path_for(webp_path)
	            try:
	                png = first_frame_png(_download(webp_path, token))
	            except Exception as e:  # noqa: BLE001 - skip a bad clip, keep going
	                print(f" [{i}/{len(todo)}] SKIP {webp_path} ({type(e).__name__}: {e})")
	                skipped += 1
	                continue
	            # The index keeps local names unique; the bucket path is ``dest``.
	            local = tmp_dir / f"{i}.png"
	            local.write_bytes(png)
	            add.append((str(local), dest))
	            print(f" [{i}/{len(todo)}] {dest} ({len(png) // 1024} KB)", flush=True)
	            if len(add) >= UPLOAD_CHUNK:
	                api.batch_bucket_files(RENDER_BUCKET, add=add, token=token)
	                created += len(add)
	                add = []
	        # Flush the final, partially filled batch.
	        if add:
	            api.batch_bucket_files(RENDER_BUCKET, add=add, token=token)
	            created += len(add)

	    print(f"Done. Uploaded {created} edit_diff.png still(s).", flush=True)
	    if skipped:
	        # Skipped clips still lack a still, so a later run picks them up again.
	        print(f"Skipped {skipped} clip(s); re-run to retry them.", flush=True)
	    return 0


	if __name__ == "__main__":
	raise SystemExit(main())