File size: 6,307 Bytes
c7f83a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env python3
"""Backfill the static edit-diff still (``edit_diff.png``) into the render bucket.

Editing samples ship an *animated* ``edit_diff.webp`` turntable but no static
frame. The grid thumbnail needs a still (an animated WebP can't be frozen to one
angle in HTML), so the eval pipeline now also writes ``edit_diff.png`` (frame 0)
beside the clip — but submissions evaluated *before* that change only have the
WebP in the bucket.

This one-time tool closes that gap **without re-evaluating or re-rendering**: it
lists the public render bucket, finds every ``.../<fixture>/edit_diff.webp`` that
has no sibling ``edit_diff.png``, downloads the WebP, extracts frame 0 via the
shared :func:`cadgenbench.common.imaging.first_frame_png` (the exact frame the
forward pipeline now saves), and uploads ``edit_diff.png`` next to it in the same
bucket prefix — so it serves by the same anonymous render URL as every other
render.

Run (needs a write-scoped ``HF_TOKEN`` for the bucket)::

    HF_TOKEN=<write-token> python tools/backfill_edit_diff_still.py            # all submissions
    HF_TOKEN=<write-token> python tools/backfill_edit_diff_still.py --submission <id>
    python tools/backfill_edit_diff_still.py --dry-run                          # list only, no token needed
"""
from __future__ import annotations

import argparse
import os
import sys
import tempfile
import urllib.request
from pathlib import Path

from huggingface_hub import HfApi

# cadgenbench (for the shared frame extractor) must be importable; allow running
# straight from the repo without installing the leaderboard package.
# ``parents[2]`` is the directory two levels above the one containing this file.
_REPO_ROOT = Path(__file__).resolve().parents[2]
_SRC = _REPO_ROOT / "cadgenbench" / "src"
# Only extend sys.path when the checkout layout is actually present; otherwise
# the import below has to be satisfied by an already-importable cadgenbench.
# Inserting at index 0 makes this directory the first place Python looks.
if _SRC.is_dir():
    sys.path.insert(0, str(_SRC))

# Deliberately placed after the sys.path tweak above (hence the E402 waiver).
from cadgenbench.common.imaging import first_frame_png  # noqa: E402

# Hub base URL, overridable via the HF_ENDPOINT env var; the trailing slash is
# stripped so URLs built from it do not contain a doubled "/".
HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co").rstrip("/")
# Owning organisation, overridable via HF_ORG; only used for the bucket default.
HF_ORG = os.getenv("HF_ORG", "HuggingAI4Engineering")
# Mirror leaderboard.py's defaults so the tool targets the same bucket the eval
# job uploads to and the report/gallery read from.
# HF_RENDER_BUCKET, when set, replaces the whole "<org>/<bucket>" id.
RENDER_BUCKET = os.getenv("HF_RENDER_BUCKET", f"{HF_ORG}/cadgenbench-eval-staging")
# Top-level folder inside the bucket that is scanned (``renders/<submission>``).
RENDER_PREFIX = "renders"
# File name of the animated clip that is looked for ...
WEBP_NAME = "edit_diff.webp"
# ... and of the static still written next to it.
PNG_NAME = "edit_diff.png"
# One upload call per this many files: keeps an individual batch small and
# rate-limit friendly while still amortising the request overhead.
UPLOAD_CHUNK = 50


def _resolve_url(path: str) -> str:
    """Anonymous bucket resolve URL for a bucket-relative object path.

    The path is percent-encoded before being placed in the URL, with ``/`` kept
    as the segment separator. Object names made only of letters, digits and
    ``_ . - ~`` are unchanged; names containing spaces, ``#``, ``?``, ``%`` or
    non-ASCII characters now yield a well-formed URL instead of one that is
    rejected by ``urllib`` or that addresses a different resource.

    Args:
        path: Object path relative to the bucket root, unencoded.

    Returns:
        The absolute URL built from ``HF_ENDPOINT`` and ``RENDER_BUCKET``.
    """
    # Function-scope import so the block does not depend on a new file-level import.
    from urllib.parse import quote

    encoded = quote(path, safe="/")
    return f"{HF_ENDPOINT}/buckets/{RENDER_BUCKET}/resolve/{encoded}"


def _download(path: str, token: str | None) -> bytes:
    """Return the full body of the bucket object at bucket-relative *path*.

    A ``Bearer`` Authorization header is sent only when *token* is truthy; the
    request uses a 60 second timeout. Errors raised by ``urllib`` propagate to
    the caller.
    """
    auth_headers = {"Authorization": f"Bearer {token}"} if token else {}
    request = urllib.request.Request(_resolve_url(path), headers=auth_headers)
    with urllib.request.urlopen(request, timeout=60) as response:
        payload = response.read()
    return payload


def _list_entries(api: HfApi, prefix: str, token: str | None) -> list[str]:
    """Collect the file paths that the recursive bucket listing yields for *prefix*.

    Entries are dropped when they have no (or an empty) ``path``, when they
    carry a truthy ``is_folder`` attribute, or when their path ends in ``/``.
    The remaining paths are returned in listing order.
    """
    listing = api.list_bucket_tree(
        RENDER_BUCKET, prefix=prefix, recursive=True, token=token,
    )
    files: list[str] = []
    for entry in listing:
        entry_path = getattr(entry, "path", None)
        if not entry_path:
            continue  # nothing usable to record
        if getattr(entry, "is_folder", False):
            continue  # explicit folder marker
        if entry_path.endswith("/"):
            continue  # folder-style path without the marker
        files.append(entry_path)
    return files


def _missing_stills(paths: list[str]) -> list[str]:
    """Return, sorted, every ``.../edit_diff.webp`` path lacking a sibling still.

    A clip counts as covered when the same folder's ``edit_diff.png`` path is
    itself present in *paths*. Only paths ending in ``/edit_diff.webp`` are
    considered, so a bare top-level ``edit_diff.webp`` is ignored.
    """
    known = frozenset(paths)
    clip_suffix = "/" + WEBP_NAME
    name_len = len(WEBP_NAME)
    return sorted(
        clip
        for clip in paths
        if clip.endswith(clip_suffix)
        and (clip[:-name_len] + PNG_NAME) not in known
    )


def _still_dest(webp_path: str) -> str:
    """Bucket path of the ``edit_diff.png`` that sits beside *webp_path*."""
    return webp_path[: -len(WEBP_NAME)] + PNG_NAME


def _upload_batch(
    api: HfApi, batch: list[tuple[str, str]], token: str | None,
) -> int:
    """Upload one batch of ``(local_file, bucket_path)`` pairs and free the disk.

    After the upload call returns, the local files of the batch are deleted so
    the temporary directory never holds more than one batch of stills. If the
    upload raises, the files are left in place and the exception propagates.

    Returns:
        The number of pairs uploaded (0 for an empty batch, which makes no call).
    """
    if not batch:
        return 0
    api.batch_bucket_files(RENDER_BUCKET, add=batch, token=token)
    for local, _dest in batch:
        Path(local).unlink(missing_ok=True)
    return len(batch)


def main() -> int:
    """Command-line entry point: list the bucket and backfill missing stills.

    Flow: parse arguments, list every file under ``renders`` (or
    ``renders/<submission>``), work out which ``edit_diff.webp`` clips have no
    sibling ``edit_diff.png``, then either print what would be created
    (``--dry-run``) or download each clip, convert it with ``first_frame_png``
    and upload the result in batches of ``UPLOAD_CHUNK``.

    A clip whose download or conversion raises is reported and skipped; it does
    not stop the run and does not change the exit status.

    Returns:
        ``0`` on completion, including "nothing to do" and dry-run.

    Raises:
        SystemExit: via ``parser.error`` when ``--limit`` is negative, or when
            an upload is required but ``HF_TOKEN`` is not set.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--submission",
        help="Limit to one submission id (the prefix is renders/<id>). "
             "Omit to scan every submission in the bucket.",
    )
    parser.add_argument("--limit", type=int, default=None,
                        help="Process at most N stills (after listing).")
    parser.add_argument(
        "--dry-run", action="store_true",
        help="List what would be created; download/upload nothing.",
    )
    args = parser.parse_args()
    if args.limit is not None and args.limit < 0:
        # A negative slice bound would silently drop items from the END of the
        # work list instead of capping it, so reject it up front.
        parser.error("--limit must be >= 0.")

    token = os.environ.get("HF_TOKEN")
    api = HfApi(token=token)
    prefix = (
        f"{RENDER_PREFIX}/{args.submission}" if args.submission else RENDER_PREFIX
    )

    print(f"Scanning bucket {RENDER_BUCKET} under {prefix}/ …", flush=True)
    paths = _list_entries(api, prefix, token)
    todo = _missing_stills(paths)
    if args.limit is not None:
        todo = todo[: args.limit]

    n_webp = sum(1 for p in paths if p.endswith("/" + WEBP_NAME))
    print(
        f"Found {n_webp} edit_diff.webp; {len(todo)} missing a still.",
        flush=True,
    )
    if not todo:
        print("Nothing to backfill.")
        return 0
    if args.dry_run:
        for p in todo:
            print("  would create", _still_dest(p))
        return 0
    # Checked only now so that listing and --dry-run work without a token.
    if not token:
        parser.error("HF_TOKEN required to upload (or pass --dry-run).")

    created = 0
    skipped = 0
    total = len(todo)
    with tempfile.TemporaryDirectory(prefix="edit-diff-still-") as tmp:
        tmp_dir = Path(tmp)
        batch: list[tuple[str, str]] = []
        for i, webp_path in enumerate(todo, start=1):
            dest = _still_dest(webp_path)
            try:
                png = first_frame_png(_download(webp_path, token))
            except Exception as e:  # noqa: BLE001 - skip a bad clip, keep going
                skipped += 1
                print(
                    f"  [{i}/{total}] SKIP {webp_path} ({type(e).__name__}: {e})",
                    flush=True,
                )
                continue
            # The running index keeps local names unique within the temp dir.
            local = tmp_dir / f"{i}.png"
            local.write_bytes(png)
            batch.append((str(local), dest))
            print(f"  [{i}/{total}] {dest} ({len(png) // 1024} KB)", flush=True)
            if len(batch) >= UPLOAD_CHUNK:
                created += _upload_batch(api, batch, token)
                batch = []
        # Final partial batch (a no-op when the last chunk was exactly full).
        created += _upload_batch(api, batch, token)

    print(f"Done. Uploaded {created} edit_diff.png still(s).", flush=True)
    if skipped:
        print(
            f"Skipped {skipped} clip(s) that could not be downloaded or converted.",
            flush=True,
        )
    return 0


if __name__ == "__main__":
    # Hand main()'s integer status to the interpreter as the process exit code.
    sys.exit(main())