Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

File size: 15,555 Bytes

49e27be

#!/usr/bin/env python3
"""Generate the ground-truth "answer key" edit-diff turntables (editing fixtures).

For each *editing* fixture (one that ships an ``input.step`` seed) this renders
the reference companion to the per-submission edit diff: the GT drawn as a
translucent ghost with the **correct change painted blue** (added material on the
GT body, removed material as a blue phantom of the input). See
:func:`cadgenbench.common.edit_diff.build_gt_edit_diff_shapes`.

Like :mod:`generate_gt_turntables`, the result is a property of the **data
revision** (GT vs input), not of any submission, so this runs once per data
revision and both the gallery's ground-truth row and every per-submission report
reference the same webp via the GT proxy. One clip is written per fixture:

- ``<fixture>/renders/edit_diff_gt.webp`` -- full turntable.

The GT mesh comes from the trusted sidecar (no tessellation); the input mesh is
tessellated once at the GT's deflection so the GT-vs-input edit region is found
at one consistent scale (mirrors the eval's ``_editing_input_mesh``).

Run locally (against checkouts), render only::

    python tools/generate_gt_edit_diff.py \
        --gt-root ../cadgenbench-data-gt --inputs-root ../cadgenbench-data \
        --out-dir ../out/gt_edit_diff --no-upload

Drop ``--no-upload`` (and set an ``HF_TOKEN`` with **write** scope on the private
GT dataset) to commit the webps -- uploading is the default -- or run it on an HF
GPU job exactly like ``generate_gt_turntables.py``.
"""
from __future__ import annotations

import argparse
import os
import subprocess
import sys
import tempfile
from pathlib import Path

from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download

# Allow running straight from the repo without installing the leaderboard pkg;
# cadgenbench itself must be importable (installed in the env / eval-gpu image).
# ``parents[2]`` is the directory two levels above the one containing this file.
_REPO_ROOT = Path(__file__).resolve().parents[2]
# Candidate source tree for the ``cadgenbench`` package under that directory.
_SRC = _REPO_ROOT / "cadgenbench" / "src"
if _SRC.is_dir():
    # Prepend (index 0) so this tree is searched before the rest of sys.path.
    # When the directory is absent nothing is added and the imports below rely
    # on whatever is already importable.
    sys.path.insert(0, str(_SRC))

from cadgenbench.common.artifacts import StepArtifacts  # noqa: E402
from cadgenbench.common.edit_diff import render_gt_edit_diff_turntable  # noqa: E402

# File names looked up inside each ``<fixture>/`` directory (checkout or Hub repo).
# Ground-truth STEP file; its presence marks a fixture on the GT side.
GT_STEP_NAME = "ground_truth.step"
# Mesh sidecar fetched alongside the GT STEP so both sit in the same directory.
GT_SIDECAR_NAME = "ground_truth.mesh.npz"
# Seed STEP file; its presence marks a fixture as an *editing* fixture.
INPUT_STEP_NAME = "input.step"
# Path of the rendered turntable relative to the fixture directory; used as the
# ``<fixture>/...`` destination path when committing to the GT dataset.
FULL_NAME = "renders/edit_diff_gt.webp"
# One commit per this many files: keeps an individual commit small and
# rate-limit friendly.
COMMIT_CHUNK = 60


def _default_repo_id() -> str:
    """Return the default ground-truth dataset repo id.

    ``HF_DATA_GT_REPO`` is used verbatim when it is set (even to an empty
    string). Otherwise the id is ``<HF_ORG>/cadgenbench-data-gt``, where
    ``HF_ORG`` itself falls back to ``HuggingAI4Engineering``.
    """
    org = os.environ.get("HF_ORG", "HuggingAI4Engineering")
    fallback = f"{org}/cadgenbench-data-gt"
    return os.environ.get("HF_DATA_GT_REPO", fallback)


def _default_inputs_repo_id() -> str:
    """Return the default inputs dataset repo id (the one holding ``input.step``).

    ``HF_DATA_REPO`` is used verbatim when it is set (even to an empty string).
    Otherwise the id is ``<HF_ORG>/cadgenbench-data``, where ``HF_ORG`` itself
    falls back to ``HuggingAI4Engineering``.
    """
    org = os.environ.get("HF_ORG", "HuggingAI4Engineering")
    fallback = f"{org}/cadgenbench-data"
    return os.environ.get("HF_DATA_REPO", fallback)


def _fixture_ids_having(
    api: HfApi, repo_id: str, root: Path | None, filename: str,
) -> set[str]:
    """Ids of the top-level fixture directories that directly contain *filename*.

    Reads the local checkout *root* when given, otherwise lists the Hub dataset
    *repo_id*. Both sources apply the same rule: the file must sit directly in
    the fixture directory, i.e. at ``<fixture>/<filename>``.
    """
    if root is not None:
        return {
            entry.name for entry in root.iterdir()
            if entry.is_dir() and (entry / filename).is_file()
        }
    ids: set[str] = set()
    for repo_path in api.list_repo_files(repo_id, repo_type="dataset"):
        parts = repo_path.split("/")
        # Exactly two segments: a copy nested deeper (``a/b/<filename>``) must
        # not promote ``a`` to a fixture, matching the local-checkout rule.
        if len(parts) == 2 and parts[0] and parts[1] == filename:
            ids.add(parts[0])
    return ids


def _editing_fixture_ids(
    api: HfApi,
    gt_repo: str,
    inputs_repo: str,
    gt_root: Path | None,
    inputs_root: Path | None,
) -> list[str]:
    """Fixture ids with BOTH a ``ground_truth.step`` and an ``input.step``.

    The ``input.step`` is what defines an editing fixture, so the intersection
    of the two repos (or two checkouts) is exactly the editing set.

    Args:
        api: Hub client; only used for a side whose local root is ``None``.
        gt_repo: Ground-truth dataset repo id (used when *gt_root* is ``None``).
        inputs_repo: Inputs dataset repo id (used when *inputs_root* is ``None``).
        gt_root: Local ground-truth checkout, or ``None`` to list the Hub repo.
        inputs_root: Local inputs checkout, or ``None`` to list the Hub repo.

    Returns:
        The ids sorted by ``(len(id), id)``, so shorter ids come first and
        purely numeric ids of differing widths end up in numeric order.
    """
    gt_ids = _fixture_ids_having(api, gt_repo, gt_root, GT_STEP_NAME)
    in_ids = _fixture_ids_having(api, inputs_repo, inputs_root, INPUT_STEP_NAME)
    return sorted(gt_ids & in_ids, key=lambda s: (len(s), s))


def _materialize_gt(
    api: HfApi, repo_id: str, fixture: str, gt_root: Path | None,
    cache_dir: Path, token: str | None,
) -> Path:
    """Return a local directory containing the fixture's GT STEP and mesh sidecar.

    With a local checkout (*gt_root*) the fixture directory is returned as-is
    and nothing is downloaded. Otherwise both files are fetched from the Hub
    dataset *repo_id* and copied side by side into ``<cache_dir>/gt/<fixture>``:
    the sidecar has to live next to the STEP so ``StepArtifacts`` takes the
    trusted-mesh path (no tessellation, no validation).

    *api* is accepted for signature symmetry but is not used here.
    """
    if gt_root is not None:
        return gt_root / fixture

    fixture_dir = cache_dir / "gt" / fixture
    fixture_dir.mkdir(parents=True, exist_ok=True)
    for filename in (GT_STEP_NAME, GT_SIDECAR_NAME):
        downloaded = Path(
            hf_hub_download(
                repo_id=repo_id,
                filename=f"{fixture}/{filename}",
                repo_type="dataset",
                token=token,
            )
        )
        copy_path = fixture_dir / filename
        if copy_path.exists():
            # Already materialised earlier; keep the existing copy.
            continue
        copy_path.write_bytes(downloaded.read_bytes())
    return fixture_dir


def _materialize_input(
    api: HfApi, repo_id: str, fixture: str, inputs_root: Path | None,
    cache_dir: Path, token: str | None,
) -> Path:
    """Return the local path of the fixture's ``input.step``.

    With a local checkout (*inputs_root*) the path inside it is returned
    without checking that it exists. Otherwise the file is downloaded from the
    Hub dataset *repo_id* and the path reported by ``hf_hub_download`` is
    returned. *api* and *cache_dir* are accepted but not used here.
    """
    if inputs_root is None:
        downloaded = hf_hub_download(
            repo_id=repo_id,
            filename=f"{fixture}/{INPUT_STEP_NAME}",
            repo_type="dataset",
            token=token,
        )
        return Path(downloaded)
    return inputs_root / fixture / INPUT_STEP_NAME


def _render_fixture(gt_dir: Path, input_step: Path) -> bytes:
    """Render the full answer-key turntable WebP for one editing fixture.

    Args:
        gt_dir: Directory holding the fixture's ``ground_truth.step``.
        input_step: Path to the fixture's ``input.step``.

    Returns:
        Whatever ``render_gt_edit_diff_turntable`` produces for the two meshes
        (annotated here as the encoded turntable bytes).
    """
    gt_artifacts = StepArtifacts(gt_dir / GT_STEP_NAME, is_ground_truth=True)
    gt_mesh = gt_artifacts.mesh()
    # Mesh the input at the GT mesh's linear deflection so both sides of the
    # comparison are at one consistent scale.
    deflection = gt_mesh.linear_deflection_mm
    input_artifacts = StepArtifacts(input_step, deflection_override=deflection)
    input_mesh = input_artifacts.mesh()
    return render_gt_edit_diff_turntable(gt_mesh, input_mesh)


def _commit_in_chunks(api: HfApi, repo_id: str, ops: list[CommitOperationAdd]) -> None:
    """Commit *ops* to the dataset *repo_id*, ``COMMIT_CHUNK`` operations per commit.

    Each commit message carries the 1-based inclusive range of the operations
    it contains. An empty *ops* list creates no commit.
    """
    total = len(ops)
    for start in range(0, total, COMMIT_CHUNK):
        batch = ops[start:start + COMMIT_CHUNK]
        first = start + 1
        last = start + len(batch)
        api.create_commit(
            repo_id=repo_id,
            repo_type="dataset",
            operations=batch,
            commit_message=f"add GT edit-diff answer-key webp(s) [{first}-{last}]",
        )
        print(f"  committed {len(batch)} file(s)", flush=True)


def _resolved_fixtures(
    parser: argparse.ArgumentParser, args: argparse.Namespace,
    api: HfApi, gt_root: Path | None, inputs_root: Path | None,
) -> list[str]:
    """Resolve the editing fixtures selected by ``--fixtures`` / ``--limit``.

    Args:
        parser: Used to report argument errors (``parser.error`` exits with 2).
        args: Parsed CLI namespace; reads ``repo_id``, ``inputs_repo_id``,
            ``fixtures`` and ``limit``.
        api: Hub client, used to list a repo when the matching root is ``None``.
        gt_root: Local ground-truth checkout, or ``None`` for the Hub.
        inputs_root: Local inputs checkout, or ``None`` for the Hub.

    Returns:
        The non-empty, ordered list of fixture ids to process.

    Raises:
        SystemExit: via ``parser.error`` when ``--limit`` is negative or when
            no editing fixture matches the selection.
    """
    # Validate before any listing: a negative limit would otherwise silently
    # drop fixtures from the END of the list (``fixtures[:-1]``).
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be >= 0.")

    fixtures = _editing_fixture_ids(
        api, args.repo_id, args.inputs_repo_id, gt_root, inputs_root,
    )
    if args.fixtures:
        wanted = {f.strip() for f in args.fixtures.split(",") if f.strip()}
        unknown = wanted.difference(fixtures)
        if unknown:
            # Surface typos and non-editing ids instead of dropping them silently.
            listed = " ".join(sorted(unknown, key=lambda s: (len(s), s)))
            print(f"WARNING: not an editing fixture, ignored: {listed}", flush=True)
        fixtures = [f for f in fixtures if f in wanted]
    if args.limit is not None:
        fixtures = fixtures[: args.limit]
    if not fixtures:
        parser.error("No editing fixtures matched.")
    return fixtures


def _upload_from_out_dir(api: HfApi, repo_id: str, out_dir: Path, fixtures: list[str]) -> None:
    """Commit the already-rendered ``edit_diff_gt.webp`` files under *out_dir*.

    For every id in *fixtures* whose ``<out_dir>/<fixture>/renders/edit_diff_gt.webp``
    exists, the file is queued for ``<fixture>/renders/edit_diff_gt.webp`` in
    the dataset *repo_id*; fixtures without a rendered file are skipped. When
    nothing is found a message is printed and no commit is made.
    """
    candidates = (
        (fixture, out_dir.joinpath(fixture, "renders", "edit_diff_gt.webp"))
        for fixture in fixtures
    )
    ops: list[CommitOperationAdd] = [
        CommitOperationAdd(f"{fixture}/{FULL_NAME}", webp.read_bytes())
        for fixture, webp in candidates
        if webp.exists()
    ]
    if ops:
        print(f"Uploading {len(ops)} file(s) to {repo_id} ...", flush=True)
        _commit_in_chunks(api, repo_id, ops)
        return
    print("Nothing to upload (no rendered files found in --out-dir).", flush=True)


def _run_upload_only(parser: argparse.ArgumentParser, args: argparse.Namespace) -> int:
    """Upload the ``edit_diff_gt.webp`` files already rendered under ``--out-dir``.

    No rendering happens here: the fixture ids are discovered from the
    ``<out-dir>/<fixture>/renders/edit_diff_gt.webp`` files on disk and handed
    to the uploader. Exits through ``parser.error`` when ``--out-dir`` is
    missing, is not a directory, or contains no rendered webp.

    Returns:
        ``0`` once the upload step has run.
    """
    requested = args.out_dir
    if requested is None or not requested.is_dir():
        parser.error("--upload-only requires an existing --out-dir.")

    # With HF_TOKEN unset the client falls back to the stored CLI token.
    api = HfApi(token=os.environ.get("HF_TOKEN"))
    out_dir = requested.resolve()

    fixtures = [
        webp.parent.parent.name
        for webp in out_dir.glob("*/renders/edit_diff_gt.webp")
    ]
    fixtures.sort(key=lambda s: (len(s), s))
    if not fixtures:
        parser.error(f"No edit_diff_gt.webp found under {out_dir}")

    print(f"Uploading {len(fixtures)} fixture webp(s) from {out_dir} -> {args.repo_id}", flush=True)
    print(f"FIXTURES: {' '.join(fixtures)}", flush=True)
    _upload_from_out_dir(api, args.repo_id, out_dir, fixtures)
    print("Done.", flush=True)
    return 0


def _run_isolated(parser: argparse.ArgumentParser, args: argparse.Namespace) -> int:
    """Render each fixture in a fresh subprocess (one fixture == ~240 plotters).

    Spawns this same tool with ``--fixtures <id> --no-upload`` per fixture so the
    GL context is fully released between fixtures, then (optionally) uploads once
    from ``--out-dir``. Worker stdout/stderr inherit the parent's, so progress
    and the VTK noise land in the same streams the non-isolated path uses.

    Only fixtures whose worker exited with status 0 are uploaded, so a stale
    webp left in ``--out-dir`` by an earlier run is never committed for a
    fixture that failed this time (the non-isolated path likewise skips
    failed fixtures).

    Returns:
        ``1`` when at least one worker failed, otherwise ``0``. Argument
        problems exit through ``parser.error``.
    """
    if args.out_dir is None:
        parser.error("--isolate requires --out-dir (workers render to disk).")
    token = os.environ.get("HF_TOKEN")
    if not args.no_upload and not token:
        parser.error("HF_TOKEN required to upload (or pass --no-upload).")
    api = HfApi(token=token)
    gt_root = args.gt_root.resolve() if args.gt_root else None
    inputs_root = args.inputs_root.resolve() if args.inputs_root else None
    for label, root in (("--gt-root", gt_root), ("--inputs-root", inputs_root)):
        if root is not None and not root.is_dir():
            parser.error(f"{label} does not exist: {root}")

    fixtures = _resolved_fixtures(parser, args, api, gt_root, inputs_root)
    print(f"Isolated render of {len(fixtures)} editing fixture(s) (one subprocess each).", flush=True)
    print(f"FIXTURES: {' '.join(fixtures)}", flush=True)

    # Workers always render to disk and never upload; the parent uploads once.
    base_cmd = [sys.executable, str(Path(__file__).resolve()),
                "--out-dir", str(args.out_dir), "--no-upload",
                "--repo-id", args.repo_id, "--inputs-repo-id", args.inputs_repo_id]
    if gt_root is not None:
        base_cmd += ["--gt-root", str(gt_root)]
    if inputs_root is not None:
        base_cmd += ["--inputs-root", str(inputs_root)]

    rendered: list[str] = []
    failures: list[str] = []
    for i, fixture in enumerate(fixtures, start=1):
        print(f"=== [{i}/{len(fixtures)}] {fixture} ===", flush=True)
        # A non-zero exit is recorded rather than raised so one bad fixture
        # does not abort the remaining ones.
        proc = subprocess.run([*base_cmd, "--fixtures", fixture])  # noqa: S603, PLW1510
        if proc.returncode != 0:
            failures.append(fixture)
        else:
            rendered.append(fixture)

    print(
        f"Isolated render complete: {len(rendered)}/{len(fixtures)} ok, {len(failures)} failed.",
        flush=True,
    )
    if failures:
        print(f"FAILED: {' '.join(failures)}", flush=True)
    if not args.no_upload:
        _upload_from_out_dir(api, args.repo_id, args.out_dir, rendered)
        print("Done.", flush=True)
    return 1 if failures else 0


def main() -> int:
    """Parse the CLI and render (and, unless ``--no-upload``, upload) the webps.

    ``--upload-only`` and ``--isolate`` are dispatched to their own runners.
    Otherwise every selected editing fixture is rendered in this process; a
    fixture that raises is logged and skipped so the rest still run.

    Returns:
        ``0`` when every fixture rendered, ``1`` when at least one failed.
        Argument problems exit through ``parser.error`` (status 2).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--gt-root", type=Path, default=None,
        help="Local cadgenbench-data-gt checkout. Omit to download from the Hub.",
    )
    parser.add_argument(
        "--inputs-root", type=Path, default=None,
        help="Local cadgenbench-data checkout (holds input.step). Omit for Hub.",
    )
    parser.add_argument("--repo-id", default=_default_repo_id())
    parser.add_argument("--inputs-repo-id", default=_default_inputs_repo_id())
    parser.add_argument("--fixtures", help="Comma-separated fixture ids. Omit for all editing fixtures.")
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument(
        "--out-dir", type=Path, default=None,
        help="Also write each webp/png here (e.g. for local inspection).",
    )
    parser.add_argument(
        "--no-upload", action="store_true",
        help="Render only; do not commit to the GT dataset.",
    )
    parser.add_argument(
        "--upload-only", action="store_true",
        help=(
            "Skip rendering; commit the ``edit_diff_gt.webp`` files already under "
            "--out-dir to the GT dataset. Use after an isolated render run."
        ),
    )
    parser.add_argument(
        "--isolate", action="store_true",
        help=(
            "Render each fixture in its own subprocess. Works around macOS "
            "offscreen VTK losing its GL context after many sequential Plotter "
            "create/close cycles (not needed on the Linux EGL eval job). Implies "
            "render-to-out-dir; upload, if requested, runs once from --out-dir."
        ),
    )
    args = parser.parse_args()

    if args.upload_only:
        return _run_upload_only(parser, args)
    if args.isolate:
        return _run_isolated(parser, args)

    token = os.environ.get("HF_TOKEN")
    # Fail fast, before any Hub listing or rendering, when the upload that
    # would follow cannot be authenticated (same order as the isolated runner).
    if not args.no_upload and not token:
        parser.error("HF_TOKEN required to upload (or pass --no-upload).")

    api = HfApi(token=token)
    gt_root = args.gt_root.resolve() if args.gt_root else None
    inputs_root = args.inputs_root.resolve() if args.inputs_root else None
    for label, root in (("--gt-root", gt_root), ("--inputs-root", inputs_root)):
        if root is not None and not root.is_dir():
            parser.error(f"{label} does not exist: {root}")

    # Shared with the isolated runner so both paths select fixtures identically.
    fixtures = _resolved_fixtures(parser, args, api, gt_root, inputs_root)

    print(
        f"Rendering {len(fixtures)} editing GT answer-key turntable(s)"
        + ("" if args.no_upload else f" -> {args.repo_id} (will upload)"),
        flush=True,
    )
    print(f"FIXTURES: {' '.join(fixtures)}", flush=True)

    ops: list[CommitOperationAdd] = []
    failures: list[str] = []
    with tempfile.TemporaryDirectory(prefix="gt-edit-diff-") as tmp:
        cache_dir = Path(tmp)
        for i, fixture in enumerate(fixtures, start=1):
            print(f"[{i}/{len(fixtures)}] {fixture} ...", flush=True)
            try:
                gt_dir = _materialize_gt(
                    api, args.repo_id, fixture, gt_root, cache_dir, token,
                )
                input_step = _materialize_input(
                    api, args.inputs_repo_id, fixture, inputs_root, cache_dir, token,
                )
                full = _render_fixture(gt_dir, input_step)
            except Exception as e:  # noqa: BLE001 - log and keep going
                print(f"    FAILED {type(e).__name__}: {e}", flush=True)
                failures.append(fixture)
                continue

            print(f"    ok: full={len(full) // 1024}KB", flush=True)

            if args.out_dir is not None:
                fx_out = args.out_dir / fixture / "renders"
                fx_out.mkdir(parents=True, exist_ok=True)
                (fx_out / "edit_diff_gt.webp").write_bytes(full)

            if not args.no_upload:
                # Only keep the encoded webp in memory when it will be committed.
                ops.append(CommitOperationAdd(f"{fixture}/{FULL_NAME}", full))

        done = len(fixtures) - len(failures)
        print(
            f"Rendered {done}/{len(fixtures)} fixture(s) ({len(failures)} failed).",
            flush=True,
        )
        if failures:
            print(f"FAILED: {' '.join(failures)}", flush=True)
        if args.no_upload:
            print("Upload skipped (--no-upload).", flush=True)
            return 1 if failures else 0
        print(f"Uploading {len(ops)} file(s) to {args.repo_id} ...", flush=True)
        _commit_in_chunks(api, args.repo_id, ops)
    print("Done.", flush=True)
    return 1 if failures else 0


# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())