Spaces:

luh0502
/

NeAR

Configuration error

luh1124 committed 29 days ago

Commit

008d00e

1 Parent(s): 1aad7b7

fix(app): lazy-import trellis, preload NeAR on CPU, fix ZeroGPU duration

- Move trellis.pipelines import from module top-level into functions
to prevent gsplat from initialising CUDA in the main process.
- Enable background-thread CPU preload for NeAR (same pattern as Hunyuan3D
in app_hyshape.py) so the first @GPU callback only pays H2D move.
- Replace @_gpu(duration=600) with @GPU (120 s) and add @torch.no_grad()
so ZeroGPU scheduler accepts the lease and memory pressure is reduced.

Files changed (1) hide show

app.py +25 -21

app.py CHANGED Viewed

@@ -65,7 +65,9 @@ os.environ.setdefault("TORCH_CUDA_ARCH_LIST", "7.5;8.0;8.6;8.9;9.0")
 from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline  # type: ignore
 from hy3dshape.rembg import BackgroundRemover  # type: ignore
-from trellis.pipelines import NeARImageToRelightable3DPipeline
 # ── Paths ────────────────────────────────────────────────────────────────────
 APP_DIR = Path(__file__).resolve().parent
@@ -115,6 +117,7 @@ def _load_near_cpu_locked() -> None:
         return
     print("[NeAR] loading NeAR on CPU…", flush=True)
     t0 = time.time()
     PIPELINE = NeARImageToRelightable3DPipeline.from_pretrained("luh0502/NeAR")
     PIPELINE.to("cpu")
     # Ensure renderer/tone_mapper are NOT initialized on CPU — they need a live CUDA context.
@@ -137,13 +140,11 @@ def _load_geometry_cpu_locked() -> None:
 def _preload_worker() -> None:
-    """Mirror app_hyshape.py: load Hunyuan3D into CPU RAM under lock.
-    Hunyuan3D is safe to load in the main process (no CUDA init).
-    The GPU callback then only does .to("cuda") + inference — no download wait.
-    NeAR cannot be preloaded (BiRefNet triggers CUDA init in main process).
-    We only warm its disk cache so the GPU callback loads from disk, not network.
     """
     # Step 1: load Hunyuan3D into CPU RAM (same pattern as app_hyshape.py).
     try:
@@ -153,16 +154,19 @@ def _preload_worker() -> None:
     except Exception as exc:
         print(f"[NeAR] preload: Hunyuan3D failed: {exc}", flush=True)
-    # Step 2: warm NeAR disk cache (no CUDA, no lock, no instantiation).
     try:
-        from huggingface_hub import snapshot_download
-        snapshot_download(repo_id="luh0502/NeAR", token=os.environ.get("HF_TOKEN"))
-        print("[NeAR] preload: NeAR disk cache ready.", flush=True)
     except Exception as exc:
-        print(f"[NeAR] preload: NeAR disk cache failed: {exc}", flush=True)
     # Step 3: warm rembg model cache (briaai/RMBG-2.0, referenced in pipeline.yaml).
-    # Without this, the download happens inside the 240s GPU callback and times out.
     try:
         from huggingface_hub import snapshot_download
         snapshot_download(repo_id="briaai/RMBG-2.0", token=os.environ.get("HF_TOKEN"))
@@ -171,11 +175,8 @@ def _preload_worker() -> None:
         print(f"[NeAR] preload: RMBG-2.0 disk cache failed: {exc}", flush=True)
     # Step 4: pre-download DINOv2 weights file only (no model instantiation).
-    # torch.hub.load instantiates the model which imports xformers → triggers CUDA init
-    # in the main process, breaking ZeroGPU's context management.
-    # download_url_to_file is pure urllib — no CUDA. The GPU callback will still need
-    # to download the small GitHub repo code on cold start, but the 1.13 GB weights
-    # file is the slow part and will be served from this local cache.
     if not (os.environ.get("NEAR_DINO_LOCAL_REPO") or os.environ.get("NEAR_AUX_REPO")):
         try:
             import torch
@@ -197,8 +198,10 @@ def _preload_worker() -> None:
 # tone_mapper because each ZeroGPU call has a fresh CUDA context.
 def _ensure_near_on_cuda() -> NeARImageToRelightable3DPipeline:
-    # NeAR loads lazily here (BiRefNet/DINOv2 need CUDA available — only safe
-    # inside @GPU callback, not in main process).
     # ZeroGPU runs one GPU callback at a time so no lock is needed.
     _load_near_cpu_locked()
     assert PIPELINE is not None
@@ -500,7 +503,8 @@ def load_slat_file(
     return state, f"SLaT loaded: `{Path(resolved).name}`"
-@_gpu(duration=600)
 def generate_renderings(
     asset_state: Dict[str, Any],
     hdri_file_obj: Any,

 from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline  # type: ignore
 from hy3dshape.rembg import BackgroundRemover  # type: ignore
+# NeAR is imported lazily inside functions to avoid pulling gsplat / heavy
+# submodules into the main process, which can initialise CUDA and break
+# Hugging Face ZeroGPU context management.
 # ── Paths ────────────────────────────────────────────────────────────────────
 APP_DIR = Path(__file__).resolve().parent
         return
     print("[NeAR] loading NeAR on CPU…", flush=True)
     t0 = time.time()
+    from trellis.pipelines import NeARImageToRelightable3DPipeline
     PIPELINE = NeARImageToRelightable3DPipeline.from_pretrained("luh0502/NeAR")
     PIPELINE.to("cpu")
     # Ensure renderer/tone_mapper are NOT initialized on CPU — they need a live CUDA context.
 def _preload_worker() -> None:
+    """Mirror app_hyshape.py: load Hunyuan3D + NeAR into CPU RAM under lock.
+    Hunyuan3D and NeAR are loaded on CPU only (no .to(cuda)) so they do not
+    initialise a CUDA context in the main process.  The GPU callbacks then only
+    pay H2D move + inference — no download wait.
     """
     # Step 1: load Hunyuan3D into CPU RAM (same pattern as app_hyshape.py).
     try:
     except Exception as exc:
         print(f"[NeAR] preload: Hunyuan3D failed: {exc}", flush=True)
+    # Step 2: load NeAR into CPU RAM.  Previously omitted because of a suspected
+    # BiRefNet CUDA-init side-effect, but BiRefNet (via transformers) loads on
+    # CPU by default.  Doing this in the background thread avoids a multi-GB
+    # from_pretrained + disk download inside the 120 s ZeroGPU lease.
     try:
+        with _MODEL_LOCK:
+            _load_near_cpu_locked()
+        print("[NeAR] preload: NeAR in CPU RAM.", flush=True)
     except Exception as exc:
+        print(f"[NeAR] preload: NeAR failed (will lazy-load in GPU callback): {exc}", flush=True)
     # Step 3: warm rembg model cache (briaai/RMBG-2.0, referenced in pipeline.yaml).
+    # Kept as a safety net even though BiRefNet is the default rembg backend.
     try:
         from huggingface_hub import snapshot_download
         snapshot_download(repo_id="briaai/RMBG-2.0", token=os.environ.get("HF_TOKEN"))
         print(f"[NeAR] preload: RMBG-2.0 disk cache failed: {exc}", flush=True)
     # Step 4: pre-download DINOv2 weights file only (no model instantiation).
+    # If Step 2 succeeded, torch.hub.load already cached the weights.  This step
+    # is kept as a fallback when NeAR preloading is skipped or fails.
     if not (os.environ.get("NEAR_DINO_LOCAL_REPO") or os.environ.get("NEAR_AUX_REPO")):
         try:
             import torch
 # tone_mapper because each ZeroGPU call has a fresh CUDA context.
 def _ensure_near_on_cuda() -> NeARImageToRelightable3DPipeline:
+    # NeAR is pre-loaded into CPU RAM by the background thread.  This call
+    # simply moves weights to CUDA and re-creates renderer / tone_mapper for
+    # the fresh ZeroGPU context.  If preloading failed, it falls back to a
+    # full from_pretrained inside the GPU callback (slower, same result).
     # ZeroGPU runs one GPU callback at a time so no lock is needed.
     _load_near_cpu_locked()
     assert PIPELINE is not None
     return state, f"SLaT loaded: `{Path(resolved).name}`"
+@GPU
+@torch.no_grad()
 def generate_renderings(
     asset_state: Dict[str, Any],
     hdri_file_obj: Any,