fix(app): lazy-import trellis, preload NeAR on CPU, fix ZeroGPU duration

- Move the trellis.pipelines import from module top-level into functions
  to prevent gsplat from initialising CUDA in the main process.
- Enable background-thread CPU preload for NeAR (same pattern as Hunyuan3D
  in app_hyshape.py) so the first @GPU callback only pays for the H2D move.
- Replace @_gpu(duration=600) with @GPU (120 s) and add @torch.no_grad()
  so the ZeroGPU scheduler accepts the lease and memory pressure is reduced.
app.py
CHANGED
|
@@ -65,7 +65,9 @@ os.environ.setdefault("TORCH_CUDA_ARCH_LIST", "7.5;8.0;8.6;8.9;9.0")
|
|
| 65 |
|
| 66 |
from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline # type: ignore
|
| 67 |
from hy3dshape.rembg import BackgroundRemover # type: ignore
|
| 68 |
-
|
|
|
|
|
|
|
| 69 |
|
| 70 |
# ── Paths ────────────────────────────────────────────────────────────────────
|
| 71 |
APP_DIR = Path(__file__).resolve().parent
|
|
@@ -115,6 +117,7 @@ def _load_near_cpu_locked() -> None:
|
|
| 115 |
return
|
| 116 |
print("[NeAR] loading NeAR on CPU…", flush=True)
|
| 117 |
t0 = time.time()
|
|
|
|
| 118 |
PIPELINE = NeARImageToRelightable3DPipeline.from_pretrained("luh0502/NeAR")
|
| 119 |
PIPELINE.to("cpu")
|
| 120 |
# Ensure renderer/tone_mapper are NOT initialized on CPU — they need a live CUDA context.
|
|
@@ -137,13 +140,11 @@ def _load_geometry_cpu_locked() -> None:
|
|
| 137 |
|
| 138 |
|
| 139 |
def _preload_worker() -> None:
|
| 140 |
-
"""Mirror app_hyshape.py: load Hunyuan3D into CPU RAM under lock.
|
| 141 |
|
| 142 |
-
Hunyuan3D
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
NeAR cannot be preloaded (BiRefNet triggers CUDA init in main process).
|
| 146 |
-
We only warm its disk cache so the GPU callback loads from disk, not network.
|
| 147 |
"""
|
| 148 |
# Step 1: load Hunyuan3D into CPU RAM (same pattern as app_hyshape.py).
|
| 149 |
try:
|
|
@@ -153,16 +154,19 @@ def _preload_worker() -> None:
|
|
| 153 |
except Exception as exc:
|
| 154 |
print(f"[NeAR] preload: Hunyuan3D failed: {exc}", flush=True)
|
| 155 |
|
| 156 |
-
# Step 2:
|
|
|
|
|
|
|
|
|
|
| 157 |
try:
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
print("[NeAR] preload: NeAR
|
| 161 |
except Exception as exc:
|
| 162 |
-
print(f"[NeAR] preload: NeAR
|
| 163 |
|
| 164 |
# Step 3: warm rembg model cache (briaai/RMBG-2.0, referenced in pipeline.yaml).
|
| 165 |
-
#
|
| 166 |
try:
|
| 167 |
from huggingface_hub import snapshot_download
|
| 168 |
snapshot_download(repo_id="briaai/RMBG-2.0", token=os.environ.get("HF_TOKEN"))
|
|
@@ -171,11 +175,8 @@ def _preload_worker() -> None:
|
|
| 171 |
print(f"[NeAR] preload: RMBG-2.0 disk cache failed: {exc}", flush=True)
|
| 172 |
|
| 173 |
# Step 4: pre-download DINOv2 weights file only (no model instantiation).
|
| 174 |
-
# torch.hub.load
|
| 175 |
-
#
|
| 176 |
-
# download_url_to_file is pure urllib — no CUDA. The GPU callback will still need
|
| 177 |
-
# to download the small GitHub repo code on cold start, but the 1.13 GB weights
|
| 178 |
-
# file is the slow part and will be served from this local cache.
|
| 179 |
if not (os.environ.get("NEAR_DINO_LOCAL_REPO") or os.environ.get("NEAR_AUX_REPO")):
|
| 180 |
try:
|
| 181 |
import torch
|
|
@@ -197,8 +198,10 @@ def _preload_worker() -> None:
|
|
| 197 |
# tone_mapper because each ZeroGPU call has a fresh CUDA context.
|
| 198 |
|
| 199 |
def _ensure_near_on_cuda() -> NeARImageToRelightable3DPipeline:
|
| 200 |
-
# NeAR
|
| 201 |
-
#
|
|
|
|
|
|
|
| 202 |
# ZeroGPU runs one GPU callback at a time so no lock is needed.
|
| 203 |
_load_near_cpu_locked()
|
| 204 |
assert PIPELINE is not None
|
|
@@ -500,7 +503,8 @@ def load_slat_file(
|
|
| 500 |
return state, f"SLaT loaded: `{Path(resolved).name}`"
|
| 501 |
|
| 502 |
|
| 503 |
-
@
|
|
|
|
| 504 |
def generate_renderings(
|
| 505 |
asset_state: Dict[str, Any],
|
| 506 |
hdri_file_obj: Any,
|
|
|
|
| 65 |
|
| 66 |
from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline # type: ignore
|
| 67 |
from hy3dshape.rembg import BackgroundRemover # type: ignore
|
| 68 |
+
# NeAR is imported lazily inside functions to avoid pulling gsplat / heavy
|
| 69 |
+
# submodules into the main process, which can initialise CUDA and break
|
| 70 |
+
# Hugging Face ZeroGPU context management.
|
| 71 |
|
| 72 |
# ── Paths ────────────────────────────────────────────────────────────────────
|
| 73 |
APP_DIR = Path(__file__).resolve().parent
|
|
|
|
| 117 |
return
|
| 118 |
print("[NeAR] loading NeAR on CPU…", flush=True)
|
| 119 |
t0 = time.time()
|
| 120 |
+
from trellis.pipelines import NeARImageToRelightable3DPipeline
|
| 121 |
PIPELINE = NeARImageToRelightable3DPipeline.from_pretrained("luh0502/NeAR")
|
| 122 |
PIPELINE.to("cpu")
|
| 123 |
# Ensure renderer/tone_mapper are NOT initialized on CPU — they need a live CUDA context.
|
|
|
|
| 140 |
|
| 141 |
|
| 142 |
def _preload_worker() -> None:
|
| 143 |
+
"""Mirror app_hyshape.py: load Hunyuan3D + NeAR into CPU RAM under lock.
|
| 144 |
|
| 145 |
+
Hunyuan3D and NeAR are loaded on CPU only (no .to(cuda)) so they do not
|
| 146 |
+
initialise a CUDA context in the main process. The GPU callbacks then only
|
| 147 |
+
pay H2D move + inference β no download wait.
|
|
|
|
|
|
|
| 148 |
"""
|
| 149 |
# Step 1: load Hunyuan3D into CPU RAM (same pattern as app_hyshape.py).
|
| 150 |
try:
|
|
|
|
| 154 |
except Exception as exc:
|
| 155 |
print(f"[NeAR] preload: Hunyuan3D failed: {exc}", flush=True)
|
| 156 |
|
| 157 |
+
# Step 2: load NeAR into CPU RAM. Previously omitted because of a suspected
|
| 158 |
+
# BiRefNet CUDA-init side-effect, but BiRefNet (via transformers) loads on
|
| 159 |
+
# CPU by default. Doing this in the background thread avoids a multi-GB
|
| 160 |
+
# from_pretrained + disk download inside the 120 s ZeroGPU lease.
|
| 161 |
try:
|
| 162 |
+
with _MODEL_LOCK:
|
| 163 |
+
_load_near_cpu_locked()
|
| 164 |
+
print("[NeAR] preload: NeAR in CPU RAM.", flush=True)
|
| 165 |
except Exception as exc:
|
| 166 |
+
print(f"[NeAR] preload: NeAR failed (will lazy-load in GPU callback): {exc}", flush=True)
|
| 167 |
|
| 168 |
# Step 3: warm rembg model cache (briaai/RMBG-2.0, referenced in pipeline.yaml).
|
| 169 |
+
# Kept as a safety net even though BiRefNet is the default rembg backend.
|
| 170 |
try:
|
| 171 |
from huggingface_hub import snapshot_download
|
| 172 |
snapshot_download(repo_id="briaai/RMBG-2.0", token=os.environ.get("HF_TOKEN"))
|
|
|
|
| 175 |
print(f"[NeAR] preload: RMBG-2.0 disk cache failed: {exc}", flush=True)
|
| 176 |
|
| 177 |
# Step 4: pre-download DINOv2 weights file only (no model instantiation).
|
| 178 |
+
# If Step 2 succeeded, torch.hub.load already cached the weights. This step
|
| 179 |
+
# is kept as a fallback when NeAR preloading is skipped or fails.
|
|
|
|
|
|
|
|
|
|
| 180 |
if not (os.environ.get("NEAR_DINO_LOCAL_REPO") or os.environ.get("NEAR_AUX_REPO")):
|
| 181 |
try:
|
| 182 |
import torch
|
|
|
|
| 198 |
# tone_mapper because each ZeroGPU call has a fresh CUDA context.
|
| 199 |
|
| 200 |
def _ensure_near_on_cuda() -> NeARImageToRelightable3DPipeline:
|
| 201 |
+
# NeAR is pre-loaded into CPU RAM by the background thread. This call
|
| 202 |
+
# simply moves weights to CUDA and re-creates renderer / tone_mapper for
|
| 203 |
+
# the fresh ZeroGPU context. If preloading failed, it falls back to a
|
| 204 |
+
# full from_pretrained inside the GPU callback (slower, same result).
|
| 205 |
# ZeroGPU runs one GPU callback at a time so no lock is needed.
|
| 206 |
_load_near_cpu_locked()
|
| 207 |
assert PIPELINE is not None
|
|
|
|
| 503 |
return state, f"SLaT loaded: `{Path(resolved).name}`"
|
| 504 |
|
| 505 |
|
| 506 |
+
@GPU
|
| 507 |
+
@torch.no_grad()
|
| 508 |
def generate_renderings(
|
| 509 |
asset_state: Dict[str, Any],
|
| 510 |
hdri_file_obj: Any,
|