luh1124 committed on
Commit
008d00e
·
1 Parent(s): 1aad7b7

fix(app): lazy-import trellis, preload NeAR on CPU, fix ZeroGPU duration

Browse files

- Move trellis.pipelines import from module top-level into functions
to prevent gsplat from initialising CUDA in the main process.
- Enable background-thread CPU preload for NeAR (same pattern as Hunyuan3D
in app_hyshape.py) so the first @GPU callback only pays H2D move.
- Replace @_gpu(duration=600) with @GPU (120 s) and add @torch.no_grad()
so ZeroGPU scheduler accepts the lease and memory pressure is reduced.

Files changed (1) hide show
  1. app.py +25 -21
app.py CHANGED
@@ -65,7 +65,9 @@ os.environ.setdefault("TORCH_CUDA_ARCH_LIST", "7.5;8.0;8.6;8.9;9.0")
65
 
66
  from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline # type: ignore
67
  from hy3dshape.rembg import BackgroundRemover # type: ignore
68
- from trellis.pipelines import NeARImageToRelightable3DPipeline
 
 
69
 
70
  # ── Paths ────────────────────────────────────────────────────────────────────
71
  APP_DIR = Path(__file__).resolve().parent
@@ -115,6 +117,7 @@ def _load_near_cpu_locked() -> None:
115
  return
116
  print("[NeAR] loading NeAR on CPU…", flush=True)
117
  t0 = time.time()
 
118
  PIPELINE = NeARImageToRelightable3DPipeline.from_pretrained("luh0502/NeAR")
119
  PIPELINE.to("cpu")
120
  # Ensure renderer/tone_mapper are NOT initialized on CPU — they need a live CUDA context.
@@ -137,13 +140,11 @@ def _load_geometry_cpu_locked() -> None:
137
 
138
 
139
  def _preload_worker() -> None:
140
- """Mirror app_hyshape.py: load Hunyuan3D into CPU RAM under lock.
141
 
142
- Hunyuan3D is safe to load in the main process (no CUDA init).
143
- The GPU callback then only does .to("cuda") + inference — no download wait.
144
-
145
- NeAR cannot be preloaded (BiRefNet triggers CUDA init in main process).
146
- We only warm its disk cache so the GPU callback loads from disk, not network.
147
  """
148
  # Step 1: load Hunyuan3D into CPU RAM (same pattern as app_hyshape.py).
149
  try:
@@ -153,16 +154,19 @@ def _preload_worker() -> None:
153
  except Exception as exc:
154
  print(f"[NeAR] preload: Hunyuan3D failed: {exc}", flush=True)
155
 
156
- # Step 2: warm NeAR disk cache (no CUDA, no lock, no instantiation).
 
 
 
157
  try:
158
- from huggingface_hub import snapshot_download
159
- snapshot_download(repo_id="luh0502/NeAR", token=os.environ.get("HF_TOKEN"))
160
- print("[NeAR] preload: NeAR disk cache ready.", flush=True)
161
  except Exception as exc:
162
- print(f"[NeAR] preload: NeAR disk cache failed: {exc}", flush=True)
163
 
164
  # Step 3: warm rembg model cache (briaai/RMBG-2.0, referenced in pipeline.yaml).
165
- # Without this, the download happens inside the 240s GPU callback and times out.
166
  try:
167
  from huggingface_hub import snapshot_download
168
  snapshot_download(repo_id="briaai/RMBG-2.0", token=os.environ.get("HF_TOKEN"))
@@ -171,11 +175,8 @@ def _preload_worker() -> None:
171
  print(f"[NeAR] preload: RMBG-2.0 disk cache failed: {exc}", flush=True)
172
 
173
  # Step 4: pre-download DINOv2 weights file only (no model instantiation).
174
- # torch.hub.load instantiates the model which imports xformers → triggers CUDA init
175
- # in the main process, breaking ZeroGPU's context management.
176
- # download_url_to_file is pure urllib — no CUDA. The GPU callback will still need
177
- # to download the small GitHub repo code on cold start, but the 1.13 GB weights
178
- # file is the slow part and will be served from this local cache.
179
  if not (os.environ.get("NEAR_DINO_LOCAL_REPO") or os.environ.get("NEAR_AUX_REPO")):
180
  try:
181
  import torch
@@ -197,8 +198,10 @@ def _preload_worker() -> None:
197
  # tone_mapper because each ZeroGPU call has a fresh CUDA context.
198
 
199
  def _ensure_near_on_cuda() -> NeARImageToRelightable3DPipeline:
200
- # NeAR loads lazily here (BiRefNet/DINOv2 need CUDA available — only safe
201
- # inside @GPU callback, not in main process).
 
 
202
  # ZeroGPU runs one GPU callback at a time so no lock is needed.
203
  _load_near_cpu_locked()
204
  assert PIPELINE is not None
@@ -500,7 +503,8 @@ def load_slat_file(
500
  return state, f"SLaT loaded: `{Path(resolved).name}`"
501
 
502
 
503
- @_gpu(duration=600)
 
504
  def generate_renderings(
505
  asset_state: Dict[str, Any],
506
  hdri_file_obj: Any,
 
65
 
66
  from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline # type: ignore
67
  from hy3dshape.rembg import BackgroundRemover # type: ignore
68
+ # NeAR is imported lazily inside functions to avoid pulling gsplat / heavy
69
+ # submodules into the main process, which can initialise CUDA and break
70
+ # Hugging Face ZeroGPU context management.
71
 
72
  # ── Paths ────────────────────────────────────────────────────────────────────
73
  APP_DIR = Path(__file__).resolve().parent
 
117
  return
118
  print("[NeAR] loading NeAR on CPU…", flush=True)
119
  t0 = time.time()
120
+ from trellis.pipelines import NeARImageToRelightable3DPipeline
121
  PIPELINE = NeARImageToRelightable3DPipeline.from_pretrained("luh0502/NeAR")
122
  PIPELINE.to("cpu")
123
  # Ensure renderer/tone_mapper are NOT initialized on CPU — they need a live CUDA context.
 
140
 
141
 
142
  def _preload_worker() -> None:
143
+ """Mirror app_hyshape.py: load Hunyuan3D + NeAR into CPU RAM under lock.
144
 
145
+ Hunyuan3D and NeAR are loaded on CPU only (no .to(cuda)) so they do not
146
+ initialise a CUDA context in the main process. The GPU callbacks then only
147
+ pay H2D move + inference — no download wait.
 
 
148
  """
149
  # Step 1: load Hunyuan3D into CPU RAM (same pattern as app_hyshape.py).
150
  try:
 
154
  except Exception as exc:
155
  print(f"[NeAR] preload: Hunyuan3D failed: {exc}", flush=True)
156
 
157
+ # Step 2: load NeAR into CPU RAM. Previously omitted because of a suspected
158
+ # BiRefNet CUDA-init side-effect, but BiRefNet (via transformers) loads on
159
+ # CPU by default. Doing this in the background thread avoids a multi-GB
160
+ # from_pretrained + disk download inside the 120 s ZeroGPU lease.
161
  try:
162
+ with _MODEL_LOCK:
163
+ _load_near_cpu_locked()
164
+ print("[NeAR] preload: NeAR in CPU RAM.", flush=True)
165
  except Exception as exc:
166
+ print(f"[NeAR] preload: NeAR failed (will lazy-load in GPU callback): {exc}", flush=True)
167
 
168
  # Step 3: warm rembg model cache (briaai/RMBG-2.0, referenced in pipeline.yaml).
169
+ # Kept as a safety net even though BiRefNet is the default rembg backend.
170
  try:
171
  from huggingface_hub import snapshot_download
172
  snapshot_download(repo_id="briaai/RMBG-2.0", token=os.environ.get("HF_TOKEN"))
 
175
  print(f"[NeAR] preload: RMBG-2.0 disk cache failed: {exc}", flush=True)
176
 
177
  # Step 4: pre-download DINOv2 weights file only (no model instantiation).
178
+ # If Step 2 succeeded, torch.hub.load already cached the weights. This step
179
+ # is kept as a fallback when NeAR preloading is skipped or fails.
 
 
 
180
  if not (os.environ.get("NEAR_DINO_LOCAL_REPO") or os.environ.get("NEAR_AUX_REPO")):
181
  try:
182
  import torch
 
198
  # tone_mapper because each ZeroGPU call has a fresh CUDA context.
199
 
200
  def _ensure_near_on_cuda() -> NeARImageToRelightable3DPipeline:
201
+ # NeAR is pre-loaded into CPU RAM by the background thread. This call
202
+ # simply moves weights to CUDA and re-creates renderer / tone_mapper for
203
+ # the fresh ZeroGPU context. If preloading failed, it falls back to a
204
+ # full from_pretrained inside the GPU callback (slower, same result).
205
  # ZeroGPU runs one GPU callback at a time so no lock is needed.
206
  _load_near_cpu_locked()
207
  assert PIPELINE is not None
 
503
  return state, f"SLaT loaded: `{Path(resolved).name}`"
504
 
505
 
506
+ @GPU
507
+ @torch.no_grad()
508
  def generate_renderings(
509
  asset_state: Dict[str, Any],
510
  hdri_file_obj: Any,