ZeroPointMonkey committed on
Commit
852fbdc
·
1 Parent(s): 4a26525

perf: run HT-Demucs reference cleaning on GPU (ZeroGPU CUDA EP)

Browse files

isolate_voice was CPU-bound at ~180s on the 2-vCPU Space, tripping the bot's voice timeouts. Decorate it with @spaces.GPU and select the CUDA onnxruntime execution provider inside the GPU scope (onnxruntime-gpu added), with a CPU fallback so local/CPU and any CUDA-init failure still work. The clone path calls the undecorated core to avoid nested GPU allocation. _MODEL_LOCK leak-fix and the /clone + /isolate_voice API signatures are unchanged.

Files changed (2) hide show
  1. app.py +65 -11
  2. requirements.txt +9 -0
app.py CHANGED
@@ -40,8 +40,8 @@ MODEL = None
40
  #
41
  # Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
42
  # critical section so a request owns the model exclusively for its full
43
- # synthesis. The (CPU, GPU-budget-free) reference cleaning runs OUTSIDE the
44
- # lock to keep the exclusive window as short as possible.
45
  _MODEL_LOCK = threading.Lock()
46
 
47
  # ── Faithful-cloning defaults ────────────────────────────────────────────────
@@ -118,8 +118,14 @@ def set_seed(seed: int):
118
  # clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
119
  # SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
120
  # torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
121
- # Runs on CPU so it does NOT consume the ZeroGPU budget. Designed as the first
122
- # member of a future "audio cleanup" feature group (denoise, trim, normalize…).
 
 
 
 
 
 
123
  _SEPARATOR_READY = None
124
 
125
 
@@ -136,11 +142,42 @@ def _ensure_separator():
136
  return _SEPARATOR_READY or None
137
 
138
 
139
- def isolate_voice(audio_path: str) -> str:
140
- """Return a path to a cleaned WAV with background music/noise removed.
141
 
142
- Falls back to the original clip (and warns) if separation is unavailable
143
- or fails, so cloning never hard-breaks on a cleanup error.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  """
145
  if not audio_path:
146
  return audio_path
@@ -153,8 +190,11 @@ def isolate_voice(audio_path: str) -> str:
153
  except Exception: # noqa: BLE001
154
  sr = 44100
155
 
156
- # htdemucs_ft vocals specialist (CPU keeps this off the ZeroGPU budget).
157
- vocals = separate_stem(audio_path, "vocals", providers="cpu") # (channels, samples)
 
 
 
158
  vocals = np.asarray(vocals, dtype=np.float32)
159
  if vocals.ndim == 2:
160
  vocals = vocals.mean(axis=0) # downmix to mono for the speaker encoder
@@ -171,6 +211,17 @@ def isolate_voice(audio_path: str) -> str:
171
  return out_path
172
 
173
 
 
 
 
 
 
 
 
 
 
 
 
174
  def isolate_voice_ui(audio_path: str):
175
  """UI/endpoint wrapper: preview the cleaned reference (api_name=/isolate_voice)."""
176
  if not audio_path:
@@ -218,7 +269,10 @@ def _maybe_clean_reference(ref: str, clean_reference: bool) -> str:
218
  if not (clean_reference and ref):
219
  return ref
220
  try:
221
- return isolate_voice(ref)
 
 
 
222
  except Exception as e: # noqa: BLE001
223
  gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
224
  return ref
 
40
  #
41
  # Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
42
  # critical section so a request owns the model exclusively for its full
43
+ # synthesis. The reference cleaning (HT-Demucs) runs OUTSIDE the lock to keep
44
+ # the exclusive window as short as possible — it touches no shared model state.
45
  _MODEL_LOCK = threading.Lock()
46
 
47
  # ── Faithful-cloning defaults ────────────────────────────────────────────────
 
118
  # clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
119
  # SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
120
  # torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
121
+ #
122
+ # PERF: HT-Demucs on the Space's ~2-vCPU CPU took up to/over 180s, which
123
+ # tripped the bot's voice timeouts. On ZeroGPU, CUDA is ONLY visible inside a
124
+ # `@spaces.GPU`-scoped call, so the public entrypoint (`isolate_voice`) is now
125
+ # GPU-decorated and runs the ONNX graph on the CUDA execution provider
126
+ # (requires `onnxruntime-gpu`; see requirements.txt). We ALWAYS keep a CPU
127
+ # provider in the list as a fallback, so local/CPU and any CUDA-init failure
128
+ # still produce a clean reference (just slower) instead of hard-breaking.
129
  _SEPARATOR_READY = None
130
 
131
 
 
142
  return _SEPARATOR_READY or None
143
 
144
 
145
+ def _separation_providers():
146
+ """Pick ONNX Runtime execution providers for HT-Demucs.
147
 
148
+ Inside a `@spaces.GPU` scope on ZeroGPU, CUDA is available, so we run the
149
+ separation on the GPU (seconds vs minutes on the 2-vCPU CPU). CPU is always
150
+ kept as a fallback in the list, so if the CUDA EP can't be created (no
151
+ onnxruntime-gpu, cuDNN mismatch, or local/CPU host) ORT silently drops to
152
+ CPU instead of raising. Returns a value `demucs_onnx.separate_stem` accepts
153
+ for its `providers` kwarg.
154
+ """
155
+ try:
156
+ if torch.cuda.is_available():
157
+ import onnxruntime as ort # noqa: PLC0415
158
+ # ZeroGPU has no global CUDA install; borrow the CUDA/cuDNN libs that
159
+ # torch already loaded (ORT >= 1.20). Best-effort — older builds find
160
+ # them on the path because torch is imported at module top.
161
+ if hasattr(ort, "preload_dlls"):
162
+ try:
163
+ ort.preload_dlls()
164
+ except Exception as e: # noqa: BLE001
165
+ print(f"WARNING: onnxruntime.preload_dlls() failed: {e}")
166
+ if "CUDAExecutionProvider" in ort.get_available_providers():
167
+ return ["CUDAExecutionProvider", "CPUExecutionProvider"]
168
+ print("WARNING: CUDAExecutionProvider not available to onnxruntime; "
169
+ "isolate_voice will run on CPU. Is onnxruntime-gpu installed?")
170
+ except Exception as e: # noqa: BLE001
171
+ print(f"WARNING: GPU provider selection failed, using CPU: {e}")
172
+ return "cpu"
173
+
174
+
175
+ def _isolate_voice_impl(audio_path: str) -> str:
176
+ """Core voice-isolation routine (NOT GPU-decorated).
177
+
178
+ Callable directly from another `@spaces.GPU` function (e.g. the clone path)
179
+ where CUDA is already attached, avoiding a nested GPU allocation. The public
180
+ `isolate_voice` wrapper below adds the GPU scope for the standalone endpoint.
181
  """
182
  if not audio_path:
183
  return audio_path
 
190
  except Exception: # noqa: BLE001
191
  sr = 44100
192
 
193
+ # htdemucs_ft vocals specialist. Runs on CUDA when available (see
194
+ # _separation_providers); falls back to CPU otherwise.
195
+ providers = _separation_providers()
196
+ print(f"isolate_voice: onnxruntime providers={providers}")
197
+ vocals = separate_stem(audio_path, "vocals", providers=providers) # (channels, samples)
198
  vocals = np.asarray(vocals, dtype=np.float32)
199
  if vocals.ndim == 2:
200
  vocals = vocals.mean(axis=0) # downmix to mono for the speaker encoder
 
211
  return out_path
212
 
213
 
214
+ @spaces.GPU(duration=90)
215
+ def isolate_voice(audio_path: str) -> str:
216
+ """Return a path to a cleaned WAV with background music/noise removed.
217
+
218
+ GPU-scoped so HT-Demucs runs on CUDA under ZeroGPU (was CPU-bound at ~180s).
219
+ Falls back to the original clip (and warns) if separation is unavailable
220
+ or fails, so cloning never hard-breaks on a cleanup error.
221
+ """
222
+ return _isolate_voice_impl(audio_path)
223
+
224
+
225
  def isolate_voice_ui(audio_path: str):
226
  """UI/endpoint wrapper: preview the cleaned reference (api_name=/isolate_voice)."""
227
  if not audio_path:
 
269
  if not (clean_reference and ref):
270
  return ref
271
  try:
272
+ # Call the undecorated core: clone_and_speak is already @spaces.GPU, so
273
+ # CUDA is attached here. Re-entering isolate_voice (also GPU-decorated)
274
+ # would nest GPU allocations.
275
+ return _isolate_voice_impl(ref)
276
  except Exception as e: # noqa: BLE001
277
  gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
278
  return ref
requirements.txt CHANGED
@@ -20,6 +20,15 @@ safetensors
20
  # Pinned to 0.3.3 (last release supporting the Space's Python 3.10; 0.3.4 needs 3.11+).
21
  demucs-onnx==0.3.3
22
 
 
 
 
 
 
 
 
 
 
23
  # Optional language-specific normalizers (disabled for build reliability — English-first prototype).
24
  # Re-enable only if you need advanced zh / ja / ru text normalization:
25
  # spacy_pkuseg # Chinese text segmentation
 
20
  # Pinned to 0.3.3 (last release supporting the Space's Python 3.10; 0.3.4 needs 3.11+).
21
  demucs-onnx==0.3.3
22
 
23
+ # GPU execution provider for ONNX Runtime so HT-Demucs reference-cleaning runs on
24
+ # the ZeroGPU CUDA device (was CPU-bound at ~180s, tripping the bot's voice
25
+ # timeouts). demucs-onnx requires the CPU `onnxruntime`; this adds the CUDA EP
26
+ # alongside it. 1.20.1 = last line with cp310 wheels + CUDA 12 / cuDNN 9 default
27
+ # (matches the ZeroGPU A10G torch stack) and exposes onnxruntime.preload_dlls().
28
+ # Selection is runtime-guarded with a CPU fallback (see app.py _separation_providers),
29
+ # so a missing/mismatched CUDA EP degrades to CPU instead of breaking the Space.
30
+ onnxruntime-gpu==1.20.1
31
+
32
  # Optional language-specific normalizers (disabled for build reliability — English-first prototype).
33
  # Re-enable only if you need advanced zh / ja / ru text normalization:
34
  # spacy_pkuseg # Chinese text segmentation