ZeroPointMonkey committed on
Commit
751f97c
·
1 Parent(s): 852fbdc

Revert "perf: run HT-Demucs reference cleaning on GPU (ZeroGPU CUDA EP)"

Browse files

This reverts commit 852fbdcded8d8b32402a347e632470a0b50643b8.

Files changed (2) hide show
  1. app.py +11 -65
  2. requirements.txt +0 -9
app.py CHANGED
@@ -40,8 +40,8 @@ MODEL = None
40
  #
41
  # Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
42
  # critical section so a request owns the model exclusively for its full
43
- # synthesis. The reference cleaning (HT-Demucs) runs OUTSIDE the lock to keep
44
- # the exclusive window as short as possible — it touches no shared model state.
45
  _MODEL_LOCK = threading.Lock()
46
 
47
  # ── Faithful-cloning defaults ────────────────────────────────────────────────
@@ -118,14 +118,8 @@ def set_seed(seed: int):
118
  # clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
119
  # SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
120
  # torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
121
- #
122
- # PERF: HT-Demucs on the Space's ~2-vCPU CPU took up to/over 180s, which
123
- # tripped the bot's voice timeouts. On ZeroGPU, CUDA is ONLY visible inside a
124
- # `@spaces.GPU`-scoped call, so the public entrypoint (`isolate_voice`) is now
125
- # GPU-decorated and runs the ONNX graph on the CUDA execution provider
126
- # (requires `onnxruntime-gpu`; see requirements.txt). We ALWAYS keep a CPU
127
- # provider in the list as a fallback, so local/CPU and any CUDA-init failure
128
- # still produce a clean reference (just slower) instead of hard-breaking.
129
  _SEPARATOR_READY = None
130
 
131
 
@@ -142,42 +136,11 @@ def _ensure_separator():
142
  return _SEPARATOR_READY or None
143
 
144
 
145
- def _separation_providers():
146
- """Pick ONNX Runtime execution providers for HT-Demucs.
147
-
148
- Inside a `@spaces.GPU` scope on ZeroGPU, CUDA is available, so we run the
149
- separation on the GPU (seconds vs minutes on the 2-vCPU CPU). CPU is always
150
- kept as a fallback in the list, so if the CUDA EP can't be created (no
151
- onnxruntime-gpu, cuDNN mismatch, or local/CPU host) ORT silently drops to
152
- CPU instead of raising. Returns a value `demucs_onnx.separate_stem` accepts
153
- for its `providers` kwarg.
154
- """
155
- try:
156
- if torch.cuda.is_available():
157
- import onnxruntime as ort # noqa: PLC0415
158
- # ZeroGPU has no global CUDA install; borrow the CUDA/cuDNN libs that
159
- # torch already loaded (ORT >= 1.20). Best-effort — older builds find
160
- # them on the path because torch is imported at module top.
161
- if hasattr(ort, "preload_dlls"):
162
- try:
163
- ort.preload_dlls()
164
- except Exception as e: # noqa: BLE001
165
- print(f"WARNING: onnxruntime.preload_dlls() failed: {e}")
166
- if "CUDAExecutionProvider" in ort.get_available_providers():
167
- return ["CUDAExecutionProvider", "CPUExecutionProvider"]
168
- print("WARNING: CUDAExecutionProvider not available to onnxruntime; "
169
- "isolate_voice will run on CPU. Is onnxruntime-gpu installed?")
170
- except Exception as e: # noqa: BLE001
171
- print(f"WARNING: GPU provider selection failed, using CPU: {e}")
172
- return "cpu"
173
-
174
-
175
- def _isolate_voice_impl(audio_path: str) -> str:
176
- """Core voice-isolation routine (NOT GPU-decorated).
177
 
178
- Callable directly from another `@spaces.GPU` function (e.g. the clone path)
179
- where CUDA is already attached, avoiding a nested GPU allocation. The public
180
- `isolate_voice` wrapper below adds the GPU scope for the standalone endpoint.
181
  """
182
  if not audio_path:
183
  return audio_path
@@ -190,11 +153,8 @@ def _isolate_voice_impl(audio_path: str) -> str:
190
  except Exception: # noqa: BLE001
191
  sr = 44100
192
 
193
- # htdemucs_ft vocals specialist. Runs on CUDA when available (see
194
- # _separation_providers); falls back to CPU otherwise.
195
- providers = _separation_providers()
196
- print(f"isolate_voice: onnxruntime providers={providers}")
197
- vocals = separate_stem(audio_path, "vocals", providers=providers) # (channels, samples)
198
  vocals = np.asarray(vocals, dtype=np.float32)
199
  if vocals.ndim == 2:
200
  vocals = vocals.mean(axis=0) # downmix to mono for the speaker encoder
@@ -211,17 +171,6 @@ def _isolate_voice_impl(audio_path: str) -> str:
211
  return out_path
212
 
213
 
214
- @spaces.GPU(duration=90)
215
- def isolate_voice(audio_path: str) -> str:
216
- """Return a path to a cleaned WAV with background music/noise removed.
217
-
218
- GPU-scoped so HT-Demucs runs on CUDA under ZeroGPU (was CPU-bound at ~180s).
219
- Falls back to the original clip (and warns) if separation is unavailable
220
- or fails, so cloning never hard-breaks on a cleanup error.
221
- """
222
- return _isolate_voice_impl(audio_path)
223
-
224
-
225
  def isolate_voice_ui(audio_path: str):
226
  """UI/endpoint wrapper: preview the cleaned reference (api_name=/isolate_voice)."""
227
  if not audio_path:
@@ -269,10 +218,7 @@ def _maybe_clean_reference(ref: str, clean_reference: bool) -> str:
269
  if not (clean_reference and ref):
270
  return ref
271
  try:
272
- # Call the undecorated core: clone_and_speak is already @spaces.GPU, so
273
- # CUDA is attached here. Re-entering isolate_voice (also GPU-decorated)
274
- # would nest GPU allocations.
275
- return _isolate_voice_impl(ref)
276
  except Exception as e: # noqa: BLE001
277
  gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
278
  return ref
 
40
  #
41
  # Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
42
  # critical section so a request owns the model exclusively for its full
43
+ # synthesis. The (CPU, GPU-budget-free) reference cleaning runs OUTSIDE the
44
+ # lock to keep the exclusive window as short as possible.
45
  _MODEL_LOCK = threading.Lock()
46
 
47
  # ── Faithful-cloning defaults ────────────────────────────────────────────────
 
118
  # clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
119
  # SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
120
  # torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
121
+ # Runs on CPU so it does NOT consume the ZeroGPU budget. Designed as the first
122
+ # member of a future "audio cleanup" feature group (denoise, trim, normalize…).
 
 
 
 
 
 
123
  _SEPARATOR_READY = None
124
 
125
 
 
136
  return _SEPARATOR_READY or None
137
 
138
 
139
+ def isolate_voice(audio_path: str) -> str:
140
+ """Return a path to a cleaned WAV with background music/noise removed.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
+ Falls back to the original clip (and warns) if separation is unavailable
143
+ or fails, so cloning never hard-breaks on a cleanup error.
 
144
  """
145
  if not audio_path:
146
  return audio_path
 
153
  except Exception: # noqa: BLE001
154
  sr = 44100
155
 
156
+ # htdemucs_ft vocals specialist (CPU keeps this off the ZeroGPU budget).
157
+ vocals = separate_stem(audio_path, "vocals", providers="cpu") # (channels, samples)
 
 
 
158
  vocals = np.asarray(vocals, dtype=np.float32)
159
  if vocals.ndim == 2:
160
  vocals = vocals.mean(axis=0) # downmix to mono for the speaker encoder
 
171
  return out_path
172
 
173
 
 
 
 
 
 
 
 
 
 
 
 
174
  def isolate_voice_ui(audio_path: str):
175
  """UI/endpoint wrapper: preview the cleaned reference (api_name=/isolate_voice)."""
176
  if not audio_path:
 
218
  if not (clean_reference and ref):
219
  return ref
220
  try:
221
+ return isolate_voice(ref)
 
 
 
222
  except Exception as e: # noqa: BLE001
223
  gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
224
  return ref
requirements.txt CHANGED
@@ -20,15 +20,6 @@ safetensors
20
  # Pinned to 0.3.3 (last release supporting the Space's Python 3.10; 0.3.4 needs 3.11+).
21
  demucs-onnx==0.3.3
22
 
23
- # GPU execution provider for ONNX Runtime so HT-Demucs reference-cleaning runs on
24
- # the ZeroGPU CUDA device (was CPU-bound at ~180s, tripping the bot's voice
25
- # timeouts). demucs-onnx requires the CPU `onnxruntime`; this adds the CUDA EP
26
- # alongside it. 1.20.1 = last line with cp310 wheels + CUDA 12 / cuDNN 9 default
27
- # (matches the ZeroGPU A10G torch stack) and exposes onnxruntime.preload_dlls().
28
- # Selection is runtime-guarded with a CPU fallback (see app.py _separation_providers),
29
- # so a missing/mismatched CUDA EP degrades to CPU instead of breaking the Space.
30
- onnxruntime-gpu==1.20.1
31
-
32
  # Optional language-specific normalizers (disabled for build reliability — English-first prototype).
33
  # Re-enable only if you need advanced zh / ja / ru text normalization:
34
  # spacy_pkuseg # Chinese text segmentation
 
20
  # Pinned to 0.3.3 (last release supporting the Space's Python 3.10; 0.3.4 needs 3.11+).
21
  demucs-onnx==0.3.3
22
 
 
 
 
 
 
 
 
 
 
23
  # Optional language-specific normalizers (disabled for build reliability — English-first prototype).
24
  # Re-enable only if you need advanced zh / ja / ru text normalization:
25
  # spacy_pkuseg # Chinese text segmentation