Commit ·
852fbdc
1
Parent(s): 4a26525
perf: run HT-Demucs reference cleaning on GPU (ZeroGPU CUDA EP)
Browse files
isolate_voice was CPU-bound at ~180s on the 2-vCPU Space, tripping the bot's voice timeouts. Decorate it with @spaces.GPU and select the CUDA onnxruntime execution provider inside the GPU scope (onnxruntime-gpu added), with a CPU fallback so local/CPU and any CUDA-init failure still work. The clone path calls the undecorated core to avoid nested GPU allocation. _MODEL_LOCK leak-fix and the /clone + /isolate_voice API signatures are unchanged.
- app.py +65 -11
- requirements.txt +9 -0
app.py
CHANGED
|
@@ -40,8 +40,8 @@ MODEL = None
|
|
| 40 |
#
|
| 41 |
# Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
|
| 42 |
# critical section so a request owns the model exclusively for its full
|
| 43 |
-
# synthesis. The
|
| 44 |
-
#
|
| 45 |
_MODEL_LOCK = threading.Lock()
|
| 46 |
|
| 47 |
# ── Faithful-cloning defaults ────────────────────────────────────────────────
|
|
@@ -118,8 +118,14 @@ def set_seed(seed: int):
|
|
| 118 |
# clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
|
| 119 |
# SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
|
| 120 |
# torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
|
| 121 |
-
#
|
| 122 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
_SEPARATOR_READY = None
|
| 124 |
|
| 125 |
|
|
@@ -136,11 +142,42 @@ def _ensure_separator():
|
|
| 136 |
return _SEPARATOR_READY or None
|
| 137 |
|
| 138 |
|
| 139 |
-
def
|
| 140 |
-
"""
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
"""
|
| 145 |
if not audio_path:
|
| 146 |
return audio_path
|
|
@@ -153,8 +190,11 @@ def isolate_voice(audio_path: str) -> str:
|
|
| 153 |
except Exception: # noqa: BLE001
|
| 154 |
sr = 44100
|
| 155 |
|
| 156 |
-
# htdemucs_ft vocals specialist
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
| 158 |
vocals = np.asarray(vocals, dtype=np.float32)
|
| 159 |
if vocals.ndim == 2:
|
| 160 |
vocals = vocals.mean(axis=0) # downmix to mono for the speaker encoder
|
|
@@ -171,6 +211,17 @@ def isolate_voice(audio_path: str) -> str:
|
|
| 171 |
return out_path
|
| 172 |
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
def isolate_voice_ui(audio_path: str):
|
| 175 |
"""UI/endpoint wrapper: preview the cleaned reference (api_name=/isolate_voice)."""
|
| 176 |
if not audio_path:
|
|
@@ -218,7 +269,10 @@ def _maybe_clean_reference(ref: str, clean_reference: bool) -> str:
|
|
| 218 |
if not (clean_reference and ref):
|
| 219 |
return ref
|
| 220 |
try:
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
| 222 |
except Exception as e: # noqa: BLE001
|
| 223 |
gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
|
| 224 |
return ref
|
|
|
|
| 40 |
#
|
| 41 |
# Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
|
| 42 |
# critical section so a request owns the model exclusively for its full
|
| 43 |
+
# synthesis. The reference cleaning (HT-Demucs) runs OUTSIDE the lock to keep
|
| 44 |
+
# the exclusive window as short as possible — it touches no shared model state.
|
| 45 |
_MODEL_LOCK = threading.Lock()
|
| 46 |
|
| 47 |
# ── Faithful-cloning defaults ────────────────────────────────────────────────
|
|
|
|
| 118 |
# clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
|
| 119 |
# SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
|
| 120 |
# torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
|
| 121 |
+
#
|
| 122 |
+
# PERF: HT-Demucs on the Space's ~2-vCPU CPU took up to/over 180s, which
|
| 123 |
+
# tripped the bot's voice timeouts. On ZeroGPU, CUDA is ONLY visible inside a
|
| 124 |
+
# `@spaces.GPU`-scoped call, so the public entrypoint (`isolate_voice`) is now
|
| 125 |
+
# GPU-decorated and runs the ONNX graph on the CUDA execution provider
|
| 126 |
+
# (requires `onnxruntime-gpu`; see requirements.txt). We ALWAYS keep a CPU
|
| 127 |
+
# provider in the list as a fallback, so local/CPU and any CUDA-init failure
|
| 128 |
+
# still produce a clean reference (just slower) instead of hard-breaking.
|
| 129 |
_SEPARATOR_READY = None
|
| 130 |
|
| 131 |
|
|
|
|
| 142 |
return _SEPARATOR_READY or None
|
| 143 |
|
| 144 |
|
| 145 |
+
def _separation_providers():
|
| 146 |
+
"""Pick ONNX Runtime execution providers for HT-Demucs.
|
| 147 |
|
| 148 |
+
Inside a `@spaces.GPU` scope on ZeroGPU, CUDA is available, so we run the
|
| 149 |
+
separation on the GPU (seconds vs minutes on the 2-vCPU CPU). CPU is always
|
| 150 |
+
kept as a fallback in the list, so if the CUDA EP can't be created (no
|
| 151 |
+
onnxruntime-gpu, cuDNN mismatch, or local/CPU host) ORT silently drops to
|
| 152 |
+
CPU instead of raising. Returns a value `demucs_onnx.separate_stem` accepts
|
| 153 |
+
for its `providers` kwarg.
|
| 154 |
+
"""
|
| 155 |
+
try:
|
| 156 |
+
if torch.cuda.is_available():
|
| 157 |
+
import onnxruntime as ort # noqa: PLC0415
|
| 158 |
+
# ZeroGPU has no global CUDA install; borrow the CUDA/cuDNN libs that
|
| 159 |
+
# torch already loaded (ORT >= 1.20). Best-effort — older builds find
|
| 160 |
+
# them on the path because torch is imported at module top.
|
| 161 |
+
if hasattr(ort, "preload_dlls"):
|
| 162 |
+
try:
|
| 163 |
+
ort.preload_dlls()
|
| 164 |
+
except Exception as e: # noqa: BLE001
|
| 165 |
+
print(f"WARNING: onnxruntime.preload_dlls() failed: {e}")
|
| 166 |
+
if "CUDAExecutionProvider" in ort.get_available_providers():
|
| 167 |
+
return ["CUDAExecutionProvider", "CPUExecutionProvider"]
|
| 168 |
+
print("WARNING: CUDAExecutionProvider not available to onnxruntime; "
|
| 169 |
+
"isolate_voice will run on CPU. Is onnxruntime-gpu installed?")
|
| 170 |
+
except Exception as e: # noqa: BLE001
|
| 171 |
+
print(f"WARNING: GPU provider selection failed, using CPU: {e}")
|
| 172 |
+
return "cpu"
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def _isolate_voice_impl(audio_path: str) -> str:
|
| 176 |
+
"""Core voice-isolation routine (NOT GPU-decorated).
|
| 177 |
+
|
| 178 |
+
Callable directly from another `@spaces.GPU` function (e.g. the clone path)
|
| 179 |
+
where CUDA is already attached, avoiding a nested GPU allocation. The public
|
| 180 |
+
`isolate_voice` wrapper below adds the GPU scope for the standalone endpoint.
|
| 181 |
"""
|
| 182 |
if not audio_path:
|
| 183 |
return audio_path
|
|
|
|
| 190 |
except Exception: # noqa: BLE001
|
| 191 |
sr = 44100
|
| 192 |
|
| 193 |
+
# htdemucs_ft vocals specialist. Runs on CUDA when available (see
|
| 194 |
+
# _separation_providers); falls back to CPU otherwise.
|
| 195 |
+
providers = _separation_providers()
|
| 196 |
+
print(f"isolate_voice: onnxruntime providers={providers}")
|
| 197 |
+
vocals = separate_stem(audio_path, "vocals", providers=providers) # (channels, samples)
|
| 198 |
vocals = np.asarray(vocals, dtype=np.float32)
|
| 199 |
if vocals.ndim == 2:
|
| 200 |
vocals = vocals.mean(axis=0) # downmix to mono for the speaker encoder
|
|
|
|
| 211 |
return out_path
|
| 212 |
|
| 213 |
|
| 214 |
+
@spaces.GPU(duration=90)
|
| 215 |
+
def isolate_voice(audio_path: str) -> str:
|
| 216 |
+
"""Return a path to a cleaned WAV with background music/noise removed.
|
| 217 |
+
|
| 218 |
+
GPU-scoped so HT-Demucs runs on CUDA under ZeroGPU (was CPU-bound at ~180s).
|
| 219 |
+
Falls back to the original clip (and warns) if separation is unavailable
|
| 220 |
+
or fails, so cloning never hard-breaks on a cleanup error.
|
| 221 |
+
"""
|
| 222 |
+
return _isolate_voice_impl(audio_path)
|
| 223 |
+
|
| 224 |
+
|
| 225 |
def isolate_voice_ui(audio_path: str):
|
| 226 |
"""UI/endpoint wrapper: preview the cleaned reference (api_name=/isolate_voice)."""
|
| 227 |
if not audio_path:
|
|
|
|
| 269 |
if not (clean_reference and ref):
|
| 270 |
return ref
|
| 271 |
try:
|
| 272 |
+
# Call the undecorated core: clone_and_speak is already @spaces.GPU, so
|
| 273 |
+
# CUDA is attached here. Re-entering isolate_voice (also GPU-decorated)
|
| 274 |
+
# would nest GPU allocations.
|
| 275 |
+
return _isolate_voice_impl(ref)
|
| 276 |
except Exception as e: # noqa: BLE001
|
| 277 |
gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
|
| 278 |
return ref
|
requirements.txt
CHANGED
|
@@ -20,6 +20,15 @@ safetensors
|
|
| 20 |
# Pinned to 0.3.3 (last release supporting the Space's Python 3.10; 0.3.4 needs 3.11+).
|
| 21 |
demucs-onnx==0.3.3
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# Optional language-specific normalizers (disabled for build reliability — English-first prototype).
|
| 24 |
# Re-enable only if you need advanced zh / ja / ru text normalization:
|
| 25 |
# spacy_pkuseg # Chinese text segmentation
|
|
|
|
| 20 |
# Pinned to 0.3.3 (last release supporting the Space's Python 3.10; 0.3.4 needs 3.11+).
|
| 21 |
demucs-onnx==0.3.3
|
| 22 |
|
| 23 |
+
# GPU execution provider for ONNX Runtime so HT-Demucs reference-cleaning runs on
|
| 24 |
+
# the ZeroGPU CUDA device (was CPU-bound at ~180s, tripping the bot's voice
|
| 25 |
+
# timeouts). demucs-onnx requires the CPU `onnxruntime`; this adds the CUDA EP
|
| 26 |
+
# alongside it. 1.20.1 = last line with cp310 wheels + CUDA 12 / cuDNN 9 default
|
| 27 |
+
# (matches the ZeroGPU A10G torch stack) and exposes onnxruntime.preload_dlls().
|
| 28 |
+
# Selection is runtime-guarded with a CPU fallback (see app.py _separation_providers),
|
| 29 |
+
# so a missing/mismatched CUDA EP degrades to CPU instead of breaking the Space.
|
| 30 |
+
onnxruntime-gpu==1.20.1
|
| 31 |
+
|
| 32 |
# Optional language-specific normalizers (disabled for build reliability — English-first prototype).
|
| 33 |
# Re-enable only if you need advanced zh / ja / ru text normalization:
|
| 34 |
# spacy_pkuseg # Chinese text segmentation
|