Commit ·
751f97c
1
Parent(s): 852fbdc
Revert "perf: run HT-Demucs reference cleaning on GPU (ZeroGPU CUDA EP)"
Browse files
This reverts commit 852fbdcded8d8b32402a347e632470a0b50643b8.
- app.py +11 -65
- requirements.txt +0 -9
app.py
CHANGED
|
@@ -40,8 +40,8 @@ MODEL = None
|
|
| 40 |
#
|
| 41 |
# Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
|
| 42 |
# critical section so a request owns the model exclusively for its full
|
| 43 |
-
# synthesis. The
|
| 44 |
-
# the exclusive window as short as possible
|
| 45 |
_MODEL_LOCK = threading.Lock()
|
| 46 |
|
| 47 |
# ── Faithful-cloning defaults ────────────────────────────────────────────────
|
|
@@ -118,14 +118,8 @@ def set_seed(seed: int):
|
|
| 118 |
# clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
|
| 119 |
# SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
|
| 120 |
# torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
|
| 121 |
-
#
|
| 122 |
-
#
|
| 123 |
-
# tripped the bot's voice timeouts. On ZeroGPU, CUDA is ONLY visible inside a
|
| 124 |
-
# `@spaces.GPU`-scoped call, so the public entrypoint (`isolate_voice`) is now
|
| 125 |
-
# GPU-decorated and runs the ONNX graph on the CUDA execution provider
|
| 126 |
-
# (requires `onnxruntime-gpu`; see requirements.txt). We ALWAYS keep a CPU
|
| 127 |
-
# provider in the list as a fallback, so local/CPU and any CUDA-init failure
|
| 128 |
-
# still produce a clean reference (just slower) instead of hard-breaking.
|
| 129 |
_SEPARATOR_READY = None
|
| 130 |
|
| 131 |
|
|
@@ -142,42 +136,11 @@ def _ensure_separator():
|
|
| 142 |
return _SEPARATOR_READY or None
|
| 143 |
|
| 144 |
|
| 145 |
-
def
|
| 146 |
-
"""
|
| 147 |
-
|
| 148 |
-
Inside a `@spaces.GPU` scope on ZeroGPU, CUDA is available, so we run the
|
| 149 |
-
separation on the GPU (seconds vs minutes on the 2-vCPU CPU). CPU is always
|
| 150 |
-
kept as a fallback in the list, so if the CUDA EP can't be created (no
|
| 151 |
-
onnxruntime-gpu, cuDNN mismatch, or local/CPU host) ORT silently drops to
|
| 152 |
-
CPU instead of raising. Returns a value `demucs_onnx.separate_stem` accepts
|
| 153 |
-
for its `providers` kwarg.
|
| 154 |
-
"""
|
| 155 |
-
try:
|
| 156 |
-
if torch.cuda.is_available():
|
| 157 |
-
import onnxruntime as ort # noqa: PLC0415
|
| 158 |
-
# ZeroGPU has no global CUDA install; borrow the CUDA/cuDNN libs that
|
| 159 |
-
# torch already loaded (ORT >= 1.20). Best-effort — older builds find
|
| 160 |
-
# them on the path because torch is imported at module top.
|
| 161 |
-
if hasattr(ort, "preload_dlls"):
|
| 162 |
-
try:
|
| 163 |
-
ort.preload_dlls()
|
| 164 |
-
except Exception as e: # noqa: BLE001
|
| 165 |
-
print(f"WARNING: onnxruntime.preload_dlls() failed: {e}")
|
| 166 |
-
if "CUDAExecutionProvider" in ort.get_available_providers():
|
| 167 |
-
return ["CUDAExecutionProvider", "CPUExecutionProvider"]
|
| 168 |
-
print("WARNING: CUDAExecutionProvider not available to onnxruntime; "
|
| 169 |
-
"isolate_voice will run on CPU. Is onnxruntime-gpu installed?")
|
| 170 |
-
except Exception as e: # noqa: BLE001
|
| 171 |
-
print(f"WARNING: GPU provider selection failed, using CPU: {e}")
|
| 172 |
-
return "cpu"
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
def _isolate_voice_impl(audio_path: str) -> str:
|
| 176 |
-
"""Core voice-isolation routine (NOT GPU-decorated).
|
| 177 |
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
`isolate_voice` wrapper below adds the GPU scope for the standalone endpoint.
|
| 181 |
"""
|
| 182 |
if not audio_path:
|
| 183 |
return audio_path
|
|
@@ -190,11 +153,8 @@ def _isolate_voice_impl(audio_path: str) -> str:
|
|
| 190 |
except Exception: # noqa: BLE001
|
| 191 |
sr = 44100
|
| 192 |
|
| 193 |
-
# htdemucs_ft vocals specialist
|
| 194 |
-
|
| 195 |
-
providers = _separation_providers()
|
| 196 |
-
print(f"isolate_voice: onnxruntime providers={providers}")
|
| 197 |
-
vocals = separate_stem(audio_path, "vocals", providers=providers) # (channels, samples)
|
| 198 |
vocals = np.asarray(vocals, dtype=np.float32)
|
| 199 |
if vocals.ndim == 2:
|
| 200 |
vocals = vocals.mean(axis=0) # downmix to mono for the speaker encoder
|
|
@@ -211,17 +171,6 @@ def _isolate_voice_impl(audio_path: str) -> str:
|
|
| 211 |
return out_path
|
| 212 |
|
| 213 |
|
| 214 |
-
@spaces.GPU(duration=90)
|
| 215 |
-
def isolate_voice(audio_path: str) -> str:
|
| 216 |
-
"""Return a path to a cleaned WAV with background music/noise removed.
|
| 217 |
-
|
| 218 |
-
GPU-scoped so HT-Demucs runs on CUDA under ZeroGPU (was CPU-bound at ~180s).
|
| 219 |
-
Falls back to the original clip (and warns) if separation is unavailable
|
| 220 |
-
or fails, so cloning never hard-breaks on a cleanup error.
|
| 221 |
-
"""
|
| 222 |
-
return _isolate_voice_impl(audio_path)
|
| 223 |
-
|
| 224 |
-
|
| 225 |
def isolate_voice_ui(audio_path: str):
|
| 226 |
"""UI/endpoint wrapper: preview the cleaned reference (api_name=/isolate_voice)."""
|
| 227 |
if not audio_path:
|
|
@@ -269,10 +218,7 @@ def _maybe_clean_reference(ref: str, clean_reference: bool) -> str:
|
|
| 269 |
if not (clean_reference and ref):
|
| 270 |
return ref
|
| 271 |
try:
|
| 272 |
-
|
| 273 |
-
# CUDA is attached here. Re-entering isolate_voice (also GPU-decorated)
|
| 274 |
-
# would nest GPU allocations.
|
| 275 |
-
return _isolate_voice_impl(ref)
|
| 276 |
except Exception as e: # noqa: BLE001
|
| 277 |
gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
|
| 278 |
return ref
|
|
|
|
| 40 |
#
|
| 41 |
# Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
|
| 42 |
# critical section so a request owns the model exclusively for its full
|
| 43 |
+
# synthesis. The (CPU, GPU-budget-free) reference cleaning runs OUTSIDE the
|
| 44 |
+
# lock to keep the exclusive window as short as possible.
|
| 45 |
_MODEL_LOCK = threading.Lock()
|
| 46 |
|
| 47 |
# ── Faithful-cloning defaults ────────────────────────────────────────────────
|
|
|
|
| 118 |
# clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
|
| 119 |
# SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
|
| 120 |
# torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
|
| 121 |
+
# Runs on CPU so it does NOT consume the ZeroGPU budget. Designed as the first
|
| 122 |
+
# member of a future "audio cleanup" feature group (denoise, trim, normalize…).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
_SEPARATOR_READY = None
|
| 124 |
|
| 125 |
|
|
|
|
| 136 |
return _SEPARATOR_READY or None
|
| 137 |
|
| 138 |
|
| 139 |
+
def isolate_voice(audio_path: str) -> str:
|
| 140 |
+
"""Return a path to a cleaned WAV with background music/noise removed.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
+
Falls back to the original clip (and warns) if separation is unavailable
|
| 143 |
+
or fails, so cloning never hard-breaks on a cleanup error.
|
|
|
|
| 144 |
"""
|
| 145 |
if not audio_path:
|
| 146 |
return audio_path
|
|
|
|
| 153 |
except Exception: # noqa: BLE001
|
| 154 |
sr = 44100
|
| 155 |
|
| 156 |
+
# htdemucs_ft vocals specialist (CPU keeps this off the ZeroGPU budget).
|
| 157 |
+
vocals = separate_stem(audio_path, "vocals", providers="cpu") # (channels, samples)
|
|
|
|
|
|
|
|
|
|
| 158 |
vocals = np.asarray(vocals, dtype=np.float32)
|
| 159 |
if vocals.ndim == 2:
|
| 160 |
vocals = vocals.mean(axis=0) # downmix to mono for the speaker encoder
|
|
|
|
| 171 |
return out_path
|
| 172 |
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
def isolate_voice_ui(audio_path: str):
|
| 175 |
"""UI/endpoint wrapper: preview the cleaned reference (api_name=/isolate_voice)."""
|
| 176 |
if not audio_path:
|
|
|
|
| 218 |
if not (clean_reference and ref):
|
| 219 |
return ref
|
| 220 |
try:
|
| 221 |
+
return isolate_voice(ref)
|
|
|
|
|
|
|
|
|
|
| 222 |
except Exception as e: # noqa: BLE001
|
| 223 |
gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
|
| 224 |
return ref
|
requirements.txt
CHANGED
|
@@ -20,15 +20,6 @@ safetensors
|
|
| 20 |
# Pinned to 0.3.3 (last release supporting the Space's Python 3.10; 0.3.4 needs 3.11+).
|
| 21 |
demucs-onnx==0.3.3
|
| 22 |
|
| 23 |
-
# GPU execution provider for ONNX Runtime so HT-Demucs reference-cleaning runs on
|
| 24 |
-
# the ZeroGPU CUDA device (was CPU-bound at ~180s, tripping the bot's voice
|
| 25 |
-
# timeouts). demucs-onnx requires the CPU `onnxruntime`; this adds the CUDA EP
|
| 26 |
-
# alongside it. 1.20.1 = last line with cp310 wheels + CUDA 12 / cuDNN 9 default
|
| 27 |
-
# (matches the ZeroGPU A10G torch stack) and exposes onnxruntime.preload_dlls().
|
| 28 |
-
# Selection is runtime-guarded with a CPU fallback (see app.py _separation_providers),
|
| 29 |
-
# so a missing/mismatched CUDA EP degrades to CPU instead of breaking the Space.
|
| 30 |
-
onnxruntime-gpu==1.20.1
|
| 31 |
-
|
| 32 |
# Optional language-specific normalizers (disabled for build reliability — English-first prototype).
|
| 33 |
# Re-enable only if you need advanced zh / ja / ru text normalization:
|
| 34 |
# spacy_pkuseg # Chinese text segmentation
|
|
|
|
| 20 |
# Pinned to 0.3.3 (last release supporting the Space's Python 3.10; 0.3.4 needs 3.11+).
|
| 21 |
demucs-onnx==0.3.3
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# Optional language-specific normalizers (disabled for build reliability — English-first prototype).
|
| 24 |
# Re-enable only if you need advanced zh / ja / ru text normalization:
|
| 25 |
# spacy_pkuseg # Chinese text segmentation
|