Spaces:

ZeroPointMonkey
/

voice-clone-bench

Paused

ZeroPointMonkey committed 6 days ago

Commit

852fbdc

1 Parent(s): 4a26525

perf: run HT-Demucs reference cleaning on GPU (ZeroGPU CUDA EP)

isolate_voice was CPU-bound at ~180s on the 2-vCPU Space, tripping the bot's voice timeouts. Decorate it with @spaces.GPU and select the CUDA onnxruntime execution provider inside the GPU scope (onnxruntime-gpu added), with a CPU fallback so local/CPU and any CUDA-init failure still work. The clone path calls the undecorated core to avoid nested GPU allocation. _MODEL_LOCK leak-fix and the /clone + /isolate_voice API signatures are unchanged.

Files changed (2) hide show

app.py +65 -11
requirements.txt +9 -0

app.py CHANGED Viewed

@@ -40,8 +40,8 @@ MODEL = None
 #
 # Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
 # critical section so a request owns the model exclusively for its full
-# synthesis. The (CPU, GPU-budget-free) reference cleaning runs OUTSIDE the
-# lock to keep the exclusive window as short as possible.
 _MODEL_LOCK = threading.Lock()
 # ── Faithful-cloning defaults ────────────────────────────────────────────────
@@ -118,8 +118,14 @@ def set_seed(seed: int):
 # clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
 # SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
 # torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
-# Runs on CPU so it does NOT consume the ZeroGPU budget. Designed as the first
-# member of a future "audio cleanup" feature group (denoise, trim, normalize…).
 _SEPARATOR_READY = None
@@ -136,11 +142,42 @@ def _ensure_separator():
     return _SEPARATOR_READY or None
-def isolate_voice(audio_path: str) -> str:
-    """Return a path to a cleaned WAV with background music/noise removed.
-    Falls back to the original clip (and warns) if separation is unavailable
-    or fails, so cloning never hard-breaks on a cleanup error.
     """
     if not audio_path:
         return audio_path
@@ -153,8 +190,11 @@ def isolate_voice(audio_path: str) -> str:
     except Exception:  # noqa: BLE001
         sr = 44100
-    # htdemucs_ft vocals specialist (CPU keeps this off the ZeroGPU budget).
-    vocals = separate_stem(audio_path, "vocals", providers="cpu")  # (channels, samples)
     vocals = np.asarray(vocals, dtype=np.float32)
     if vocals.ndim == 2:
         vocals = vocals.mean(axis=0)  # downmix to mono for the speaker encoder
@@ -171,6 +211,17 @@ def isolate_voice(audio_path: str) -> str:
     return out_path
 def isolate_voice_ui(audio_path: str):
     """UI/endpoint wrapper: preview the cleaned reference (api_name=/isolate_voice)."""
     if not audio_path:
@@ -218,7 +269,10 @@ def _maybe_clean_reference(ref: str, clean_reference: bool) -> str:
     if not (clean_reference and ref):
         return ref
     try:
-        return isolate_voice(ref)
     except Exception as e:  # noqa: BLE001
         gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
         return ref

 #
 # Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
 # critical section so a request owns the model exclusively for its full
+# synthesis. The reference cleaning (HT-Demucs) runs OUTSIDE the lock to keep
+# the exclusive window as short as possible — it touches no shared model state.
 _MODEL_LOCK = threading.Lock()
 # ── Faithful-cloning defaults ────────────────────────────────────────────────
 # clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
 # SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
 # torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
+#
+# PERF: HT-Demucs on the Space's ~2-vCPU CPU took ~180s or longer, which
+# tripped the bot's voice timeouts. On ZeroGPU, CUDA is ONLY visible inside a
+# `@spaces.GPU`-scoped call, so the public entrypoint (`isolate_voice`) is now
+# GPU-decorated and runs the ONNX graph on the CUDA execution provider
+# (requires `onnxruntime-gpu`; see requirements.txt). We ALWAYS keep a CPU
+# provider in the list as a fallback, so local/CPU and any CUDA-init failure
+# still produce a clean reference (just slower) instead of hard-breaking.
 _SEPARATOR_READY = None
     return _SEPARATOR_READY or None
+def _separation_providers():
+    """Pick ONNX Runtime execution providers for HT-Demucs.
+    Inside a `@spaces.GPU` scope on ZeroGPU, CUDA is available, so we run the
+    separation on the GPU (seconds vs minutes on the 2-vCPU CPU). CPU is always
+    kept as a fallback in the list, so if the CUDA EP can't be created (no
+    onnxruntime-gpu, cuDNN mismatch, or local/CPU host) ORT silently drops to
+    CPU instead of raising. Returns a value `demucs_onnx.separate_stem` accepts
+    for its `providers` kwarg.
+    """
+    try:
+        if torch.cuda.is_available():
+            import onnxruntime as ort  # noqa: PLC0415
+            # ZeroGPU has no global CUDA install; borrow the CUDA/cuDNN libs that
+            # torch already loaded via preload_dlls() (documented for ORT >= 1.21;
+            # the hasattr guard below skips it on older builds). Best-effort —
+            # older builds find them on the path because torch is imported at
+            # module top.
+            if hasattr(ort, "preload_dlls"):
+                try:
+                    ort.preload_dlls()
+                except Exception as e:  # noqa: BLE001
+                    print(f"WARNING: onnxruntime.preload_dlls() failed: {e}")
+            if "CUDAExecutionProvider" in ort.get_available_providers():
+                return ["CUDAExecutionProvider", "CPUExecutionProvider"]
+            print("WARNING: CUDAExecutionProvider not available to onnxruntime; "
+                  "isolate_voice will run on CPU. Is onnxruntime-gpu installed?")
+    except Exception as e:  # noqa: BLE001
+        print(f"WARNING: GPU provider selection failed, using CPU: {e}")
+    return "cpu"
+def _isolate_voice_impl(audio_path: str) -> str:
+    """Core voice-isolation routine (NOT GPU-decorated).
+    Callable directly from another `@spaces.GPU` function (e.g. the clone path)
+    where CUDA is already attached, avoiding a nested GPU allocation. The public
+    `isolate_voice` wrapper below adds the GPU scope for the standalone endpoint.
     """
     if not audio_path:
         return audio_path
     except Exception:  # noqa: BLE001
         sr = 44100
+    # htdemucs_ft vocals specialist. Runs on CUDA when available (see
+    # _separation_providers); falls back to CPU otherwise.
+    providers = _separation_providers()
+    print(f"isolate_voice: onnxruntime providers={providers}")
+    vocals = separate_stem(audio_path, "vocals", providers=providers)  # (channels, samples)
     vocals = np.asarray(vocals, dtype=np.float32)
     if vocals.ndim == 2:
         vocals = vocals.mean(axis=0)  # downmix to mono for the speaker encoder
     return out_path
+@spaces.GPU(duration=90)
+def isolate_voice(audio_path: str) -> str:
+    """Return a path to a cleaned WAV with background music/noise removed.
+    GPU-scoped so HT-Demucs runs on CUDA under ZeroGPU (was CPU-bound at ~180s).
+    Falls back to the original clip (and warns) if separation is unavailable
+    or fails, so cloning never hard-breaks on a cleanup error.
+    """
+    return _isolate_voice_impl(audio_path)
 def isolate_voice_ui(audio_path: str):
     """UI/endpoint wrapper: preview the cleaned reference (api_name=/isolate_voice)."""
     if not audio_path:
     if not (clean_reference and ref):
         return ref
     try:
+        # Call the undecorated core: clone_and_speak is already @spaces.GPU, so
+        # CUDA is attached here. Re-entering isolate_voice (also GPU-decorated)
+        # would nest GPU allocations.
+        return _isolate_voice_impl(ref)
     except Exception as e:  # noqa: BLE001
         gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
         return ref

requirements.txt CHANGED Viewed

@@ -20,6 +20,15 @@ safetensors
 # Pinned to 0.3.3 (last release supporting the Space's Python 3.10; 0.3.4 needs 3.11+).
 demucs-onnx==0.3.3
 # Optional language-specific normalizers (disabled for build reliability — English-first prototype).
 # Re-enable only if you need advanced zh / ja / ru text normalization:
 # spacy_pkuseg          # Chinese text segmentation

 # Pinned to 0.3.3 (last release supporting the Space's Python 3.10; 0.3.4 needs 3.11+).
 demucs-onnx==0.3.3
+# GPU execution provider for ONNX Runtime so HT-Demucs reference-cleaning runs on
+# the ZeroGPU CUDA device (was CPU-bound at ~180s, tripping the bot's voice
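+# timeouts). demucs-onnx requires the CPU `onnxruntime`; this adds the CUDA EP
+# alongside it. NOTE(review): both wheels install the same `onnxruntime` module
+# and the ORT install docs advise keeping only one per environment — confirm at
+# startup that get_available_providers() really lists CUDAExecutionProvider.
+# 1.20.1 = last line with cp310 wheels + CUDA 12 / cuDNN 9 default (matches the
+# ZeroGPU A10G torch stack). NOTE(review): onnxruntime.preload_dlls() is
+# documented from 1.21, so on this pin the hasattr guard in app.py skips it.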
+# Selection is runtime-guarded with a CPU fallback (see app.py _separation_providers),
+# so a missing/mismatched CUDA EP degrades to CPU instead of breaking the Space.
+onnxruntime-gpu==1.20.1
 # Optional language-specific normalizers (disabled for build reliability — English-first prototype).
 # Re-enable only if you need advanced zh / ja / ru text normalization:
 # spacy_pkuseg          # Chinese text segmentation