Spaces:

ZeroPointMonkey
/

voice-clone-bench

Paused

App Files Files Community

ZeroPointMonkey committed 6 days ago

Commit

751f97c

1 Parent(s): 852fbdc

Revert "perf: run HT-Demucs reference cleaning on GPU (ZeroGPU CUDA EP)"

Browse files

This reverts commit 852fbdcded8d8b32402a347e632470a0b50643b8.

Files changed (2) hide show

app.py +11 -65
requirements.txt +0 -9

app.py CHANGED Viewed

@@ -40,8 +40,8 @@ MODEL = None
 #
 # Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
 # critical section so a request owns the model exclusively for its full
-# synthesis. The reference cleaning (HT-Demucs) runs OUTSIDE the lock to keep
-# the exclusive window as short as possible — it touches no shared model state.
 _MODEL_LOCK = threading.Lock()
 # ── Faithful-cloning defaults ────────────────────────────────────────────────
@@ -118,14 +118,8 @@ def set_seed(seed: int):
 # clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
 # SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
 # torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
-#
-# PERF: HT-Demucs on the Space's ~2-vCPU CPU took up to/over 180s, which
-# tripped the bot's voice timeouts. On ZeroGPU, CUDA is ONLY visible inside a
-# `@spaces.GPU`-scoped call, so the public entrypoint (`isolate_voice`) is now
-# GPU-decorated and runs the ONNX graph on the CUDA execution provider
-# (requires `onnxruntime-gpu`; see requirements.txt). We ALWAYS keep a CPU
-# provider in the list as a fallback, so local/CPU and any CUDA-init failure
-# still produce a clean reference (just slower) instead of hard-breaking.
 _SEPARATOR_READY = None
@@ -142,42 +136,11 @@ def _ensure_separator():
     return _SEPARATOR_READY or None
-def _separation_providers():
-    """Pick ONNX Runtime execution providers for HT-Demucs.
-    Inside a `@spaces.GPU` scope on ZeroGPU, CUDA is available, so we run the
-    separation on the GPU (seconds vs minutes on the 2-vCPU CPU). CPU is always
-    kept as a fallback in the list, so if the CUDA EP can't be created (no
-    onnxruntime-gpu, cuDNN mismatch, or local/CPU host) ORT silently drops to
-    CPU instead of raising. Returns a value `demucs_onnx.separate_stem` accepts
-    for its `providers` kwarg.
-    """
-    try:
-        if torch.cuda.is_available():
-            import onnxruntime as ort  # noqa: PLC0415
-            # ZeroGPU has no global CUDA install; borrow the CUDA/cuDNN libs that
-            # torch already loaded (ORT >= 1.20). Best-effort — older builds find
-            # them on the path because torch is imported at module top.
-            if hasattr(ort, "preload_dlls"):
-                try:
-                    ort.preload_dlls()
-                except Exception as e:  # noqa: BLE001
-                    print(f"WARNING: onnxruntime.preload_dlls() failed: {e}")
-            if "CUDAExecutionProvider" in ort.get_available_providers():
-                return ["CUDAExecutionProvider", "CPUExecutionProvider"]
-            print("WARNING: CUDAExecutionProvider not available to onnxruntime; "
-                  "isolate_voice will run on CPU. Is onnxruntime-gpu installed?")
-    except Exception as e:  # noqa: BLE001
-        print(f"WARNING: GPU provider selection failed, using CPU: {e}")
-    return "cpu"
-def _isolate_voice_impl(audio_path: str) -> str:
-    """Core voice-isolation routine (NOT GPU-decorated).
-    Callable directly from another `@spaces.GPU` function (e.g. the clone path)
-    where CUDA is already attached, avoiding a nested GPU allocation. The public
-    `isolate_voice` wrapper below adds the GPU scope for the standalone endpoint.
     """
     if not audio_path:
         return audio_path
@@ -190,11 +153,8 @@ def _isolate_voice_impl(audio_path: str) -> str:
     except Exception:  # noqa: BLE001
         sr = 44100
-    # htdemucs_ft vocals specialist. Runs on CUDA when available (see
-    # _separation_providers); falls back to CPU otherwise.
-    providers = _separation_providers()
-    print(f"isolate_voice: onnxruntime providers={providers}")
-    vocals = separate_stem(audio_path, "vocals", providers=providers)  # (channels, samples)
     vocals = np.asarray(vocals, dtype=np.float32)
     if vocals.ndim == 2:
         vocals = vocals.mean(axis=0)  # downmix to mono for the speaker encoder
@@ -211,17 +171,6 @@ def _isolate_voice_impl(audio_path: str) -> str:
     return out_path
-@spaces.GPU(duration=90)
-def isolate_voice(audio_path: str) -> str:
-    """Return a path to a cleaned WAV with background music/noise removed.
-    GPU-scoped so HT-Demucs runs on CUDA under ZeroGPU (was CPU-bound at ~180s).
-    Falls back to the original clip (and warns) if separation is unavailable
-    or fails, so cloning never hard-breaks on a cleanup error.
-    """
-    return _isolate_voice_impl(audio_path)
 def isolate_voice_ui(audio_path: str):
     """UI/endpoint wrapper: preview the cleaned reference (api_name=/isolate_voice)."""
     if not audio_path:
@@ -269,10 +218,7 @@ def _maybe_clean_reference(ref: str, clean_reference: bool) -> str:
     if not (clean_reference and ref):
         return ref
     try:
-        # Call the undecorated core: clone_and_speak is already @spaces.GPU, so
-        # CUDA is attached here. Re-entering isolate_voice (also GPU-decorated)
-        # would nest GPU allocations.
-        return _isolate_voice_impl(ref)
     except Exception as e:  # noqa: BLE001
         gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
         return ref

 #
 # Fix: hold this lock for the ENTIRE set-reference -> generate-all-chunks
 # critical section so a request owns the model exclusively for its full
+# synthesis. The (CPU, GPU-budget-free) reference cleaning runs OUTSIDE the
+# lock to keep the exclusive window as short as possible.
 _MODEL_LOCK = threading.Lock()
 # ── Faithful-cloning defaults ────────────────────────────────────────────────
 # clean speech. Uses HT-Demucs (htdemucs_ft vocals stem, #1 open-source vocal
 # SDR) via the pure-numpy + onnxruntime `demucs-onnx` package — no torch/
 # torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
+# Runs on CPU so it does NOT consume the ZeroGPU budget. Designed as the first
+# member of a future "audio cleanup" feature group (denoise, trim, normalize…).
 _SEPARATOR_READY = None
     return _SEPARATOR_READY or None
+def isolate_voice(audio_path: str) -> str:
+    """Return a path to a cleaned WAV with background music/noise removed.
+    Falls back to the original clip (and warns) if separation is unavailable
+    or fails, so cloning never hard-breaks on a cleanup error.
     """
     if not audio_path:
         return audio_path
     except Exception:  # noqa: BLE001
         sr = 44100
+    # htdemucs_ft vocals specialist (CPU keeps this off the ZeroGPU budget).
+    vocals = separate_stem(audio_path, "vocals", providers="cpu")  # (channels, samples)
     vocals = np.asarray(vocals, dtype=np.float32)
     if vocals.ndim == 2:
         vocals = vocals.mean(axis=0)  # downmix to mono for the speaker encoder
     return out_path
 def isolate_voice_ui(audio_path: str):
     """UI/endpoint wrapper: preview the cleaned reference (api_name=/isolate_voice)."""
     if not audio_path:
     if not (clean_reference and ref):
         return ref
     try:
+        return isolate_voice(ref)
     except Exception as e:  # noqa: BLE001
         gr.Warning(f"Background-audio removal failed, using raw reference: {e}")
         return ref

requirements.txt CHANGED Viewed

@@ -20,15 +20,6 @@ safetensors
 # Pinned to 0.3.3 (last release supporting the Space's Python 3.10; 0.3.4 needs 3.11+).
 demucs-onnx==0.3.3
-# GPU execution provider for ONNX Runtime so HT-Demucs reference-cleaning runs on
-# the ZeroGPU CUDA device (was CPU-bound at ~180s, tripping the bot's voice
-# timeouts). demucs-onnx requires the CPU `onnxruntime`; this adds the CUDA EP
-# alongside it. 1.20.1 = last line with cp310 wheels + CUDA 12 / cuDNN 9 default
-# (matches the ZeroGPU A10G torch stack) and exposes onnxruntime.preload_dlls().
-# Selection is runtime-guarded with a CPU fallback (see app.py _separation_providers),
-# so a missing/mismatched CUDA EP degrades to CPU instead of breaking the Space.
-onnxruntime-gpu==1.20.1
 # Optional language-specific normalizers (disabled for build reliability — English-first prototype).
 # Re-enable only if you need advanced zh / ja / ru text normalization:
 # spacy_pkuseg          # Chinese text segmentation

 # Pinned to 0.3.3 (last release supporting the Space's Python 3.10; 0.3.4 needs 3.11+).
 demucs-onnx==0.3.3
 # Optional language-specific normalizers (disabled for build reliability — English-first prototype).
 # Re-enable only if you need advanced zh / ja / ru text normalization:
 # spacy_pkuseg          # Chinese text segmentation