Spaces:

ZeroPointMonkey
/

voice-clone-bench

Paused

App Files Community

ZeroPointMonkey Cursor committed 6 days ago

Commit

7dbfffd

1 Parent(s): 751f97c

perf(stopgap): trim reference to 10s before Demucs cleaning to bound CPU time

Browse files

Files changed (1) hide show

app.py +36 -1

app.py CHANGED Viewed

@@ -120,7 +120,15 @@ def set_seed(seed: int):
 # torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
 # Runs on CPU so it does NOT consume the ZeroGPU budget. Designed as the first
 # member of a future "audio cleanup" feature group (denoise, trim, normalize…).
 _SEPARATOR_READY = None
 def _ensure_separator():
@@ -153,8 +161,35 @@ def isolate_voice(audio_path: str) -> str:
     except Exception:  # noqa: BLE001
         sr = 44100
     # htdemucs_ft vocals specialist (CPU keeps this off the ZeroGPU budget).
-    vocals = separate_stem(audio_path, "vocals", providers="cpu")  # (channels, samples)
     vocals = np.asarray(vocals, dtype=np.float32)
     if vocals.ndim == 2:
         vocals = vocals.mean(axis=0)  # downmix to mono for the speaker encoder

 # torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
 # Runs on CPU so it does NOT consume the ZeroGPU budget. Designed as the first
 # member of a future "audio cleanup" feature group (denoise, trim, normalize…).
+#
+# STOPGAP — bound CPU separation time. demucs-onnx runtime scales ~linearly with
+# clip length; on long references it ran ~180s and blew the bot's voice timeout.
+# Speaker conditioning only needs a few seconds of clean speech, so we trim the
+# reference to a short leading slice BEFORE separation. This caps CPU work to
+# ~30-40s regardless of input length while keeping clone quality (the conditioner
+# never used more than the leading seconds anyway).
 _SEPARATOR_READY = None
+_CLEAN_TRIM_SECONDS = 10.0
 def _ensure_separator():
     except Exception:  # noqa: BLE001
         sr = 44100
+    # STOPGAP: trim long references to a short leading slice so CPU separation
+    # time is bounded (Demucs runtime ~linear in clip length). The speaker
+    # conditioner only needs a few seconds of clean speech. We separate the
+    # trimmed slice; if anything in the trim path fails we fall back to the
+    # full clip so cleaning never hard-breaks.
+    sep_input = audio_path
+    trim_path = None
+    try:
+        info = sf.info(audio_path)
+        max_frames = int(_CLEAN_TRIM_SECONDS * info.samplerate)
+        if info.frames > max_frames:
+            data, file_sr = sf.read(audio_path, frames=max_frames, dtype="float32")
+            trim_path = os.path.join(tempfile.gettempdir(), f"cleantrim_{uuid.uuid4().hex}.wav")
+            sf.write(trim_path, data, file_sr)
+            sep_input = trim_path
+            print(f"Trimmed reference for cleaning: {info.frames/info.samplerate:.1f}s -> {_CLEAN_TRIM_SECONDS:.1f}s")
+    except Exception as e:  # noqa: BLE001
+        print(f"WARNING: reference trim failed, separating full clip: {e}")
+        sep_input = audio_path
     # htdemucs_ft vocals specialist (CPU keeps this off the ZeroGPU budget).
+    try:
+        vocals = separate_stem(sep_input, "vocals", providers="cpu")  # (channels, samples)
+    finally:
+        if trim_path and os.path.exists(trim_path):
+            try:
+                os.remove(trim_path)
+            except OSError:
+                pass
     vocals = np.asarray(vocals, dtype=np.float32)
     if vocals.ndim == 2:
         vocals = vocals.mean(axis=0)  # downmix to mono for the speaker encoder