Commit ·
7dbfffd
1
Parent(s): 751f97c
perf(stopgap): trim reference to 10s before Demucs cleaning to bound CPU time
Browse files
app.py
CHANGED
|
@@ -120,7 +120,15 @@ def set_seed(seed: int):
|
|
| 120 |
# torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
|
| 121 |
# Runs on CPU so it does NOT consume the ZeroGPU budget. Designed as the first
|
| 122 |
# member of a future "audio cleanup" feature group (denoise, trim, normalize…).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
_SEPARATOR_READY = None
|
|
|
|
| 124 |
|
| 125 |
|
| 126 |
def _ensure_separator():
|
|
@@ -153,8 +161,35 @@ def isolate_voice(audio_path: str) -> str:
|
|
| 153 |
except Exception: # noqa: BLE001
|
| 154 |
sr = 44100
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
# htdemucs_ft vocals specialist (CPU keeps this off the ZeroGPU budget).
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
vocals = np.asarray(vocals, dtype=np.float32)
|
| 159 |
if vocals.ndim == 2:
|
| 160 |
vocals = vocals.mean(axis=0) # downmix to mono for the speaker encoder
|
|
|
|
| 120 |
# torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
|
| 121 |
# Runs on CPU so it does NOT consume the ZeroGPU budget. Designed as the first
|
| 122 |
# member of a future "audio cleanup" feature group (denoise, trim, normalize…).
|
| 123 |
+
#
|
| 124 |
+
# STOPGAP — bound CPU separation time. demucs-onnx runtime scales ~linearly with
|
| 125 |
+
# clip length; on long references it ran ~180s and exceeded the bot's voice timeout.
|
| 126 |
+
# Speaker conditioning only needs a few seconds of clean speech, so we trim the
|
| 127 |
+
# reference to its first _CLEAN_TRIM_SECONDS seconds BEFORE separation. This caps CPU work to
|
| 128 |
+
# ~30-40s regardless of input length while keeping clone quality (the conditioner
|
| 129 |
+
# never used more than the leading seconds anyway).
|
| 130 |
_SEPARATOR_READY = None
|
| 131 |
+
_CLEAN_TRIM_SECONDS = 10.0
|
| 132 |
|
| 133 |
|
| 134 |
def _ensure_separator():
|
|
|
|
| 161 |
except Exception: # noqa: BLE001
|
| 162 |
sr = 44100
|
| 163 |
|
| 164 |
+
# STOPGAP: trim long references to a short leading slice so CPU separation
|
| 165 |
+
# time is bounded (Demucs runtime ~linear in clip length). The speaker
|
| 166 |
+
# conditioner only needs a few seconds of clean speech. We separate the
|
| 167 |
+
# trimmed slice; if anything in the trim path fails we fall back to the
|
| 168 |
+
# full clip so cleaning never hard-breaks.
|
| 169 |
+
sep_input = audio_path
|
| 170 |
+
trim_path = None
|
| 171 |
+
try:
|
| 172 |
+
info = sf.info(audio_path)
|
| 173 |
+
max_frames = int(_CLEAN_TRIM_SECONDS * info.samplerate)
|
| 174 |
+
if info.frames > max_frames:
|
| 175 |
+
data, file_sr = sf.read(audio_path, frames=max_frames, dtype="float32")
|
| 176 |
+
trim_path = os.path.join(tempfile.gettempdir(), f"cleantrim_{uuid.uuid4().hex}.wav")
|
| 177 |
+
sf.write(trim_path, data, file_sr)
|
| 178 |
+
sep_input = trim_path
|
| 179 |
+
print(f"Trimmed reference for cleaning: {info.frames/info.samplerate:.1f}s -> {_CLEAN_TRIM_SECONDS:.1f}s")
|
| 180 |
+
except Exception as e: # noqa: BLE001
|
| 181 |
+
print(f"WARNING: reference trim failed, separating full clip: {e}")
|
| 182 |
+
sep_input = audio_path
|
| 183 |
+
|
| 184 |
# htdemucs_ft vocals specialist (CPU keeps this off the ZeroGPU budget).
|
| 185 |
+
try:
|
| 186 |
+
vocals = separate_stem(sep_input, "vocals", providers="cpu") # (channels, samples)
|
| 187 |
+
finally:
|
| 188 |
+
if trim_path and os.path.exists(trim_path):
|
| 189 |
+
try:
|
| 190 |
+
os.remove(trim_path)
|
| 191 |
+
except OSError:
|
| 192 |
+
pass
|
| 193 |
vocals = np.asarray(vocals, dtype=np.float32)
|
| 194 |
if vocals.ndim == 2:
|
| 195 |
vocals = vocals.mean(axis=0) # downmix to mono for the speaker encoder
|