ZeroPointMonkey Cursor committed on
Commit
7dbfffd
·
1 Parent(s): 751f97c

perf(stopgap): trim reference to 10s before Demucs cleaning to bound CPU time

Browse files
Files changed (1) hide show
  1. app.py +36 -1
app.py CHANGED
@@ -120,7 +120,15 @@ def set_seed(seed: int):
120
  # torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
121
  # Runs on CPU so it does NOT consume the ZeroGPU budget. Designed as the first
122
  # member of a future "audio cleanup" feature group (denoise, trim, normalize…).
 
 
 
 
 
 
 
123
  _SEPARATOR_READY = None
 
124
 
125
 
126
  def _ensure_separator():
@@ -153,8 +161,35 @@ def isolate_voice(audio_path: str) -> str:
153
  except Exception: # noqa: BLE001
154
  sr = 44100
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  # htdemucs_ft vocals specialist (CPU keeps this off the ZeroGPU budget).
157
- vocals = separate_stem(audio_path, "vocals", providers="cpu") # (channels, samples)
 
 
 
 
 
 
 
158
  vocals = np.asarray(vocals, dtype=np.float32)
159
  if vocals.ndim == 2:
160
  vocals = vocals.mean(axis=0) # downmix to mono for the speaker encoder
 
120
  # torchaudio dependency, so it can't disturb the pinned Chatterbox stack.
121
  # Runs on CPU so it does NOT consume the ZeroGPU budget. Designed as the first
122
  # member of a future "audio cleanup" feature group (denoise, trim, normalize…).
123
+ #
124
+ # STOPGAP — bound CPU separation time. demucs-onnx runtime scales ~linearly with
125
+ # clip length; on long references it ran ~180s and blew the bot's voice timeout.
126
+ # Speaker conditioning only needs a few seconds of clean speech, so we trim the
127
+ # reference to a short leading slice BEFORE separation. This caps CPU work to
128
+ # ~30-40s regardless of input length while keeping clone quality (the conditioner
129
+ # never used more than the leading seconds anyway).
130
  _SEPARATOR_READY = None
131
+ _CLEAN_TRIM_SECONDS = 10.0
132
 
133
 
134
  def _ensure_separator():
 
161
  except Exception: # noqa: BLE001
162
  sr = 44100
163
 
164
+ # STOPGAP: trim long references to a short leading slice so CPU separation
165
+ # time is bounded (Demucs runtime ~linear in clip length). The speaker
166
+ # conditioner only needs a few seconds of clean speech. We separate the
167
+ # trimmed slice; if anything in the trim path fails we fall back to the
168
+ # full clip so cleaning never hard-breaks.
169
+ sep_input = audio_path
170
+ trim_path = None
171
+ try:
172
+ info = sf.info(audio_path)
173
+ max_frames = int(_CLEAN_TRIM_SECONDS * info.samplerate)
174
+ if info.frames > max_frames:
175
+ data, file_sr = sf.read(audio_path, frames=max_frames, dtype="float32")
176
+ trim_path = os.path.join(tempfile.gettempdir(), f"cleantrim_{uuid.uuid4().hex}.wav")
177
+ sf.write(trim_path, data, file_sr)
178
+ sep_input = trim_path
179
+ print(f"Trimmed reference for cleaning: {info.frames/info.samplerate:.1f}s -> {_CLEAN_TRIM_SECONDS:.1f}s")
180
+ except Exception as e: # noqa: BLE001
181
+ print(f"WARNING: reference trim failed, separating full clip: {e}")
182
+ sep_input = audio_path
183
+
184
  # htdemucs_ft vocals specialist (CPU keeps this off the ZeroGPU budget).
185
+ try:
186
+ vocals = separate_stem(sep_input, "vocals", providers="cpu") # (channels, samples)
187
+ finally:
188
+ if trim_path and os.path.exists(trim_path):
189
+ try:
190
+ os.remove(trim_path)
191
+ except OSError:
192
+ pass
193
  vocals = np.asarray(vocals, dtype=np.float32)
194
  if vocals.ndim == 2:
195
  vocals = vocals.mean(axis=0) # downmix to mono for the speaker encoder