BlueV2

Running

App Files Files Community

notmax123 committed on Apr 25

Commit

12a2ba3

1 Parent(s): cdaa0f4

Post-process TTS: gentle RMS boost; drop vocoder edge trim

Browse files

Files changed (1) hide show

app.py +24 -4

app.py CHANGED Viewed

@@ -526,9 +526,6 @@ class BlueTTS:
                 xt, *_ = self.vector_est_ort.run(None, cond)
         wav, *_ = self.vocoder_ort.run(None, {self._vocoder_input_name: xt})
-        frame_len = self.base_chunk_size * self.chunk_compress_factor
-        if wav.shape[-1] > 2 * frame_len:
-            wav = wav[..., frame_len:-frame_len]
         if wav.ndim == 3 and wav.shape[1] == 1:
             wav = wav[:, 0, :]
         return wav, dur
@@ -608,6 +605,29 @@ def expand_numbers(text: str, lang: str = "en") -> str:
         return text
 # Cache of styles derived from uploaded reference WAVs, keyed by file hash.
 _REF_WAV_CACHE: Dict[str, Style] = {}
@@ -740,7 +760,7 @@ def synthesize_text(text: str, voice: str, lang: str, steps: int, speed: float,
         expand_numbers(text, lang=lang), lang=lang, style=style,
         total_step=int(steps), speed=float(speed), cfg_scale=float(cfg_scale),
     )
-    wav = np.asarray(wav).squeeze()
     proc_time = time.time() - t0
     audio_dur = len(wav) / sr if len(wav) > 0 else 0.0
     rtf = proc_time / audio_dur if audio_dur > 0 else 0

                 xt, *_ = self.vector_est_ort.run(None, cond)
         wav, *_ = self.vocoder_ort.run(None, {self._vocoder_input_name: xt})
         if wav.ndim == 3 and wav.shape[1] == 1:
             wav = wav[:, 0, :]
         return wav, dur
         return text
def normalize_generated_audio(
    wav: np.ndarray,
    target_rms: float = 0.08,
    peak_limit: float = 0.95,
    *,
    max_gain: float = 4.0,
    activity_ratio: float = 0.02,
) -> np.ndarray:
    """Gently lift quiet generations while leaving normal/loud audio unclipped.

    The input is converted to a float32 array. A single scalar gain is applied
    to the whole signal, and only when that gain is greater than 1; the
    function never attenuates.

    Args:
        wav: Audio samples of any shape (anything ``np.asarray`` accepts).
        target_rms: RMS level the active part of the signal is lifted toward.
        peak_limit: Largest absolute sample value the boosted signal may reach.
        max_gain: Upper bound on the applied gain, so a very quiet or bad
            generation does not become harsh or noisy.
        activity_ratio: Fraction of the peak below which samples are treated
            as silence and left out of the RMS measurement.

    Returns:
        A float32 array. The converted input is returned unscaled when it is
        empty, contains NaN/inf, is effectively silent (peak or RMS below
        1e-6), or would need a gain of 1.0 or less.
    """
    wav = np.asarray(wav, dtype=np.float32)
    if wav.size == 0 or not np.isfinite(wav).all():
        return wav

    magnitude = np.abs(wav)
    peak = float(magnitude.max())
    if peak < 1e-6:
        return wav

    # Measure loudness on non-silent samples only, so long pauses do not
    # drag the RMS down and trigger an excessive boost.
    active = magnitude > max(peak * activity_ratio, 1e-4)
    samples = wav[active] if active.any() else wav
    rms = float(np.sqrt(np.mean(np.square(samples))))
    if rms < 1e-6:
        return wav

    # The gain is the most conservative of three limits: reaching the target
    # RMS, keeping the peak under peak_limit, and the absolute boost cap.
    gain = min(target_rms / rms, peak_limit / peak, max_gain)
    if gain <= 1.0:
        return wav
    return (wav * gain).astype(np.float32)
 # Cache of styles derived from uploaded reference WAVs, keyed by file hash.
 _REF_WAV_CACHE: Dict[str, Style] = {}
         expand_numbers(text, lang=lang), lang=lang, style=style,
         total_step=int(steps), speed=float(speed), cfg_scale=float(cfg_scale),
     )
+    wav = normalize_generated_audio(np.asarray(wav).squeeze())
     proc_time = time.time() - t0
     audio_dur = len(wav) / sr if len(wav) > 0 else 0.0
     rtf = proc_time / audio_dur if audio_dur > 0 else 0