Post-process TTS: gentle RMS boost; drop vocoder edge trim
Browse files
app.py
CHANGED
|
@@ -526,9 +526,6 @@ class BlueTTS:
|
|
| 526 |
xt, *_ = self.vector_est_ort.run(None, cond)
|
| 527 |
|
| 528 |
wav, *_ = self.vocoder_ort.run(None, {self._vocoder_input_name: xt})
|
| 529 |
-
frame_len = self.base_chunk_size * self.chunk_compress_factor
|
| 530 |
-
if wav.shape[-1] > 2 * frame_len:
|
| 531 |
-
wav = wav[..., frame_len:-frame_len]
|
| 532 |
if wav.ndim == 3 and wav.shape[1] == 1:
|
| 533 |
wav = wav[:, 0, :]
|
| 534 |
return wav, dur
|
|
@@ -608,6 +605,29 @@ def expand_numbers(text: str, lang: str = "en") -> str:
|
|
| 608 |
return text
|
| 609 |
|
| 610 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
# Cache of styles derived from uploaded reference WAVs, keyed by file hash.
|
| 612 |
_REF_WAV_CACHE: Dict[str, Style] = {}
|
| 613 |
|
|
@@ -740,7 +760,7 @@ def synthesize_text(text: str, voice: str, lang: str, steps: int, speed: float,
|
|
| 740 |
expand_numbers(text, lang=lang), lang=lang, style=style,
|
| 741 |
total_step=int(steps), speed=float(speed), cfg_scale=float(cfg_scale),
|
| 742 |
)
|
| 743 |
-
wav = np.asarray(wav).squeeze()
|
| 744 |
proc_time = time.time() - t0
|
| 745 |
audio_dur = len(wav) / sr if len(wav) > 0 else 0.0
|
| 746 |
rtf = proc_time / audio_dur if audio_dur > 0 else 0
|
|
|
|
| 526 |
xt, *_ = self.vector_est_ort.run(None, cond)
|
| 527 |
|
| 528 |
wav, *_ = self.vocoder_ort.run(None, {self._vocoder_input_name: xt})
|
|
|
|
|
|
|
|
|
|
| 529 |
if wav.ndim == 3 and wav.shape[1] == 1:
|
| 530 |
wav = wav[:, 0, :]
|
| 531 |
return wav, dur
|
|
|
|
| 605 |
return text
|
| 606 |
|
| 607 |
|
| 608 |
+
def normalize_generated_audio(
    wav: np.ndarray,
    target_rms: float = 0.08,
    peak_limit: float = 0.95,
    max_gain: float = 4.0,
) -> np.ndarray:
    """Gently lift quiet generations while leaving normal/loud audio unclipped.

    The signal is only ever boosted, never attenuated. The applied gain is
    the smallest of three candidates: the gain that brings the RMS of the
    "active" samples up to ``target_rms``, the gain that brings the absolute
    peak up to ``peak_limit``, and ``max_gain``.

    Args:
        wav: Audio samples; converted to a ``float32`` array.
        target_rms: RMS level the active samples are lifted towards.
        peak_limit: Ceiling for the absolute peak after the boost.
        max_gain: Hard cap on the boost factor, so a very quiet/bad
            generation does not become harsh or noisy.

    Returns:
        A ``float32`` array. The input is returned without scaling when it
        is empty, contains non-finite values, is effectively silent
        (peak or RMS below 1e-6), or when the computed gain is <= 1.0.
    """
    wav = np.asarray(wav, dtype=np.float32)
    if wav.size == 0 or not np.isfinite(wav).all():
        return wav

    # Computed once; reused for both the peak and the activity mask.
    magnitude = np.abs(wav)
    peak = float(magnitude.max())
    if peak < 1e-6:
        return wav

    # Measure RMS only over samples above 2% of the peak (and above an
    # absolute floor of 1e-4); fall back to the full signal if the mask
    # selects nothing.
    active = magnitude > max(peak * 0.02, 1e-4)
    samples = wav[active] if active.any() else wav
    rms = float(np.sqrt(np.mean(np.square(samples))))
    if rms < 1e-6:
        return wav

    # Cap boost so a very quiet/bad generation does not become harsh or noisy.
    gain = min(target_rms / rms, peak_limit / peak, max_gain)
    if gain <= 1.0:
        return wav
    return (wav * gain).astype(np.float32)
|
| 629 |
+
|
| 630 |
+
|
| 631 |
# Cache of styles derived from uploaded reference WAVs, keyed by file hash.
|
| 632 |
_REF_WAV_CACHE: Dict[str, Style] = {}
|
| 633 |
|
|
|
|
| 760 |
expand_numbers(text, lang=lang), lang=lang, style=style,
|
| 761 |
total_step=int(steps), speed=float(speed), cfg_scale=float(cfg_scale),
|
| 762 |
)
|
| 763 |
+
wav = normalize_generated_audio(np.asarray(wav).squeeze())
|
| 764 |
proc_time = time.time() - t0
|
| 765 |
audio_dur = len(wav) / sr if len(wav) > 0 else 0.0
|
| 766 |
rtf = proc_time / audio_dur if audio_dur > 0 else 0
|