notmax123 committed on
Commit
12a2ba3
·
1 Parent(s): cdaa0f4

Post-process TTS: gentle RMS boost; drop vocoder edge trim

Browse files
Files changed (1) hide show
  1. app.py +24 -4
app.py CHANGED
@@ -526,9 +526,6 @@ class BlueTTS:
526
  xt, *_ = self.vector_est_ort.run(None, cond)
527
 
528
  wav, *_ = self.vocoder_ort.run(None, {self._vocoder_input_name: xt})
529
- frame_len = self.base_chunk_size * self.chunk_compress_factor
530
- if wav.shape[-1] > 2 * frame_len:
531
- wav = wav[..., frame_len:-frame_len]
532
  if wav.ndim == 3 and wav.shape[1] == 1:
533
  wav = wav[:, 0, :]
534
  return wav, dur
@@ -608,6 +605,29 @@ def expand_numbers(text: str, lang: str = "en") -> str:
608
  return text
609
 
610
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611
  # Cache of styles derived from uploaded reference WAVs, keyed by file hash.
612
  _REF_WAV_CACHE: Dict[str, Style] = {}
613
 
@@ -740,7 +760,7 @@ def synthesize_text(text: str, voice: str, lang: str, steps: int, speed: float,
740
  expand_numbers(text, lang=lang), lang=lang, style=style,
741
  total_step=int(steps), speed=float(speed), cfg_scale=float(cfg_scale),
742
  )
743
- wav = np.asarray(wav).squeeze()
744
  proc_time = time.time() - t0
745
  audio_dur = len(wav) / sr if len(wav) > 0 else 0.0
746
  rtf = proc_time / audio_dur if audio_dur > 0 else 0
 
526
  xt, *_ = self.vector_est_ort.run(None, cond)
527
 
528
  wav, *_ = self.vocoder_ort.run(None, {self._vocoder_input_name: xt})
 
 
 
529
  if wav.ndim == 3 and wav.shape[1] == 1:
530
  wav = wav[:, 0, :]
531
  return wav, dur
 
605
  return text
606
 
607
 
608
def normalize_generated_audio(
    wav: np.ndarray,
    target_rms: float = 0.08,
    peak_limit: float = 0.95,
    *,
    max_gain: float = 4.0,
) -> np.ndarray:
    """Gently lift quiet generations while leaving normal/loud audio unclipped.

    The input is converted to ``float32``. A single scalar gain is applied to
    the whole signal only when that gain is greater than 1; the function never
    attenuates.

    Args:
        wav: Audio samples (any shape; treated as one flat signal).
        target_rms: RMS level, measured over "active" samples only, that the
            boost aims for.
        peak_limit: Upper bound for the absolute peak after the boost.
        max_gain: Hard ceiling on the boost factor, so a very quiet or bad
            generation does not become harsh or noisy.

    Returns:
        A ``float32`` array. It is the (converted) input itself when the input
        is empty, contains non-finite values, is effectively silent, or would
        need a gain of 1.0 or less; otherwise a new scaled array.
    """
    wav = np.asarray(wav, dtype=np.float32)
    if wav.size == 0 or not np.isfinite(wav).all():
        return wav

    # Compute the magnitude once; it feeds both the peak and the activity mask.
    magnitude = np.abs(wav)
    peak = float(magnitude.max())
    if peak < 1e-6:
        return wav

    # Measure loudness only over samples above 2% of the peak (with a small
    # absolute floor) so pauses/silence do not drag the RMS estimate down.
    active = magnitude > max(peak * 0.02, 1e-4)
    samples = wav[active] if np.any(active) else wav
    rms = float(np.sqrt(np.mean(np.square(samples))))
    if rms < 1e-6:
        return wav

    # The smallest of the three limits wins: reach the target RMS, stay under
    # the peak limit, and never boost by more than ``max_gain``.
    gain = min(target_rms / rms, peak_limit / peak, max_gain)
    if gain <= 1.0:
        return wav
    return (wav * gain).astype(np.float32)
629
+
630
+
631
  # Cache of styles derived from uploaded reference WAVs, keyed by file hash.
632
  _REF_WAV_CACHE: Dict[str, Style] = {}
633
 
 
760
  expand_numbers(text, lang=lang), lang=lang, style=style,
761
  total_step=int(steps), speed=float(speed), cfg_scale=float(cfg_scale),
762
  )
763
+ wav = normalize_generated_audio(np.asarray(wav).squeeze())
764
  proc_time = time.time() - t0
765
  audio_dur = len(wav) / sr if len(wav) > 0 else 0.0
766
  rtf = proc_time / audio_dur if audio_dur > 0 else 0