dinhthuan committed on
Commit
e7666b9
·
1 Parent(s): 90d8b97

fix: add fade-out/fade-in to prevent click artifacts at end of sentences

Browse files
Files changed (3) hide show
  1. .gitignore +2 -1
  2. requirements.txt +1 -0
  3. viterbox/tts.py +79 -3
.gitignore CHANGED
@@ -42,4 +42,5 @@ outputs/
42
  .cache/
43
 
44
  # MacOS
45
- *.DS_Store
 
 
42
  .cache/
43
 
44
  # MacOS
45
+ *.DS_Store
46
+ pretrained/
requirements.txt CHANGED
@@ -5,6 +5,7 @@ huggingface_hub>=0.20.0
5
  tokenizers>=0.15.0
6
  transformers==4.46.3
7
  librosa==0.11.0
 
8
  soundfile>=0.12.0
9
  numpy>=1.24.0,<1.26.0
10
  gradio==5.44.1
 
5
  tokenizers>=0.15.0
6
  transformers==4.46.3
7
  librosa==0.11.0
8
+ scipy>=1.10.0
9
  soundfile>=0.12.0
10
  numpy>=1.24.0,<1.26.0
11
  gradio==5.44.1
viterbox/tts.py CHANGED
@@ -87,6 +87,68 @@ def trim_silence(audio: np.ndarray, sr: int, top_db: int = 30) -> np.ndarray:
87
  return trimmed
88
 
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def crossfade_concat(audios: List[np.ndarray], sr: int, fade_ms: int = 50, pause_ms: int = 500) -> np.ndarray:
91
  """
92
  Concatenate audio segments with crossfading and optional pause between sentences.
@@ -225,7 +287,11 @@ class Viterbox:
225
 
226
  @classmethod
227
  def from_pretrained(cls, device: str = "cuda") -> 'Viterbox':
228
- """Load model from HuggingFace Hub"""
 
 
 
 
229
  ckpt_dir = Path(
230
  snapshot_download(
231
  repo_id=REPO_ID,
@@ -478,8 +544,14 @@ class Viterbox:
478
  repetition_penalty=repetition_penalty,
479
  )
480
 
481
- # Trim silence from each segment
482
- audio_np = trim_silence(audio_np, self.sr, top_db=30)
 
 
 
 
 
 
483
 
484
  if len(audio_np) > 0:
485
  audio_segments.append(audio_np)
@@ -487,6 +559,10 @@ class Viterbox:
487
  # Merge with crossfading and pause
488
  if audio_segments:
489
  merged = crossfade_concat(audio_segments, self.sr, fade_ms=crossfade_ms, pause_ms=sentence_pause_ms)
 
 
 
 
490
  return torch.from_numpy(merged).unsqueeze(0)
491
  else:
492
  return torch.zeros(1, self.sr) # 1 second of silence as fallback
 
87
  return trimmed
88
 
89
 
90
def apply_fade_out(audio: np.ndarray, sr: int, fade_duration: float = 0.01) -> np.ndarray:
    """
    Taper the tail of an audio signal with a linear gain ramp from 1.0 to 0.0.

    Used to avoid an audible click where a segment ends abruptly.

    Args:
        audio: Audio samples; the ramp is applied along the first axis.
        sr: Sample rate in Hz, used to convert ``fade_duration`` to samples.
        fade_duration: Length of the ramp in seconds (default 10ms).

    Returns:
        A faded copy of ``audio``. The input object itself is returned,
        unmodified, when it is empty or the ramp length is not positive.
    """
    total = len(audio)
    if total == 0:
        return audio

    # The ramp can never be longer than the signal itself.
    ramp_len = min(int(fade_duration * sr), total)
    if ramp_len < 1:
        return audio

    gain = np.linspace(1.0, 0.0, ramp_len)

    # Work on a copy so the caller's buffer is left untouched.
    faded = audio.copy()
    tail = slice(total - ramp_len, total)
    faded[tail] = faded[tail] * gain
    return faded
119
+
120
+
121
def apply_fade_in(audio: np.ndarray, sr: int, fade_duration: float = 0.005) -> np.ndarray:
    """
    Ramp the head of an audio signal with a linear gain from 0.0 up to 1.0.

    Used to avoid an audible click where a segment starts abruptly.

    Args:
        audio: Audio samples; the ramp is applied along the first axis.
        sr: Sample rate in Hz, used to convert ``fade_duration`` to samples.
        fade_duration: Length of the ramp in seconds (default 5ms).

    Returns:
        A faded copy of ``audio``. The input object itself is returned,
        unmodified, when it is empty or the ramp length is not positive.
    """
    total = len(audio)
    if total == 0:
        return audio

    # The ramp can never be longer than the signal itself.
    ramp_len = min(int(fade_duration * sr), total)
    if ramp_len < 1:
        return audio

    gain = np.linspace(0.0, 1.0, ramp_len)

    # Work on a copy so the caller's buffer is left untouched.
    faded = audio.copy()
    head = slice(0, ramp_len)
    faded[head] = faded[head] * gain
    return faded
150
+
151
+
152
  def crossfade_concat(audios: List[np.ndarray], sr: int, fade_ms: int = 50, pause_ms: int = 500) -> np.ndarray:
153
  """
154
  Concatenate audio segments with crossfading and optional pause between sentences.
 
287
 
288
  @classmethod
289
  def from_pretrained(cls, device: str = "cuda") -> 'Viterbox':
290
+ """Load model from HuggingFace Hub to local pretrained directory"""
291
+ # Download into the project's local pretrained/ directory
292
+ local_pretrained_dir = Path(__file__).parent.parent / "pretrained"
293
+ local_pretrained_dir.mkdir(parents=True, exist_ok=True)
294
+
295
  ckpt_dir = Path(
296
  snapshot_download(
297
  repo_id=REPO_ID,
 
544
  repetition_penalty=repetition_penalty,
545
  )
546
 
547
+ # Trim silence from each segment (use less aggressive threshold)
548
+ audio_np = trim_silence(audio_np, self.sr, top_db=20)
549
+
550
+ # Apply fade-out to prevent click at end of each segment
551
+ audio_np = apply_fade_out(audio_np, self.sr, fade_duration=0.01) # 10ms fade-out
552
+
553
+ # Apply fade-in to prevent click at start
554
+ audio_np = apply_fade_in(audio_np, self.sr, fade_duration=0.005) # 5ms fade-in
555
 
556
  if len(audio_np) > 0:
557
  audio_segments.append(audio_np)
 
559
  # Merge with crossfading and pause
560
  if audio_segments:
561
  merged = crossfade_concat(audio_segments, self.sr, fade_ms=crossfade_ms, pause_ms=sentence_pause_ms)
562
+
563
+ # Apply final fade-out to prevent click at very end
564
+ merged = apply_fade_out(merged, self.sr, fade_duration=0.015) # 15ms fade-out
565
+
566
  return torch.from_numpy(merged).unsqueeze(0)
567
  else:
568
  return torch.zeros(1, self.sr) # 1 second of silence as fallback