peterlllmm committed on
Commit
7f27076
·
verified ·
1 Parent(s): 8196470

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -10
app.py CHANGED
@@ -8,7 +8,7 @@ import io
8
  import os
9
  import soundfile as sf
10
  from nltk.tokenize import sent_tokenize
11
- from pydub import AudioSegment
12
  import gradio as gr
13
 
14
  from chatterbox.src.chatterbox.tts import ChatterboxTTS
@@ -49,10 +49,35 @@ def set_seed(seed):
49
  # ===============================
50
  # PODCAST SAFE SETTINGS
51
  # ===============================
52
- MAX_CHARS = 220 # stable for chatterbox
53
- SILENCE_MS = 350 # natural pause
54
- FADE_IN = 30
55
- FADE_OUT = 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  # ===============================
58
  # MAIN TTS FUNCTION
@@ -114,11 +139,12 @@ def generate_tts(
114
  # Generate audio per chunk
115
  # --------------------------------
116
  final_audio = AudioSegment.empty()
117
- silence = AudioSegment.silent(duration=SILENCE_MS)
118
 
119
  for i, chunk in enumerate(chunks):
120
  print(f"Generating chunk {i+1}/{len(chunks)}")
121
 
 
122
  wav = model.generate(chunk, **kwargs)
123
  wav_np = wav.squeeze(0).cpu().numpy()
124
 
@@ -127,9 +153,16 @@ def generate_tts(
127
  buffer.seek(0)
128
 
129
  segment = AudioSegment.from_wav(buffer)
130
- segment = segment.fade_in(FADE_IN).fade_out(FADE_OUT)
131
 
132
- final_audio += segment + silence
 
 
 
 
 
 
 
 
133
 
134
  # --------------------------------
135
  # Export
@@ -146,7 +179,7 @@ def generate_tts(
146
  # GRADIO UI
147
  # ===============================
148
  with gr.Blocks() as demo:
149
- gr.Markdown("## 🎙️ Storyteller / Podcast Chatterbox TTS")
150
 
151
  text = gr.Textbox(
152
  label="Story Text",
@@ -175,4 +208,4 @@ with gr.Blocks() as demo:
175
  outputs=out
176
  )
177
 
178
- demo.launch(share=True)
 
8
  import os
9
  import soundfile as sf
10
  from nltk.tokenize import sent_tokenize
11
+ from pydub import AudioSegment, silence # Added silence module
12
  import gradio as gr
13
 
14
  from chatterbox.src.chatterbox.tts import ChatterboxTTS
 
49
  # ===============================
50
  # PODCAST SAFE SETTINGS
51
  # ===============================
52
+ MAX_CHARS = 220
53
+ SILENCE_MS = 250 # Reduced slightly since we are cleaning audio
54
+ FADE_IN = 10 # Reduced fade to avoid eating words
55
+ FADE_OUT = 10 # Reduced fade to avoid weird half-breath sounds
56
+
57
+ # ===============================
58
+ # HELPER: TRIM SILENCE/BREATHS
59
+ # ===============================
60
+ def trim_audio_segment(audio_segment, silence_thresh=-40):
61
+ """
62
+ Trims silence or quiet breath sounds from the start and end of a chunk.
63
+ Adjust silence_thresh (dBFS) if it cuts off actual words.
64
+ """
65
+ # Detect non-silent chunks
66
+ non_silent_ranges = silence.detect_nonsilent(
67
+ audio_segment,
68
+ min_silence_len=100,
69
+ silence_thresh=silence_thresh
70
+ )
71
+
72
+ # If audio is completely silent or empty, return empty
73
+ if not non_silent_ranges:
74
+ return AudioSegment.empty()
75
+
76
+ # Get start of first sound and end of last sound
77
+ start_trim = non_silent_ranges[0][0]
78
+ end_trim = non_silent_ranges[-1][1]
79
+
80
+ return audio_segment[start_trim:end_trim]
81
 
82
  # ===============================
83
  # MAIN TTS FUNCTION
 
139
  # Generate audio per chunk
140
  # --------------------------------
141
  final_audio = AudioSegment.empty()
142
+ clean_pause = AudioSegment.silent(duration=SILENCE_MS)
143
 
144
  for i, chunk in enumerate(chunks):
145
  print(f"Generating chunk {i+1}/{len(chunks)}")
146
 
147
+ # 1. Generate Raw Audio
148
  wav = model.generate(chunk, **kwargs)
149
  wav_np = wav.squeeze(0).cpu().numpy()
150
 
 
153
  buffer.seek(0)
154
 
155
  segment = AudioSegment.from_wav(buffer)
 
156
 
157
+ # 2. TRIM ARTIFACTS (The Fix)
158
+ # We strip the "trailing breath" or silence from the model output
159
+ # BEFORE we add our own clean silence.
160
+ segment = trim_audio_segment(segment, silence_thresh=-45)
161
+
162
+ # 3. Apply light fade only after trimming
163
+ if len(segment) > 0:
164
+ segment = segment.fade_in(FADE_IN).fade_out(FADE_OUT)
165
+ final_audio += segment + clean_pause
166
 
167
  # --------------------------------
168
  # Export
 
179
  # GRADIO UI
180
  # ===============================
181
  with gr.Blocks() as demo:
182
+ gr.Markdown("## 🎙️ Storyteller / Podcast Chatterbox TTS (Cleaned)")
183
 
184
  text = gr.Textbox(
185
  label="Story Text",
 
208
  outputs=out
209
  )
210
 
211
+ demo.launch(share=True)