peterlllmm committed on
Commit
3ff6b5d
·
verified ·
1 Parent(s): d4020c8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +291 -81
app.py CHANGED
@@ -1,211 +1,421 @@
1
  import nltk
2
- nltk.download("punkt", quiet=True)
 
 
 
3
 
4
  import random
 
5
  import numpy as np
 
6
  import torch
 
7
  import io
 
8
  import os
 
9
  import soundfile as sf
 
10
  from nltk.tokenize import sent_tokenize
11
- from pydub import AudioSegment, silence
 
 
12
  import gradio as gr
13
 
 
 
14
  from chatterbox.src.chatterbox.tts import ChatterboxTTS
15
 
 
 
16
  # ===============================
 
17
  # DEVICE
 
18
  # ===============================
 
19
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
20
- print(f"🚀 Running on device: {DEVICE}")
 
 
 
21
 
22
  # ===============================
 
23
  # LOAD MODEL ONCE
 
24
  # ===============================
 
25
  MODEL = None
26
 
 
 
27
  def get_model():
 
28
  global MODEL
 
29
  if MODEL is None:
 
30
  print("Loading Chatterbox model...")
 
31
  MODEL = ChatterboxTTS.from_pretrained(DEVICE)
32
- if hasattr(MODEL, "to") and str(MODEL.device) != DEVICE:
 
 
33
  MODEL.to(DEVICE)
34
- print("✅ Model ready.")
 
 
35
  return MODEL
36
 
 
 
37
  get_model()
38
 
 
 
39
  # ===============================
 
40
  # SEED
 
41
  # ===============================
 
42
  def set_seed(seed):
 
43
  torch.manual_seed(seed)
 
44
  if DEVICE == "cuda":
45
- torch.cuda.manual_seed(seed)
46
  torch.cuda.manual_seed_all(seed)
 
47
  random.seed(seed)
 
48
  np.random.seed(seed)
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  # ===============================
51
- # UNLIMITED CHUNKING SETTINGS
 
 
52
  # ===============================
53
- MAX_CHARS = 250
54
- SILENCE_MS = 350 # 350ms breath between chunks to help slow the pacing naturally
55
- CROSSFADE_MS = 25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  # ===============================
 
58
  # MAIN TTS FUNCTION
 
59
  # ===============================
 
60
  def generate_tts(
 
61
  text,
 
62
  ref_audio=None,
63
- exaggeration=0.5,
64
- temperature=0.8,
 
 
 
65
  seed=0,
66
- cfg_weight=0.5,
67
- vad_trim=True
 
68
  ):
69
 
 
 
70
  model = get_model()
71
 
 
 
72
  if seed != 0:
 
73
  set_seed(int(seed))
74
 
75
- # --------------------------------
76
- # HF Official Kwargs (Speed parameter removed to stop crash)
77
- # --------------------------------
78
- generate_kwargs = {
79
  "exaggeration": exaggeration,
 
80
  "temperature": temperature,
 
81
  "cfg_weight": cfg_weight,
 
82
  }
83
 
 
 
 
 
 
 
 
 
84
  temp_prompt = None
 
85
  if ref_audio:
 
86
  try:
 
87
  audio = AudioSegment.from_file(ref_audio)
88
-
89
- # MANUAL REF VAD TRIMMING
90
- if vad_trim:
91
- print("✂️ Sanitizing reference audio...")
92
- non_silent_ranges = silence.detect_nonsilent(audio, min_silence_len=100, silence_thresh=-45)
93
- if non_silent_ranges:
94
- start_trim = non_silent_ranges[0][0]
95
- end_trim = non_silent_ranges[-1][1]
96
- audio = audio[start_trim:end_trim]
97
 
98
  temp_prompt = "voice_prompt.wav"
 
99
  audio.export(temp_prompt, format="wav")
100
- generate_kwargs["audio_prompt_path"] = temp_prompt
101
- except Exception as e:
102
- print(f"⚠️ Reference audio failed: {e} — using default voice.")
 
 
 
 
 
103
 
104
  # --------------------------------
 
105
  # Sentence chunking
 
106
  # --------------------------------
 
107
  sentences = sent_tokenize(text)
108
 
 
 
109
  chunks = []
 
110
  current = ""
111
 
 
 
112
  for s in sentences:
 
113
  if len(current) + len(s) < MAX_CHARS:
 
114
  current += " " + s
 
115
  else:
116
- if current.strip():
117
- chunks.append(current.strip())
 
118
  current = s
119
 
 
 
120
  if current.strip():
 
121
  chunks.append(current.strip())
122
 
123
- print(f"\n📝 Total unlimited chunks: {len(chunks)}")
 
 
 
 
124
 
125
  # --------------------------------
 
126
  # Generate audio per chunk
 
127
  # --------------------------------
 
128
  final_audio = AudioSegment.empty()
 
129
  clean_pause = AudioSegment.silent(duration=SILENCE_MS)
130
 
 
 
131
  for i, chunk in enumerate(chunks):
132
- print(f"➡️ Generating chunk [{i+1}/{len(chunks)}]: {chunk[:50]}...")
133
 
134
- wav = model.generate(chunk, **generate_kwargs)
 
 
 
 
 
 
 
135
  wav_np = wav.squeeze(0).cpu().numpy()
136
 
 
 
137
  buffer = io.BytesIO()
 
138
  sf.write(buffer, wav_np, model.sr, format="WAV")
 
139
  buffer.seek(0)
140
 
 
 
141
  segment = AudioSegment.from_wav(buffer)
142
 
143
- if vad_trim:
144
- out_silent = silence.detect_nonsilent(segment, min_silence_len=100, silence_thresh=-45)
145
- if out_silent:
146
- segment = segment[:out_silent[-1][1] + 50]
147
 
148
- if len(final_audio) > 0:
149
- final_audio = final_audio.append(segment, crossfade=CROSSFADE_MS)
150
- else:
151
- final_audio = segment
152
-
153
- final_audio += clean_pause
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  # --------------------------------
 
156
  # Export
 
157
  # --------------------------------
158
- output_path = "story_voice_clean.mp3"
 
 
159
  final_audio.export(output_path, format="mp3", bitrate="192k")
160
 
 
 
161
  if temp_prompt and os.path.exists(temp_prompt):
 
162
  os.remove(temp_prompt)
163
 
164
- print(f"✅ Success! Audio saved to {output_path}")
 
165
  return output_path
166
 
 
 
167
  # ===============================
 
168
  # GRADIO UI
 
169
  # ===============================
170
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
171
- gr.Markdown("## 🎙️ Chatterbox TTS (Stable Official Backend)")
172
-
173
- with gr.Row():
174
- with gr.Column():
175
- text = gr.Textbox(
176
- label="Story Text",
177
- lines=10,
178
- placeholder="Paste your script here. To slow down the pacing, use ellipses (...) and extra commas."
179
- )
180
-
181
- ref = gr.Audio(
182
- sources=["upload", "microphone"],
183
- type="filepath",
184
- label="Reference Voice (Golden 3-5s clip is best)"
185
- )
186
-
187
- with gr.Accordion("⚙️ Engine Settings (Synced to HF Defaults)", open=True):
188
- exaggeration = gr.Slider(0.25, 2.0, value=0.5, step=0.05, label="Exaggeration (Neutral = 0.5)")
189
- cfg = gr.Slider(0.2, 1.0, value=0.5, step=0.05, label="CFG / Pace Weight")
190
- temperature = gr.Slider(0.05, 5.0, value=0.8, step=0.05, label="Temperature")
191
-
192
- with gr.Row():
193
- vad_toggle = gr.Checkbox(value=True, label="Ref VAD Trimming (Kills artifacts)")
194
- seed = gr.Number(value=0, label="Seed (0 = random)")
195
-
196
- btn = gr.Button("⚡ Generate Unlimited Voice", variant="primary")
197
-
198
- with gr.Column():
199
- out = gr.Audio(label="Final Merged Audio")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
  btn.click(
 
202
  fn=generate_tts,
203
- inputs=[text, ref, exaggeration, temperature, seed, cfg, vad_toggle],
 
 
204
  outputs=out
 
205
  )
206
 
207
- print("\n" + "=" * 60)
208
- print("🔗 Launching Chatterbox Stable...")
209
- print("=" * 60 + "\n")
210
 
211
  demo.launch(share=True)
 
1
  import nltk
2
+
3
+ nltk.download("punkt")
4
+
5
+
6
 
7
  import random
8
+
9
  import numpy as np
10
+
11
  import torch
12
+
13
  import io
14
+
15
  import os
16
+
17
  import soundfile as sf
18
+
19
  from nltk.tokenize import sent_tokenize
20
+
21
+ from pydub import AudioSegment, silence # Added silence module
22
+
23
  import gradio as gr
24
 
25
+
26
+
27
  from chatterbox.src.chatterbox.tts import ChatterboxTTS
28
 
29
+
30
+
31
  # ===============================
32
+
33
  # DEVICE
34
+
35
  # ===============================
36
+
37
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
38
+
39
+ print(f"Running on: {DEVICE}")
40
+
41
+
42
 
43
  # ===============================
44
+
45
  # LOAD MODEL ONCE
46
+
47
  # ===============================
48
+
49
  MODEL = None
50
 
51
+
52
def get_model():
    """Return the process-wide ChatterboxTTS instance, loading it on first use.

    The loaded model is cached in the module-level ``MODEL`` global, so
    ``ChatterboxTTS.from_pretrained`` runs at most once per process.
    """
    global MODEL

    # Fast path: the model has already been loaded by an earlier call.
    if MODEL is not None:
        return MODEL

    print("Loading Chatterbox model...")
    MODEL = ChatterboxTTS.from_pretrained(DEVICE)

    # Only call ``.to`` when the loaded object actually provides it.
    if hasattr(MODEL, "to"):
        MODEL.to(DEVICE)

    print("Model ready.")
    return MODEL
70
 
71
+
72
+
73
  get_model()
74
 
75
+
76
+
77
  # ===============================
78
+
79
  # SEED
80
+
81
  # ===============================
82
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # CUDA generators are seeded only when the app runs on a GPU.
    if DEVICE == "cuda":
        torch.cuda.manual_seed_all(seed)
94
 
95
+
96
+
97
+ # ===============================
98
+
99
+ # PODCAST SAFE SETTINGS
100
+
101
+ # ===============================
102
+
103
+ MAX_CHARS = 220
104
+
105
+ SILENCE_MS = 250 # Reduced slightly since we are cleaning audio
106
+
107
+ FADE_IN = 10 # Reduced fade to avoid eating words
108
+
109
+ FADE_OUT = 10 # Reduced fade to avoid weird half-breath sounds
110
+
111
+
112
+
113
  # ===============================
114
+
115
+ # HELPER: TRIM SILENCE/BREATHS
116
+
117
  # ===============================
118
+
119
def trim_audio_segment(audio_segment, silence_thresh=-40, min_silence_len=100):
    """Trim leading and trailing silence (or quiet breaths) from a segment.

    Args:
        audio_segment: The pydub ``AudioSegment`` to clean up.
        silence_thresh: Loudness threshold in dBFS; anything quieter counts
            as silence. Raise it (closer to 0) to trim more aggressively,
            lower it if real words are being cut off.
        min_silence_len: Value forwarded to ``silence.detect_nonsilent`` as
            its ``min_silence_len`` argument. Defaults to 100, the value
            that was previously hard-coded.

    Returns:
        The slice of ``audio_segment`` from the start of the first
        non-silent range to the end of the last one, or an empty
        ``AudioSegment`` when no non-silent range is detected.
    """
    non_silent_ranges = silence.detect_nonsilent(
        audio_segment,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )

    # Nothing audible at all: hand back an empty segment so callers can skip it.
    if not non_silent_ranges:
        return AudioSegment.empty()

    first_sound_start = non_silent_ranges[0][0]
    last_sound_end = non_silent_ranges[-1][1]
    return audio_segment[first_sound_start:last_sound_end]
160
+
161
+
162
 
163
  # ===============================
164
+
165
  # MAIN TTS FUNCTION
166
+
167
  # ===============================
168
def _pack_sentences(sentences, max_chars):
    """Greedily merge consecutive sentences into chunks under ``max_chars``."""
    chunks = []
    current = ""
    for sentence in sentences:
        if len(current) + len(sentence) < max_chars:
            current += " " + sentence
        else:
            # Guard against flushing an empty buffer, which happens when the
            # very first sentence is already max_chars or longer.
            if current.strip():
                chunks.append(current.strip())
            current = sentence

    if current.strip():
        chunks.append(current.strip())
    return chunks


def generate_tts(
    text,
    ref_audio=None,
    exaggeration=0.4,
    temperature=0.7,
    seed=0,
    cfg_weight=0.6,
):
    """Synthesize ``text`` chunk by chunk and return the path of the merged MP3.

    Args:
        text: The story text to speak. It is split into sentences and packed
            into chunks shorter than ``MAX_CHARS``.
        ref_audio: Optional path to a reference voice recording. When it
            cannot be read or converted, the default voice is used instead.
        exaggeration: Passed to ``model.generate`` as ``exaggeration``.
        temperature: Passed to ``model.generate`` as ``temperature``.
        seed: Seed for the random number generators; ``0`` or an empty
            value leaves them unseeded.
        cfg_weight: Passed to ``model.generate`` as ``cfg_weight``.

    Returns:
        The path of the exported MP3 file (``"story_voice.mp3"``).

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    import tempfile

    # Fail early with a UI-visible message instead of exporting an empty file.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    model = get_model()

    # A cleared Gradio number field arrives as None; treat it like 0.
    if seed:
        set_seed(int(seed))

    kwargs = {
        "exaggeration": exaggeration,
        "temperature": temperature,
        "cfg_weight": cfg_weight,
    }

    temp_prompt = None
    try:
        # --------------------------------
        # Handle reference voice
        # --------------------------------
        if ref_audio:
            try:
                audio = AudioSegment.from_file(ref_audio)
                # A unique temp file per call keeps concurrent requests from
                # overwriting each other's voice prompt.
                fd, temp_prompt = tempfile.mkstemp(suffix=".wav")
                os.close(fd)
                audio.export(temp_prompt, format="wav")
                kwargs["audio_prompt_path"] = temp_prompt
            except Exception as exc:
                # Best effort: keep going with the default voice, but say why.
                print(f"Reference audio failed ({exc}) — using default voice.")

        # --------------------------------
        # Sentence chunking
        # --------------------------------
        chunks = _pack_sentences(sent_tokenize(text), MAX_CHARS)
        print(f"Total chunks: {len(chunks)}")

        # --------------------------------
        # Generate audio per chunk
        # --------------------------------
        final_audio = AudioSegment.empty()
        clean_pause = AudioSegment.silent(duration=SILENCE_MS)

        for i, chunk in enumerate(chunks):
            print(f"Generating chunk {i+1}/{len(chunks)}")

            # 1. Generate raw audio and round-trip it through an in-memory
            #    WAV so pydub can load it.
            wav = model.generate(chunk, **kwargs)
            wav_np = wav.squeeze(0).cpu().numpy()

            buffer = io.BytesIO()
            sf.write(buffer, wav_np, model.sr, format="WAV")
            buffer.seek(0)
            segment = AudioSegment.from_wav(buffer)

            # 2. Strip trailing breaths/silence from the model output BEFORE
            #    adding our own clean pause.
            segment = trim_audio_segment(segment, silence_thresh=-45)

            # 3. Apply a light fade only after trimming; chunks that trimmed
            #    down to nothing are skipped entirely.
            if len(segment) > 0:
                segment = segment.fade_in(FADE_IN).fade_out(FADE_OUT)
                final_audio += segment + clean_pause

        # --------------------------------
        # Export
        # --------------------------------
        output_path = "story_voice.mp3"
        final_audio.export(output_path, format="mp3", bitrate="192k")
    finally:
        # Remove the temporary voice prompt even when generation fails.
        if temp_prompt and os.path.exists(temp_prompt):
            os.remove(temp_prompt)

    return output_path
352
 
353
+
354
+
355
  # ===============================
356
+
357
  # GRADIO UI
358
+
359
  # ===============================
360
+
361
# Build the single-page UI: text + optional reference voice in, merged audio out.
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Storyteller / Podcast Chatterbox TTS (Cleaned)")

    text = gr.Textbox(
        placeholder="Paste your full story here...",
        lines=12,
        label="Story Text",
    )

    ref = gr.Audio(
        label="Reference Voice (optional)",
        type="filepath",
        sources=["upload", "microphone"],
    )

    # Generation controls; the slider defaults match generate_tts's defaults.
    exaggeration = gr.Slider(
        minimum=0.25, maximum=1.0, value=0.4, step=0.05, label="Emotion"
    )
    temperature = gr.Slider(
        minimum=0.3, maximum=1.2, value=0.7, step=0.05, label="Variation"
    )
    cfg = gr.Slider(
        minimum=0.3, maximum=1.0, value=0.6, step=0.05, label="Voice Stability"
    )

    seed = gr.Number(value=0, label="Seed (0 = random)")

    btn = gr.Button(value="Generate Voice")
    out = gr.Audio(label="Final Audio")

    # Order must match generate_tts's positional parameters.
    tts_inputs = [text, ref, exaggeration, temperature, seed, cfg]
    btn.click(fn=generate_tts, inputs=tts_inputs, outputs=out)

demo.launch(share=True)