peterlllmm committed on
Commit
ecc440e
·
verified ·
1 Parent(s): 2492250

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -221
app.py CHANGED
@@ -1,24 +1,13 @@
1
- import random
2
  import numpy as np
3
  import torch
4
  from chatterbox.src.chatterbox.tts import ChatterboxTTS
5
  import gradio as gr
6
  import spaces
7
- import re
8
- from typing import List, Tuple
9
 
10
- # Force CPU usage and patch torch.load to handle CUDA tensors on CPU
11
- DEVICE = "cpu" # Force CPU since you don't have GPU access
12
  print(f"🚀 Running on device: {DEVICE}")
13
 
14
- # Patch torch.load to automatically map CUDA tensors to CPU
15
- original_load = torch.load
16
- def patched_load(f, map_location=None, **kwargs):
17
- if map_location is None:
18
- map_location = 'cpu' # Always map to CPU
19
- return original_load(f, map_location=map_location, **kwargs)
20
- torch.load = patched_load
21
-
22
  # --- Global Model Initialization ---
23
  MODEL = None
24
 
@@ -29,43 +18,12 @@ def get_or_load_model():
29
  if MODEL is None:
30
  print("Model not loaded, initializing...")
31
  try:
32
- # Try multiple loading strategies for CPU
33
- print("Attempting to load model on CPU...")
34
-
35
- # Strategy 1: Direct CPU loading
36
- try:
37
- MODEL = ChatterboxTTS.from_pretrained("cpu")
38
- print("✅ Model loaded successfully with direct CPU method")
39
- except Exception as e1:
40
- print(f"Direct CPU loading failed: {e1}")
41
-
42
- # Strategy 2: Try with explicit map_location if supported
43
- try:
44
- MODEL = ChatterboxTTS.from_pretrained(DEVICE, map_location='cpu')
45
- print("✅ Model loaded successfully with map_location method")
46
- except Exception as e2:
47
- print(f"map_location method failed: {e2}")
48
-
49
- # Strategy 3: Load with default then move to CPU
50
- try:
51
- MODEL = ChatterboxTTS.from_pretrained()
52
- if hasattr(MODEL, 'to'):
53
- MODEL = MODEL.to('cpu')
54
- print("✅ Model loaded successfully with default then CPU move")
55
- except Exception as e3:
56
- print(f"All loading strategies failed. Last error: {e3}")
57
- raise e3
58
-
59
- # Ensure model is on CPU
60
- if hasattr(MODEL, 'to'):
61
- MODEL = MODEL.to('cpu')
62
- if hasattr(MODEL, 'device'):
63
- print(f"Model device: {MODEL.device}")
64
-
65
- print(f"Model loaded successfully on CPU")
66
-
67
  except Exception as e:
68
- print(f"CRITICAL: All model loading attempts failed: {e}")
69
  raise
70
  return MODEL
71
 
@@ -78,149 +36,81 @@ except Exception as e:
78
  def set_seed(seed: int):
79
  """Sets the random seed for reproducibility across torch, numpy, and random."""
80
  torch.manual_seed(seed)
81
- # Remove CUDA seed setting since we're on CPU only
 
 
82
  random.seed(seed)
83
  np.random.seed(seed)
84
 
85
- def intelligent_text_chunking(text: str, max_chunk_size: int = 250) -> List[str]:
86
  """
87
- Split text into chunks intelligently, preserving sentence boundaries and meaning.
88
 
89
  Args:
90
- text (str): The input text to chunk
91
- max_chunk_size (int): Maximum characters per chunk (default 250 for safety margin)
92
-
 
93
  Returns:
94
- List[str]: List of text chunks
95
  """
96
  if len(text) <= max_chunk_size:
97
  return [text]
98
 
99
  chunks = []
 
100
 
101
- # First, split by paragraphs
102
- paragraphs = text.split('\n\n')
103
-
104
- current_chunk = ""
105
-
106
- for paragraph in paragraphs:
107
- # If the paragraph itself is too long, split by sentences
108
- if len(paragraph) > max_chunk_size:
109
- sentences = re.split(r'(?<=[.!?])\s+', paragraph)
110
-
111
- for sentence in sentences:
112
- # If even a single sentence is too long, split by clauses
113
- if len(sentence) > max_chunk_size:
114
- clauses = re.split(r'(?<=[,;:])\s+', sentence)
115
-
116
- for clause in clauses:
117
- # If clause is still too long, force split at word boundaries
118
- if len(clause) > max_chunk_size:
119
- words = clause.split()
120
- temp_chunk = ""
121
-
122
- for word in words:
123
- if len(temp_chunk + " " + word) <= max_chunk_size:
124
- temp_chunk += (" " + word) if temp_chunk else word
125
- else:
126
- if temp_chunk:
127
- chunks.append(temp_chunk.strip())
128
- temp_chunk = word
129
-
130
- if temp_chunk:
131
- if len(current_chunk + " " + temp_chunk) <= max_chunk_size:
132
- current_chunk += (" " + temp_chunk) if current_chunk else temp_chunk
133
- else:
134
- if current_chunk:
135
- chunks.append(current_chunk.strip())
136
- current_chunk = temp_chunk
137
- else:
138
- # Add clause to current chunk if it fits
139
- if len(current_chunk + " " + clause) <= max_chunk_size:
140
- current_chunk += (" " + clause) if current_chunk else clause
141
- else:
142
- if current_chunk:
143
- chunks.append(current_chunk.strip())
144
- current_chunk = clause
145
- else:
146
- # Add sentence to current chunk if it fits
147
- if len(current_chunk + " " + sentence) <= max_chunk_size:
148
- current_chunk += (" " + sentence) if current_chunk else sentence
149
- else:
150
- if current_chunk:
151
- chunks.append(current_chunk.strip())
152
- current_chunk = sentence
153
- else:
154
- # Add paragraph to current chunk if it fits
155
- if len(current_chunk + "\n\n" + paragraph) <= max_chunk_size:
156
- current_chunk += ("\n\n" + paragraph) if current_chunk else paragraph
157
  else:
158
- if current_chunk:
159
- chunks.append(current_chunk.strip())
160
- current_chunk = paragraph
161
-
162
- # Add any remaining text
163
- if current_chunk:
164
- chunks.append(current_chunk.strip())
165
-
166
- return [chunk for chunk in chunks if chunk.strip()]
167
-
168
- def concatenate_audio_chunks(audio_chunks: List[Tuple[int, np.ndarray]],
169
- silence_duration: float = 0.3) -> Tuple[int, np.ndarray]:
170
- """
171
- Concatenate multiple audio chunks with silence between them.
172
-
173
- Args:
174
- audio_chunks: List of (sample_rate, audio_array) tuples
175
- silence_duration: Duration of silence between chunks in seconds
176
-
177
- Returns:
178
- Tuple[int, np.ndarray]: Combined (sample_rate, audio_array)
179
- """
180
- if not audio_chunks:
181
- return None
182
-
183
- sample_rate = audio_chunks[0][0]
184
- silence_samples = int(sample_rate * silence_duration)
185
- silence = np.zeros(silence_samples, dtype=audio_chunks[0][1].dtype)
186
-
187
- combined_audio = []
188
- for i, (sr, audio) in enumerate(audio_chunks):
189
- combined_audio.append(audio)
190
- # Add silence between chunks (but not after the last one)
191
- if i < len(audio_chunks) - 1:
192
- combined_audio.append(silence)
193
-
194
- return sample_rate, np.concatenate(combined_audio)
195
 
196
- @spaces.GPU # This decorator might not work on CPU, but keeping it for compatibility
197
- def generate_tts_audio_chunked(
198
  text_input: str,
199
  audio_prompt_path_input: str = None,
200
  exaggeration_input: float = 0.5,
201
  temperature_input: float = 0.8,
202
  seed_num_input: int = 0,
203
  cfgw_input: float = 0.5,
204
- chunk_size: int = 250,
205
- silence_between_chunks: float = 0.3
206
  ) -> tuple[int, np.ndarray]:
207
  """
208
- Generate high-quality speech audio from text using ChatterboxTTS model with intelligent chunking.
 
209
 
210
- This tool synthesizes natural-sounding speech from input text of any length by intelligently
211
- splitting long text into chunks. When a reference audio file is provided, it captures the
212
- speaker's voice characteristics and speaking style. The generated audio maintains consistency
213
- across chunks while avoiding hallucination issues.
214
 
215
  Args:
216
- text_input (str): The text to synthesize into speech (any length)
217
- audio_prompt_path_input (str, optional): File path or URL to the reference audio file. Defaults to None.
218
- exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0). Defaults to 0.5.
219
- temperature_input (float, optional): Controls randomness in generation (0.05-5.0). Defaults to 0.8.
220
- seed_num_input (int, optional): Random seed for reproducible results (0 for random). Defaults to 0.
221
  cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5.
222
- chunk_size (int, optional): Maximum characters per chunk. Defaults to 250.
223
- silence_between_chunks (float, optional): Silence duration between chunks in seconds. Defaults to 0.3.
224
 
225
  Returns:
226
  tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
@@ -230,20 +120,12 @@ def generate_tts_audio_chunked(
230
  if current_model is None:
231
  raise RuntimeError("TTS model is not loaded.")
232
 
233
- if not text_input.strip():
234
- raise ValueError("Text input cannot be empty.")
235
-
236
  if seed_num_input != 0:
237
  set_seed(int(seed_num_input))
238
 
239
- print(f"Processing text of {len(text_input)} characters")
240
 
241
- # Split text into intelligent chunks
242
- text_chunks = intelligent_text_chunking(text_input, chunk_size)
243
- print(f"Split into {len(text_chunks)} chunks")
244
-
245
- # Generate audio for each chunk
246
- audio_chunks = []
247
  generate_kwargs = {
248
  "exaggeration": exaggeration_input,
249
  "temperature": temperature_input,
@@ -253,47 +135,49 @@ def generate_tts_audio_chunked(
253
  if audio_prompt_path_input:
254
  generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
255
 
 
 
 
 
 
 
 
 
256
  for i, chunk in enumerate(text_chunks):
257
- print(f"Generating audio for chunk {i+1}/{len(text_chunks)}: '{chunk[:50]}...'")
258
 
259
- try:
260
- wav = current_model.generate(chunk, **generate_kwargs)
261
- audio_chunks.append((current_model.sr, wav.squeeze(0).numpy()))
262
- except Exception as e:
263
- print(f"Error generating audio for chunk {i+1}: {e}")
264
- # Continue with remaining chunks instead of failing completely
265
- continue
266
-
267
- if not audio_chunks:
268
- raise RuntimeError("Failed to generate audio for any chunks.")
269
 
270
- # Concatenate all audio chunks
271
- print("Concatenating audio chunks...")
272
- final_sample_rate, final_audio = concatenate_audio_chunks(audio_chunks, silence_between_chunks)
 
 
273
 
274
- print(f"Audio generation complete. Total duration: {len(final_audio) / final_sample_rate:.2f} seconds")
275
- return (final_sample_rate, final_audio)
276
 
277
  with gr.Blocks() as demo:
278
  gr.Markdown(
279
  """
280
- # Chatterbox TTS Demo with Intelligent Chunking
281
- Generate high-quality speech from text of any length with reference audio styling.
282
-
283
- **Features:**
284
- - ✅ No character limit - process text of any length
285
- - ✅ Intelligent chunking preserves sentence boundaries
286
- - ✅ Consistent voice across chunks
287
- - ✅ Prevents hallucination through proper segmentation
288
  """
289
  )
290
  with gr.Row():
291
  with gr.Column():
292
  text = gr.Textbox(
293
- value="Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible. This is just the beginning of our culinary adventure. We're going to explore flavors that have never been combined before, creating a symphony of taste that will revolutionize the way we think about cooking.",
294
- label="Text to synthesize (any length)",
295
- max_lines=10,
296
- lines=5
297
  )
298
  ref_wav = gr.Audio(
299
  sources=["upload", "microphone"],
@@ -302,36 +186,23 @@ with gr.Blocks() as demo:
302
  value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
303
  )
304
  exaggeration = gr.Slider(
305
- 0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5)", value=.5
306
  )
307
  cfg_weight = gr.Slider(
308
  0.2, 1, step=.05, label="CFG/Pace", value=0.5
309
  )
310
 
311
- with gr.Accordion("Advanced options", open=False):
312
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
313
  temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
314
- chunk_size = gr.Slider(150, 300, step=10, label="Chunk size (characters)", value=250)
315
- silence_duration = gr.Slider(0.1, 1.0, step=0.1, label="Silence between chunks (seconds)", value=0.3)
316
 
317
  run_btn = gr.Button("Generate", variant="primary")
318
 
319
  with gr.Column():
320
  audio_output = gr.Audio(label="Output Audio")
321
-
322
- with gr.Row():
323
- gr.Markdown(
324
- """
325
- **Tips:**
326
- - Longer texts are automatically split into chunks at natural boundaries (sentences, clauses)
327
- - Adjust chunk size if you notice quality issues
328
- - Increase silence duration for clearer separation between chunks
329
- - Use consistent reference audio for better voice continuity
330
- """
331
- )
332
 
333
  run_btn.click(
334
- fn=generate_tts_audio_chunked,
335
  inputs=[
336
  text,
337
  ref_wav,
@@ -339,8 +210,6 @@ with gr.Blocks() as demo:
339
  temp,
340
  seed_num,
341
  cfg_weight,
342
- chunk_size,
343
- silence_duration,
344
  ],
345
  outputs=[audio_output],
346
  )
 
1
+ import random
2
  import numpy as np
3
  import torch
4
  from chatterbox.src.chatterbox.tts import ChatterboxTTS
5
  import gradio as gr
6
  import spaces
 
 
7
 
8
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
9
  print(f"🚀 Running on device: {DEVICE}")
10
 
 
 
 
 
 
 
 
 
11
  # --- Global Model Initialization ---
12
  MODEL = None
13
 
 
18
  if MODEL is None:
19
  print("Model not loaded, initializing...")
20
  try:
21
+ MODEL = ChatterboxTTS.from_pretrained(DEVICE)
22
+ if hasattr(MODEL, 'to') and str(MODEL.device) != DEVICE:
23
+ MODEL.to(DEVICE)
24
+ print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  except Exception as e:
26
+ print(f"Error loading model: {e}")
27
  raise
28
  return MODEL
29
 
 
36
def set_seed(seed: int):
    """Sets the random seed for reproducibility across torch, numpy, and random."""
    torch.manual_seed(seed)
    if DEVICE == "cuda":
        # Seed the current CUDA device and all visible devices alike.
        for cuda_seeder in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seeder(seed)
    random.seed(seed)
    np.random.seed(seed)
44
 
45
def chunk_text(text: str, max_chunk_size: int = 300, overlap: int = 50) -> list[str]:
    """
    Split text into chunks with optional overlap for better continuity.

    Chunks are cut preferentially at the last sentence terminator (. ! ?)
    inside the window, falling back to the last space, and finally to a hard
    cut at ``max_chunk_size``. The scan position always moves forward, so the
    function terminates for every input (the original version could loop
    forever when a sentence boundary fell within ``overlap`` characters of
    the chunk start, which made ``end - overlap`` step backwards).

    Args:
        text (str): The text to chunk
        max_chunk_size (int): Maximum characters per chunk
        overlap (int): Number of characters to overlap between chunks

    Returns:
        list[str]: List of non-empty, stripped text chunks
    """
    if len(text) <= max_chunk_size:
        return [text]

    chunks = []
    start = 0
    n = len(text)

    while start < n:
        end = min(start + max_chunk_size, n)

        # If this isn't the last chunk, try to break at a sentence or word boundary
        if end < n:
            # Prefer the last sentence terminator inside the window
            boundary = max(text.rfind(ch, start, end) for ch in ".!?")
            if boundary > start:
                end = boundary + 1
            else:
                # No usable sentence boundary: fall back to the last space
                last_space = text.rfind(" ", start, end)
                if last_space > start:
                    end = last_space

        chunk = text[start:end].strip()
        if chunk:  # skip whitespace-only slices so callers never see "" chunks
            chunks.append(chunk)

        if end >= n:
            break
        # Overlap with the previous chunk, but always advance to guarantee termination
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
+ @spaces.GPU
89
+ def generate_tts_audio(
90
  text_input: str,
91
  audio_prompt_path_input: str = None,
92
  exaggeration_input: float = 0.5,
93
  temperature_input: float = 0.8,
94
  seed_num_input: int = 0,
95
  cfgw_input: float = 0.5,
96
+ chunk_size: int = 300
 
97
  ) -> tuple[int, np.ndarray]:
98
  """
99
+ Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
100
+ For long texts, automatically chunks the input for better processing.
101
 
102
+ This tool synthesizes natural-sounding speech from input text. When a reference audio file
103
+ is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
104
+ maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
 
105
 
106
  Args:
107
+ text_input (str): The text to synthesize into speech
108
+ audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
109
+ exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
110
+ temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
111
+ seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
112
  cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5.
113
+ chunk_size (int, optional): Maximum characters per chunk for long texts. Defaults to 300.
 
114
 
115
  Returns:
116
  tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
 
120
  if current_model is None:
121
  raise RuntimeError("TTS model is not loaded.")
122
 
 
 
 
123
  if seed_num_input != 0:
124
  set_seed(int(seed_num_input))
125
 
126
+ print(f"Generating audio for text: '{text_input[:50]}...' (Length: {len(text_input)} chars)")
127
 
128
+ # Handle optional audio prompt
 
 
 
 
 
129
  generate_kwargs = {
130
  "exaggeration": exaggeration_input,
131
  "temperature": temperature_input,
 
135
  if audio_prompt_path_input:
136
  generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
137
 
138
+ # Chunk the text if it's longer than chunk_size
139
+ text_chunks = chunk_text(text_input, chunk_size)
140
+ print(f"Processing {len(text_chunks)} chunk(s)")
141
+
142
+ # Generate audio for each chunk
143
+ audio_segments = []
144
+ sample_rate = None
145
+
146
  for i, chunk in enumerate(text_chunks):
147
+ print(f"Processing chunk {i+1}/{len(text_chunks)}: '{chunk[:30]}...'")
148
 
149
+ wav = current_model.generate(
150
+ chunk,
151
+ **generate_kwargs
152
+ )
153
+
154
+ if sample_rate is None:
155
+ sample_rate = current_model.sr
156
+
157
+ audio_segments.append(wav.squeeze(0).numpy())
 
158
 
159
+ # Concatenate all audio segments
160
+ if len(audio_segments) == 1:
161
+ final_audio = audio_segments[0]
162
+ else:
163
+ final_audio = np.concatenate(audio_segments, axis=0)
164
 
165
+ print("Audio generation complete.")
166
+ return (sample_rate, final_audio)
167
 
168
  with gr.Blocks() as demo:
169
  gr.Markdown(
170
  """
171
+ # Chatterbox TTS Demo
172
+ Generate high-quality speech from text with reference audio styling.
 
 
 
 
 
 
173
  """
174
  )
175
  with gr.Row():
176
  with gr.Column():
177
  text = gr.Textbox(
178
+ value="Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.",
179
+ label="Text to synthesize", # Removed "max chars 300" from label
180
+ max_lines=5
 
181
  )
182
  ref_wav = gr.Audio(
183
  sources=["upload", "microphone"],
 
186
  value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
187
  )
188
  exaggeration = gr.Slider(
189
+ 0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5
190
  )
191
  cfg_weight = gr.Slider(
192
  0.2, 1, step=.05, label="CFG/Pace", value=0.5
193
  )
194
 
195
+ with gr.Accordion("More options", open=False):
196
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
197
  temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
 
 
198
 
199
  run_btn = gr.Button("Generate", variant="primary")
200
 
201
  with gr.Column():
202
  audio_output = gr.Audio(label="Output Audio")
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  run_btn.click(
205
+ fn=generate_tts_audio,
206
  inputs=[
207
  text,
208
  ref_wav,
 
210
  temp,
211
  seed_num,
212
  cfg_weight,
 
 
213
  ],
214
  outputs=[audio_output],
215
  )