codewithjarair commited on
Commit
d996a8f
·
verified ·
1 Parent(s): 621e6b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +205 -308
app.py CHANGED
@@ -1,339 +1,236 @@
1
- import gradio as gr
2
  import os
3
- from voice_cloning_engine import VoiceCloningEngine
 
 
 
 
 
4
  import tempfile
5
- from pathlib import Path
6
-
7
 
8
- # Initialize the voice cloning engine
9
- print("Initializing Voice Cloning Engine...")
10
- engine = VoiceCloningEngine()
11
- print("Engine ready!")
12
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- def generate_voice(
15
- text: str,
16
- reference_audio,
17
- exaggeration: float,
18
- cfg: float,
19
- seed: int,
20
- max_words_per_chunk: int,
21
- use_seed: bool,
22
- language: str
23
- ):
24
  """
25
- Generate cloned voice audio using Chatterbox
26
-
27
- Args:
28
- text: Text to synthesize
29
- reference_audio: Uploaded reference audio file
30
- exaggeration: Emotion exaggeration (0.0-1.0+)
31
- cfg: Classifier-Free Guidance weight (0.0-1.0)
32
- seed: Random seed
33
- max_words_per_chunk: Max words per chunk
34
- use_seed: Whether to use the seed value
35
- language: Language code for multilingual model
36
-
37
- Returns:
38
- Tuple of (audio_path, info_text)
39
  """
40
- try:
41
- # Validate inputs
42
- if not text or text.strip() == "":
43
- return None, "❌ Error: Please enter some text to synthesize."
44
-
45
- if reference_audio is None:
46
- return None, "❌ Error: Please upload a reference audio file."
47
-
48
- # Get reference audio path
49
- if isinstance(reference_audio, str):
50
- ref_audio_path = reference_audio
51
  else:
52
- ref_audio_path = reference_audio.name if hasattr(reference_audio, 'name') else reference_audio
53
-
54
- # Word count
55
- word_count = len(text.split())
56
-
57
- # Prepare seed
58
- actual_seed = seed if use_seed else None
59
-
60
- # Generate status message
61
- status = f"🎙️ **Generating speech with Chatterbox...**\n\n"
62
- status += f"📝 Words: {word_count}\n"
63
-
64
- if word_count > max_words_per_chunk:
65
- num_chunks = (word_count // max_words_per_chunk) + 1
66
- status += f"📦 Will be split into ~{num_chunks} chunks\n"
67
-
68
- status += f"🎭 Exaggeration: {exaggeration}\n"
69
- status += f"🎚️ CFG: {cfg}\n"
70
- status += f"🎲 Seed: {seed if use_seed else 'Random'}\n"
71
- status += f"🌍 Language: {language.upper()}\n"
72
-
73
- print(status)
74
-
75
- # Generate audio
76
- output_path = engine.generate_speech(
77
- text=text,
78
- reference_audio_path=ref_audio_path,
79
- exaggeration=exaggeration,
80
- cfg=cfg,
81
- seed=actual_seed,
82
- max_words_per_chunk=max_words_per_chunk,
83
- language=language
84
- )
85
-
86
- # Get duration
87
- duration = engine.get_audio_duration(output_path)
88
-
89
- # Success message
90
- success_msg = f"✅ **Generation Complete!**\n\n"
91
- success_msg += f"📊 Audio Duration: {duration:.2f} seconds\n"
92
- success_msg += f"📝 Words Synthesized: {word_count}\n"
93
- success_msg += f"⚡ Speed: {word_count/duration:.1f} words/second\n"
94
- success_msg += f"\n💧 *Audio includes Perth watermark for authentication*"
95
-
96
- return output_path, success_msg
97
 
98
- except Exception as e:
99
- error_msg = f"❌ **Error during generation:**\n\n{str(e)}"
100
- print(error_msg)
101
- return None, error_msg
102
-
103
-
104
- def update_seed_visibility(use_seed):
105
- """Toggle seed input visibility"""
106
- return gr.update(visible=use_seed)
107
 
 
 
 
 
 
 
 
 
 
108
 
109
- def estimate_chunks(text, max_words):
110
- """Estimate number of chunks for given text"""
111
- if not text:
112
- return "📦 Chunks: 0"
 
 
 
 
 
113
 
114
- word_count = len(text.split())
115
- if word_count <= max_words:
116
- return f"📦 Chunks: 1 (Text is within limit)"
117
- else:
118
- num_chunks = (word_count // max_words) + 1
119
- return f"📦 Estimated Chunks: {num_chunks}"
120
-
121
-
122
- # Create Gradio interface
123
- with gr.Blocks(
124
- title="🎙️ Chatterbox TTS Voice Cloning",
125
- theme=gr.themes.Soft(
126
- primary_hue="blue",
127
- secondary_hue="slate",
128
- )
129
- ) as app:
130
 
131
- gr.Markdown(
132
- """
133
- # 🎙️ Resemble AI Chatterbox Voice Cloning
134
-
135
- Clone any voice by providing a reference audio sample! Powered by **Chatterbox Turbo** - the state-of-the-art, open-source TTS model.
136
 
137
- ### 📋 Instructions:
138
- 1. **Upload** a reference audio file (10+ seconds recommended, WAV preferred)
139
- 2. **Enter** the text you want to synthesize
140
- 3. **Adjust** emotion exaggeration and CFG if needed (optional)
141
- 4. **Click** Generate to create the cloned voice
142
-
143
- ### ✨ Special Features:
144
- - 🎭 **Paralinguistic Tags**: Use [laugh], [chuckle], [cough], [sigh] in your text
145
- - 🎚️ **Emotion Control**: Adjust exaggeration from monotone to expressive
146
- - ⚡ **Auto-chunking**: Long texts automatically split for better quality
147
- - 💧 **Perth Watermark**: All outputs include imperceptible authentication watermark
148
 
149
- **Note**: Outperforms ElevenLabs in blind evaluations • MIT Licensed • Open Source
150
- """
151
- )
152
 
153
- with gr.Row():
154
- with gr.Column(scale=1):
155
- # Reference Audio Upload
156
- gr.Markdown("### 🎵 Reference Audio")
157
- reference_audio = gr.Audio(
158
- label="Upload Reference Audio",
159
- type="filepath",
160
- sources=["upload", "microphone"]
161
- )
162
 
163
- gr.Markdown(
164
- """
165
- 💡 **Tip**: Use clear audio (10-30 seconds) with minimal background noise for best results.
166
- """
 
 
 
 
167
  )
168
 
169
- # Text Input
170
- gr.Markdown("### 📝 Text to Synthesize")
171
- text_input = gr.Textbox(
172
- label="Enter Text",
173
- placeholder="Type or paste the text you want to convert to speech...",
174
- lines=8,
175
- max_lines=20
176
- )
177
 
178
- chunk_estimate = gr.Markdown("📦 Chunks: 0")
179
 
180
- with gr.Column(scale=1):
181
- # Parameters
182
- gr.Markdown("### ⚙️ Generation Parameters")
183
 
184
- with gr.Accordion("🎛️ Basic Settings", open=True):
185
- exaggeration = gr.Slider(
186
- minimum=0.0,
187
- maximum=1.5,
188
- value=0.5,
189
- step=0.1,
190
- label="Emotion Exaggeration",
191
- info="Controls emotion intensity (0.0=monotone, 0.5=natural, 1.0+=expressive)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  )
193
-
194
- cfg = gr.Slider(
195
- minimum=0.0,
196
- maximum=1.0,
197
- value=0.5,
198
- step=0.1,
199
- label="CFG (Classifier-Free Guidance)",
200
- info="Lower values for faster speech, higher for more deliberate pacing"
201
  )
202
 
203
- max_words_per_chunk = gr.Slider(
204
- minimum=50,
205
- maximum=500,
206
- value=300,
207
- step=50,
208
- label="Max Words Per Chunk",
209
- info="Texts longer than this will be auto-chunked"
210
- )
211
-
212
- with gr.Accordion("🔧 Advanced Settings", open=False):
213
- language = gr.Dropdown(
214
- choices=[
215
- "en", "es", "fr", "de", "it", "pt", "ru", "zh",
216
- "ja", "ko", "ar", "hi", "nl", "pl", "tr", "sv",
217
- "no", "da", "fi", "el", "he", "ms", "sw"
218
- ],
219
- value="en",
220
- label="Language",
221
- info="For multilingual model (English by default)"
222
- )
223
 
224
- use_seed = gr.Checkbox(
225
- label="Use Fixed Seed (for reproducibility)",
226
- value=False
227
- )
 
 
 
 
 
 
 
 
228
 
229
- seed_input = gr.Number(
230
- label="Random Seed",
231
- value=42,
232
- precision=0,
233
- visible=False
234
- )
235
-
236
- # Generate Button
237
- generate_btn = gr.Button(
238
- "🎙️ Generate Voice",
239
- variant="primary",
240
- size="lg"
241
- )
242
-
243
- # Output
244
- gr.Markdown("### 🔊 Generated Audio")
245
- output_audio = gr.Audio(
246
- label="Generated Speech",
247
- type="filepath"
248
- )
249
-
250
- output_info = gr.Markdown("")
251
-
252
- # Examples
253
- gr.Markdown("### 📚 Example Texts")
254
- gr.Examples(
255
- examples=[
256
- ["Hello! This is a test of the Chatterbox voice cloning system. I hope it sounds natural and clear."],
257
- ["Hi there [chuckle], thanks for calling! How can I help you today?"],
258
- ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet."],
259
- ["Artificial intelligence has made remarkable progress in recent years, particularly in the field of natural language processing and speech synthesis."],
260
- ["Once upon a time, in a land far away, there lived a curious inventor who dreamed of creating machines that could speak with human voices."],
261
- ["Oh wow [laugh], that's amazing! I can't believe it actually works this well [chuckle]."]
262
- ],
263
- inputs=text_input,
264
- label="Click to load example text (Turbo model supports [laugh], [chuckle], [cough], etc.)"
265
- )
266
-
267
- # Event handlers
268
- use_seed.change(
269
- fn=update_seed_visibility,
270
- inputs=use_seed,
271
- outputs=seed_input
272
- )
273
-
274
- text_input.change(
275
- fn=estimate_chunks,
276
- inputs=[text_input, max_words_per_chunk],
277
- outputs=chunk_estimate
278
- )
279
-
280
- max_words_per_chunk.change(
281
- fn=estimate_chunks,
282
- inputs=[text_input, max_words_per_chunk],
283
- outputs=chunk_estimate
284
- )
285
-
286
- generate_btn.click(
287
- fn=generate_voice,
288
- inputs=[
289
- text_input,
290
- reference_audio,
291
- exaggeration,
292
- cfg,
293
- seed_input,
294
- max_words_per_chunk,
295
- use_seed,
296
- language
297
- ],
298
- outputs=[output_audio, output_info]
299
- )
300
-
301
- gr.Markdown(
302
- """
303
- ---
304
- ### ℹ️ About Chatterbox
305
-
306
- This app uses **Resemble AI's Chatterbox Turbo** - the fastest open-source TTS model. It automatically handles:
307
- - ✅ Voice cloning with just 5-30 seconds of audio
308
- - ✅ Text chunking for long inputs (auto-concatenation)
309
- - ✅ Emotion exaggeration control (unique to Chatterbox)
310
- - ✅ Paralinguistic tags: [laugh], [chuckle], [cough], [sigh]
311
- - ✅ Perth watermarking for audio authentication
312
-
313
- **Models Available**:
314
- - 🚀 **Turbo**: Fastest, supports paralinguistic tags
315
- - 🎯 **Standard**: High quality with emotion control
316
- - 🌍 **Multilingual**: 23 languages supported
317
-
318
- **Source**: [GitHub - Resemble AI Chatterbox](https://github.com/resemble-ai/chatterbox)
319
 
320
- 💡 **Tips for best results**:
321
- - Use 10-30 seconds of clear reference audio
322
- - WAV format at 24kHz+ recommended
323
- - Single speaker, minimal background noise
324
- - Try exaggeration=0.7+ for more expressive output
325
- - Lower CFG (~0.3) for faster speaking pace
326
- - Use paralinguistic tags like [chuckle] for reactions
327
 
328
- 🏆 **Consistently outperforms ElevenLabs** in blind evaluations ([Podonos testing](https://www.resemble.ai/chatterbox/))
329
- """
330
- )
331
-
332
 
333
- # Launch the app
334
  if __name__ == "__main__":
335
- app.launch(
336
- server_name="0.0.0.0",
337
- server_port=7860,
338
- share=False
339
- )
 
 
1
  import os
2
+ import random
3
+ import numpy as np
4
+ import torch
5
+ import torchaudio
6
+ import gradio as gr
7
+ import re
8
  import tempfile
9
+ from chatterbox.tts import ChatterboxTTS
 
10
 
11
# Set device: prefer CUDA when a GPU is available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
13
 
14
def set_seed(seed: int) -> int:
    """Seed every RNG (torch, CUDA, random, numpy) for reproducibility.

    Args:
        seed: Seed value; 0 means "pick a fresh random seed".

    Returns:
        The seed that was actually applied (useful when 0 was passed).
    """
    if seed == 0:
        seed = random.randint(1, 1000000)
    torch.manual_seed(seed)
    # Only touch the CUDA RNGs when a GPU is actually present; this avoids
    # needless CUDA context initialization / warnings on CPU-only hosts.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
    return seed
24
 
25
def split_text(text, max_chars=250):
    """
    Intelligent text chunking with sentence boundary detection.

    Greedily packs whole sentences into chunks of at most ``max_chars``
    characters. A sentence that is itself longer than the limit is broken
    up on commas / whitespace as a fallback.
    """
    pieces = []
    pending = ""

    # Sentence boundaries: terminal punctuation ([.!?]) followed by whitespace.
    for sentence in re.split(r'(?<=[.!?])\s+', text.strip()):
        if len(pending) + len(sentence) <= max_chars:
            # The sentence still fits into the chunk under construction.
            pending += sentence + " "
            continue

        # Flush whatever was accumulated so far.
        if pending:
            pieces.append(pending.strip())

        if len(sentence) <= max_chars:
            # Start a new chunk with this sentence.
            pending = sentence + " "
            continue

        # Oversized sentence: split further on commas or plain spaces.
        fragment = ""
        for word in re.split(r'(?<=,)\s+|\s+', sentence):
            if len(fragment) + len(word) <= max_chars:
                fragment += word + " "
            else:
                if fragment:
                    pieces.append(fragment.strip())
                fragment = word + " "
        # Leftover fragment becomes the chunk under construction.
        pending = fragment

    if pending:
        pieces.append(pending.strip())

    return pieces
 
 
 
 
 
 
 
 
62
 
63
def load_model():
    """Load the Chatterbox TTS model onto DEVICE; return None on failure."""
    try:
        print(f"Loading Chatterbox TTS model on {DEVICE}...")
        return ChatterboxTTS.from_pretrained(DEVICE)
    except Exception as e:
        # Loading can fail (missing weights, no GPU, bad env) — report and
        # signal failure to the caller instead of crashing the UI.
        print(f"Error loading model: {e}")
        return None
72
 
73
def generate_tts(model, text, ref_audio, exaggeration, cfg_weight, temperature, seed, progress=gr.Progress()):
    """
    Generate TTS audio from text, handling long scripts via chunking.

    Args:
        model: Loaded ChatterboxTTS instance, or None (a load is attempted).
        text: Script to synthesize.
        ref_audio: Filepath of the reference audio for voice cloning.
        exaggeration: Voice-trait exaggeration passed through to the model.
        cfg_weight: Classifier-free-guidance / pace weight.
        temperature: Sampling temperature.
        seed: RNG seed; 0 selects a random seed.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Tuple of (output_wav_path_or_None, status_message).
    """
    if model is None:
        # Try to load if not already loaded (for HF Spaces persistence).
        # NOTE(review): the reloaded model stays local to this call — it is
        # never written back to the gr.State, so a failed startup load means
        # a reload on every click. Confirm whether that is intended.
        model = load_model()
        if model is None:
            return None, "Error: Model could not be loaded. Check your environment/GPU."

    # Guard against None as well as empty/whitespace-only text
    # (text.strip() on None would raise AttributeError).
    if not text or not text.strip():
        return None, "Error: Please enter some text."

    if ref_audio is None:
        return None, "Error: Please upload a reference audio file for voice cloning."

    # Seed every RNG so a fixed seed reproduces the same output.
    actual_seed = set_seed(int(seed))

    # Split the script into sentence-aligned chunks the model can handle.
    chunks = split_text(text)
    total_chunks = len(chunks)

    if total_chunks == 0:
        return None, "Error: No valid text to process."

    all_wavs = []

    try:
        for i, chunk in enumerate(chunks):
            progress((i / total_chunks), desc=f"Processing chunk {i+1}/{total_chunks}")

            # Generate audio for this chunk.
            # Chatterbox.generate expects: text, audio_prompt_path,
            # exaggeration, temperature, cfg_weight, etc.
            wav = model.generate(
                chunk,
                audio_prompt_path=ref_audio,
                exaggeration=exaggeration,
                temperature=temperature,
                cfg_weight=cfg_weight
            )

            # wav is usually a torch tensor [1, T] or [T];
            # normalize to [1, T] so all chunks concatenate cleanly.
            if wav.dim() == 1:
                wav = wav.unsqueeze(0)

            all_wavs.append(wav.cpu())

        if not all_wavs:
            return None, "Error: No audio was generated."

        # Concatenate all audio chunks along the time dimension (last dim).
        final_wav = torch.cat(all_wavs, dim=-1)

        # Create the temp file and close its handle BEFORE torchaudio writes
        # to the path: writing through a second handle while the first is
        # still open fails on Windows. delete=False keeps the file around
        # for Gradio to serve.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        torchaudio.save(output_path, final_wav, model.sr)

        return output_path, f"Successfully generated audio with seed {actual_seed}. Total chunks: {total_chunks}."

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Error during generation: {str(e)}"
138
+
139
+ # Define the Gradio Interface
140
# Define the Gradio Interface
def create_ui():
    """Build and return the Gradio Blocks UI for the voice-cloning app."""
    # Model is loaded once and stored in state.
    # NOTE(review): this gr.State is instantiated *outside* the Blocks
    # context below — confirm the installed Gradio version registers it
    # correctly when used as an event input.
    model_state = gr.State(None)

    with gr.Blocks(theme=gr.themes.Soft(), title="Chatterbox Voice Clone TTS") as demo:
        gr.Markdown("# 🗣️ Voice Cloning TTS Chatterbox")
        gr.Markdown("""
        Clone any voice using a short reference audio clip. This application is optimized for long scripts
        through intelligent sentence-based chunking and sequential processing.
        """)

        with gr.Row():
            # Left column: inputs (script, reference audio, parameters).
            with gr.Column(scale=1):
                text_input = gr.Textbox(
                    label="Script",
                    placeholder="Enter your long script here. The app will automatically handle chunking...",
                    lines=10,
                    value="Welcome to the Chatterbox voice cloning application. This tool allows you to generate high-quality speech from long scripts by automatically splitting them into manageable segments. Simply upload a reference audio clip of the voice you want to clone, and adjust the parameters to your liking."
                )
                ref_audio = gr.Audio(
                    label="Reference Audio (Voice to Clone)",
                    type="filepath",
                    sources=["upload", "microphone"]
                )

                with gr.Row():
                    exaggeration = gr.Slider(
                        0.1, 1.0, value=0.5, step=0.05,
                        label="Exaggeration",
                        info="Default 0.5. Extreme values (>0.8) may be unstable."
                    )
                    cfg_weight = gr.Slider(
                        0.0, 1.0, value=0.5, step=0.05,
                        label="CFG/Pace",
                        info="Control the pace and guidance scale."
                    )

                with gr.Accordion("Advanced Options", open=False):
                    seed = gr.Number(
                        label="Seed",
                        value=0,
                        precision=0,
                        info="Set to 0 for random seed each time."
                    )
                    temperature = gr.Slider(
                        0.1, 2.0, value=1.0, step=0.05,
                        label="Temperature",
                        info="Higher values increase randomness and expressiveness."
                    )

                generate_btn = gr.Button("Generate Audio", variant="primary")

            # Right column: generated audio, status, and usage documentation.
            with gr.Column(scale=1):
                audio_output = gr.Audio(label="Generated Speech", type="filepath")
                status_msg = gr.Textbox(label="Status", interactive=False)

                gr.Markdown("### 📖 Documentation")
                gr.Markdown("""
                ### Features
                - **Voice Cloning**: Provide a clear 5-10 second reference clip.
                - **Intelligent Chunking**: Scripts are split at sentence boundaries (approx. 250 chars) to ensure smooth transitions and avoid memory issues.
                - **Sequential Processing**: Audio chunks are generated one-by-one and concatenated for long-form content.
                - **Parameter Control**:
                - **Exaggeration**: Intensity of cloned voice traits.
                - **CFG/Pace**: Balance between text adherence and reference voice speed.
                - **Temperature**: Randomness of the output.

                ### Tips
                - Use a high-quality, noise-free reference audio for best results.
                - For dramatic speech, try higher **Exaggeration** and lower **CFG**.
                - If the output sounds unnatural, try a different **Seed** or adjust **Temperature**.
                """)

        # Event handling: wire the button to the synthesis function.
        generate_btn.click(
            fn=generate_tts,
            inputs=[
                model_state,
                text_input,
                ref_audio,
                exaggeration,
                cfg_weight,
                temperature,
                seed
            ],
            outputs=[audio_output, status_msg]
        )

        # Load model on startup.
        # NOTE(review): demo.load fires on every page load / session, so the
        # model is reloaded per visitor — consider loading once at module
        # level if that proves too heavy.
        demo.load(fn=load_model, outputs=model_state)

    return demo
 
 
 
232
 
 
233
# Script entry point: build the UI and launch the server.
if __name__ == "__main__":
    demo_app = create_ui()
    # Bind to all interfaces so the app is reachable from inside containers
    # (Docker / Hugging Face Spaces deployments).
    demo_app.launch(server_name="0.0.0.0")