crackuser committed on
Commit
930a8ef
·
verified ·
1 Parent(s): d6ad7c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -114
app.py CHANGED
@@ -19,38 +19,31 @@ else:
19
 
20
  print(f"🚀 Running on device: {DEVICE}")
21
 
22
- # Global models
23
  ENGLISH_MODEL = None
24
  MULTILINGUAL_MODEL = None
25
 
26
  def load_chatterbox_models():
27
- """Load Chatterbox models with proper error handling"""
28
  global ENGLISH_MODEL, MULTILINGUAL_MODEL
29
 
30
- if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
31
- try:
32
- from chatterbox.tts import ChatterboxTTS
33
- from chatterbox.mtl_tts import ChatterboxMultilingualTTS
34
-
35
- print("🔄 Loading Chatterbox English model...")
36
- ENGLISH_MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
37
- print("✅ English model loaded!")
38
-
39
- print("🔄 Loading Chatterbox Multilingual model...")
40
- MULTILINGUAL_MODEL = ChatterboxMultilingualTTS.from_pretrained(device=DEVICE)
41
- print("✅ Multilingual model loaded!")
42
-
43
- return True
44
-
45
- except Exception as e:
46
- print(f"❌ Error loading Chatterbox models: {e}")
47
- return False
48
-
49
- return True
50
 
51
  def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
52
  """
53
- Voice-to-Voice Cloning: Transform input audio using reference voice
 
54
  """
55
  try:
56
  if not reference_audio:
@@ -59,21 +52,27 @@ def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggera
59
  if not input_audio:
60
  return None, "❌ Please upload input audio (content to transform)!"
61
 
62
- if not load_chatterbox_models():
63
- return None, "❌ Chatterbox models failed to load!"
64
 
65
- # Extract text from input audio using Whisper (for content)
66
  try:
67
  import whisper
 
68
  whisper_model = whisper.load_model("base")
69
  result = whisper_model.transcribe(input_audio)
70
  extracted_text = result["text"]
71
- print(f"📝 Extracted text from input audio: {extracted_text}")
72
  except Exception as e:
73
- print(f"⚠️ Whisper transcription failed: {e}")
74
- extracted_text = "Voice cloning demonstration using the uploaded audio content."
 
 
 
 
 
75
 
76
- # Create output file
 
77
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
78
  output_path = tmp_file.name
79
 
@@ -96,20 +95,21 @@ def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggera
96
  cfg=cfg
97
  )
98
 
99
- # Save generated audio
100
  torchaudio.save(output_path, wav.cpu(), model.sr)
101
 
102
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
103
- return output_path, f"✅ Voice-to-Voice Cloning Complete!\n🎤 Reference voice applied to: '{extracted_text[:100]}...'\n🎛️ Settings: Exaggeration={exaggeration}, CFG={cfg}"
104
  else:
105
  return None, "❌ Generated audio file is empty!"
106
 
107
  except Exception as e:
108
  return None, f"❌ Voice-to-Voice cloning error: {str(e)}"
109
 
110
- def text_to_voice_cloning(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5, speed=1.0):
111
  """
112
- Text-to-Voice Cloning: Generate speech from text using reference voice
 
113
  """
114
  try:
115
  if not reference_audio:
@@ -118,13 +118,16 @@ def text_to_voice_cloning(reference_audio, input_text, language="en", exaggerati
118
  if not input_text or not input_text.strip():
119
  return None, "❌ Please enter text to convert!"
120
 
121
- if not load_chatterbox_models():
122
- return None, "❌ Chatterbox models failed to load!"
123
 
124
- print(f"🎤 Generating speech with Chatterbox...")
125
- print(f"📝 Text: {input_text[:100]}...")
 
 
126
 
127
- # Create output file
 
128
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
129
  output_path = tmp_file.name
130
 
@@ -151,60 +154,65 @@ def text_to_voice_cloning(reference_audio, input_text, language="en", exaggerati
151
  torchaudio.save(output_path, wav.cpu(), model.sr)
152
 
153
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
154
- return output_path, f"✅ Text-to-Voice Cloning Complete!\n📝 Generated: '{input_text[:100]}...'\n🎛️ Settings: Exaggeration={exaggeration}, CFG={cfg}"
155
  else:
156
  return None, "❌ Generated audio file is empty!"
157
 
158
  except Exception as e:
159
- return None, f"❌ Text-to-Voice cloning error: {str(e)}"
160
 
161
  # Try to load models at startup
162
  try:
163
  models_loaded = load_chatterbox_models()
164
- startup_message = "✅ Chatterbox Models Loaded Successfully!" if models_loaded else "❌ Failed to Load Chatterbox Models"
165
  except Exception as e:
166
  models_loaded = False
167
- startup_message = f"❌ Startup Error: {str(e)}"
168
 
169
- # Create Gradio interface
170
  with gr.Blocks(
171
- title="🎭 Complete Chatterbox Voice Cloning Studio",
172
  theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
173
  ) as demo:
174
 
175
  # Header
176
  gr.HTML("""
177
  <div style="text-align: center; padding: 20px;">
178
- <h1 style="color: #8B5CF6; margin-bottom: 10px;">🎭 Complete Chatterbox Voice Cloning Studio</h1>
179
- <p style="color: #666; font-size: 18px;">Voice-to-Voice & Text-to-Speech with Emotion Control</p>
180
- <p style="color: #888; font-size: 14px;">Powered by Resemble AI's Chatterbox - The Model We Discussed!</p>
181
  </div>
182
  """)
183
 
184
  # Model Status
185
- status_color = "#d4edda" if models_loaded else "#f8d7da"
186
  gr.HTML(f"""
187
- <div style="text-align: center; padding: 15px; background: {status_color}; border-radius: 10px; margin-bottom: 20px;">
188
  <strong>🤖 Chatterbox Status:</strong> {startup_message}
189
  </div>
190
  """)
191
 
192
- with gr.Row():
193
- with gr.Column():
194
- # Reference Voice Section
195
- gr.HTML("<h3 style='color: #8B5CF6;'>🎤 Reference Voice (Voice to Clone)</h3>")
196
- reference_audio = gr.Audio(
197
- label="Upload Reference Audio (5+ seconds)",
198
- type="filepath",
199
- sources=["upload", "microphone"]
200
- )
201
- gr.HTML("<p style='color: #666; font-size: 14px;'>📌 This is the voice that will be cloned and applied to your content</p>")
202
 
203
- # Tabs for different input methods
204
  with gr.Tabs():
205
- # Tab 1: Voice-to-Voice Cloning
206
  with gr.TabItem("🎵 Voice-to-Voice Cloning"):
207
- gr.HTML("<p style='margin-bottom: 15px;'>Upload audio content and transform it using the reference voice</p>")
 
 
 
 
 
 
 
 
208
 
209
  with gr.Row():
210
  with gr.Column():
@@ -229,7 +237,7 @@ with gr.Blocks(
229
  ("🇷🇺 Russian", "ru")
230
  ],
231
  value="en",
232
- label="Language"
233
  )
234
 
235
  voice_exaggeration = gr.Slider(
@@ -241,11 +249,11 @@ with gr.Blocks(
241
  )
242
 
243
  voice_cfg = gr.Slider(
244
- minimum=0.2,
245
  maximum=1.0,
246
  step=0.1,
247
  value=0.5,
248
- label="🎛️ CFG Scale"
249
  )
250
 
251
  voice_clone_btn = gr.Button(
@@ -253,17 +261,37 @@ with gr.Blocks(
253
  variant="primary",
254
  size="lg"
255
  )
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
- # Tab 2: Text-to-Voice Cloning
258
  with gr.TabItem("📝 Text-to-Speech Cloning"):
259
- gr.HTML("<p style='margin-bottom: 15px;'>Enter text and generate speech using the reference voice</p>")
 
 
 
 
 
 
 
 
260
 
261
  with gr.Row():
262
  with gr.Column():
263
  text_input = gr.Textbox(
264
  label="Text to Convert to Speech",
265
  placeholder="Enter the text you want to speak in the cloned voice...",
266
- lines=4,
267
  max_lines=8
268
  )
269
 
@@ -280,7 +308,7 @@ with gr.Blocks(
280
  ("🇯🇵 Japanese", "ja")
281
  ],
282
  value="en",
283
- label="Language"
284
  )
285
 
286
  text_exaggeration = gr.Slider(
@@ -292,11 +320,11 @@ with gr.Blocks(
292
  )
293
 
294
  text_cfg = gr.Slider(
295
- minimum=0.2,
296
  maximum=1.0,
297
  step=0.1,
298
  value=0.5,
299
- label="🎛️ CFG Scale"
300
  )
301
 
302
  text_clone_btn = gr.Button(
@@ -304,69 +332,46 @@ with gr.Blocks(
304
  variant="secondary",
305
  size="lg"
306
  )
307
-
308
- # Output Section
309
- gr.HTML("<h3 style='color: #8B5CF6;'>🎵 Generated Audio Output</h3>")
310
- with gr.Row():
311
- audio_output = gr.Audio(
312
- label="Cloned Voice Result",
313
- type="filepath"
314
- )
315
- status_output = gr.Textbox(
316
- label="Processing Status & Details",
317
- lines=6,
318
- interactive=False
319
- )
320
 
321
  # Examples Section
322
- with gr.Accordion("💡 Example Texts for Testing", open=False):
323
  examples = [
324
- "Hello, this is a demonstration of real voice cloning technology using Chatterbox.",
325
  "The weather is beautiful today, perfect for a walk in the park with friends.",
326
- "Artificial intelligence is revolutionizing how we create and interact with digital content.",
327
  "This advanced voice cloning system can generate natural speech in multiple languages."
328
  ]
329
 
330
  gr.Examples(
331
  examples=examples,
332
  inputs=text_input,
333
- label="Click to try these example texts:"
334
  )
335
 
336
- # How It Works Section
337
- with gr.Accordion("🔍 How Voice Cloning Works", open=False):
338
- gr.Markdown("""
339
- ### Voice-to-Voice Cloning Process
340
- 1. **🎤 Upload Reference Voice**: The voice you want to clone (5+ seconds)
341
- 2. **📥 Upload Input Audio**: Audio content you want to transform
342
- 3. **🧠 Content Extraction**: AI extracts speech content from input audio
343
- 4. **🎭 Voice Application**: Reference voice characteristics applied to content
344
- 5. **🎵 Generate Output**: New audio with original content in cloned voice
345
-
346
- ### Text-to-Speech Process
347
- 1. **🎤 Upload Reference Voice**: The voice you want to clone
348
- 2. **📝 Enter Text**: Type the content to convert to speech
349
- 3. **🎛️ Adjust Controls**: Set emotion and speech parameters
350
- 4. **🎵 Generate Speech**: Create natural speech in the cloned voice
351
-
352
- ### Chatterbox Controls
353
- - **Emotion Exaggeration**: 0.0 = monotone, 2.0 = very expressive
354
- - **CFG Scale**: 0.2 = creative, 1.0 = accurate to reference
355
- - **Language Support**: 23+ languages with multilingual model
356
- """)
357
-
358
- # Event Handlers
359
  voice_clone_btn.click(
360
  fn=voice_to_voice_cloning,
361
  inputs=[reference_audio, input_audio, voice_language, voice_exaggeration, voice_cfg],
362
- outputs=[audio_output, status_output],
363
  show_progress=True
364
  )
365
 
366
  text_clone_btn.click(
367
  fn=text_to_voice_cloning,
368
  inputs=[reference_audio, text_input, text_language, text_exaggeration, text_cfg],
369
- outputs=[audio_output, status_output],
370
  show_progress=True
371
  )
372
 
 
19
 
20
  print(f"🚀 Running on device: {DEVICE}")
21
 
22
+ # Global model variables
23
  ENGLISH_MODEL = None
24
  MULTILINGUAL_MODEL = None
25
 
26
  def load_chatterbox_models():
27
+ """Load Chatterbox models"""
28
  global ENGLISH_MODEL, MULTILINGUAL_MODEL
29
 
30
+ try:
31
+ from chatterbox import ChatterboxTTS
32
+ from chatterbox.tts import ChatterboxMultilingualTTS
33
+
34
+ print("🔄 Loading Chatterbox models...")
35
+ ENGLISH_MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
36
+ MULTILINGUAL_MODEL = ChatterboxMultilingualTTS.from_pretrained(device=DEVICE)
37
+ print("✅ Models loaded successfully!")
38
+ return True
39
+ except Exception as e:
40
+ print(f"❌ Failed to load Chatterbox models: {e}")
41
+ return False
 
 
 
 
 
 
 
 
42
 
43
  def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
44
  """
45
+ 🎤 VOICE-TO-VOICE CLONING FUNCTION
46
+ Takes input audio content and transforms it using reference voice
47
  """
48
  try:
49
  if not reference_audio:
 
52
  if not input_audio:
53
  return None, "❌ Please upload input audio (content to transform)!"
54
 
55
+ print("🔄 Starting Voice-to-Voice cloning...")
 
56
 
57
+ # Step 1: Extract text from input audio using Whisper
58
  try:
59
  import whisper
60
+ print("🎤 Transcribing input audio...")
61
  whisper_model = whisper.load_model("base")
62
  result = whisper_model.transcribe(input_audio)
63
  extracted_text = result["text"]
64
+ print(f"📝 Extracted text: {extracted_text}")
65
  except Exception as e:
66
+ print(f"⚠️ Whisper failed: {e}")
67
+ extracted_text = "Voice cloning demonstration using uploaded audio content."
68
+
69
+ # Step 2: Load Chatterbox models if not loaded
70
+ if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
71
+ if not load_chatterbox_models():
72
+ return None, "❌ Chatterbox models failed to load!"
73
 
74
+ # Step 3: Generate voice using Chatterbox
75
+ print("🎭 Generating cloned voice...")
76
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
77
  output_path = tmp_file.name
78
 
 
95
  cfg=cfg
96
  )
97
 
98
+ # Step 4: Save generated audio
99
  torchaudio.save(output_path, wav.cpu(), model.sr)
100
 
101
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
102
+ return output_path, f"✅ Voice-to-Voice Cloning Complete!\n🎤 Transformed audio content: '{extracted_text[:100]}...'\n🎛️ Settings: Emotion={exaggeration}, CFG={cfg}\n📊 Language: {language}"
103
  else:
104
  return None, "❌ Generated audio file is empty!"
105
 
106
  except Exception as e:
107
  return None, f"❌ Voice-to-Voice cloning error: {str(e)}"
108
 
109
+ def text_to_voice_cloning(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5):
110
  """
111
+ 📝 TEXT-TO-VOICE CLONING FUNCTION
112
+ Generates speech from text using reference voice
113
  """
114
  try:
115
  if not reference_audio:
 
118
  if not input_text or not input_text.strip():
119
  return None, "❌ Please enter text to convert!"
120
 
121
+ print("🔄 Starting Text-to-Voice cloning...")
122
+ print(f"📝 Text to convert: {input_text}")
123
 
124
+ # Load Chatterbox models if not loaded
125
+ if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
126
+ if not load_chatterbox_models():
127
+ return None, "❌ Chatterbox models failed to load!"
128
 
129
+ # Generate speech using Chatterbox
130
+ print("🎭 Generating speech...")
131
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
132
  output_path = tmp_file.name
133
 
 
154
  torchaudio.save(output_path, wav.cpu(), model.sr)
155
 
156
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
157
+ return output_path, f"✅ Text-to-Voice Complete!\n📝 Generated speech: '{input_text[:100]}...'\n🎛️ Settings: Emotion={exaggeration}, CFG={cfg}\n📊 Language: {language}"
158
  else:
159
  return None, "❌ Generated audio file is empty!"
160
 
161
  except Exception as e:
162
+ return None, f"❌ Text-to-Voice error: {str(e)}"
163
 
164
  # Try to load models at startup
165
  try:
166
  models_loaded = load_chatterbox_models()
167
+ startup_message = "✅ Chatterbox Models Ready!" if models_loaded else "⚠️ Models will load on first use"
168
  except Exception as e:
169
  models_loaded = False
170
+ startup_message = f"⚠️ Model loading will be attempted on first use: {str(e)}"
171
 
172
+ # Create Gradio interface with tabs
173
  with gr.Blocks(
174
+ title="🎭 Complete Voice Cloning Studio",
175
  theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
176
  ) as demo:
177
 
178
  # Header
179
  gr.HTML("""
180
  <div style="text-align: center; padding: 20px;">
181
+ <h1 style="color: #8B5CF6; margin-bottom: 10px;">🎭 Complete Voice Cloning Studio</h1>
182
+ <p style="color: #666; font-size: 18px;">Voice-to-Voice & Text-to-Speech with Chatterbox AI</p>
183
+ <p style="color: #888; font-size: 14px;">Both functionalities included - Choose your input method below</p>
184
  </div>
185
  """)
186
 
187
  # Model Status
 
188
  gr.HTML(f"""
189
+ <div style="text-align: center; padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 20px;">
190
  <strong>🤖 Chatterbox Status:</strong> {startup_message}
191
  </div>
192
  """)
193
 
194
+ # Reference Voice (shared across both tabs)
195
+ gr.HTML("<h3 style='color: #8B5CF6; text-align: center;'>🎤 Reference Voice (Voice to Clone)</h3>")
196
+ reference_audio = gr.Audio(
197
+ label="Upload Reference Audio (5+ seconds of clear speech)",
198
+ type="filepath",
199
+ sources=["upload", "microphone"]
200
+ )
201
+ gr.HTML("<p style='color: #666; text-align: center; margin-bottom: 20px;'>📌 This voice will be cloned and applied to your content</p>")
 
 
202
 
203
+ # Tabs for different input methods
204
  with gr.Tabs():
205
+ # TAB 1: VOICE-TO-VOICE CLONING
206
  with gr.TabItem("🎵 Voice-to-Voice Cloning"):
207
+ gr.HTML("""
208
+ <div style="padding: 15px; background: #f0f8ff; border-radius: 10px; margin-bottom: 15px;">
209
+ <h4 style="color: #4169E1; margin-bottom: 10px;">🎤 Voice-to-Voice Process:</h4>
210
+ <p style="margin: 0;">1. Upload reference voice (person to clone)<br>
211
+ 2. Upload input audio (content to transform)<br>
212
+ 3. AI extracts speech content from input<br>
213
+ 4. Reference voice applied to extracted content</p>
214
+ </div>
215
+ """)
216
 
217
  with gr.Row():
218
  with gr.Column():
 
237
  ("🇷🇺 Russian", "ru")
238
  ],
239
  value="en",
240
+ label="Output Language"
241
  )
242
 
243
  voice_exaggeration = gr.Slider(
 
249
  )
250
 
251
  voice_cfg = gr.Slider(
252
+ minimum=0.1,
253
  maximum=1.0,
254
  step=0.1,
255
  value=0.5,
256
+ label="🎛️ CFG Scale (Accuracy)"
257
  )
258
 
259
  voice_clone_btn = gr.Button(
 
261
  variant="primary",
262
  size="lg"
263
  )
264
+
265
+ with gr.Column():
266
+ voice_output_audio = gr.Audio(
267
+ label="Voice-to-Voice Result",
268
+ type="filepath"
269
+ )
270
+
271
+ voice_status = gr.Textbox(
272
+ label="Voice-to-Voice Status",
273
+ lines=6,
274
+ interactive=False
275
+ )
276
 
277
+ # TAB 2: TEXT-TO-VOICE CLONING
278
  with gr.TabItem("📝 Text-to-Speech Cloning"):
279
+ gr.HTML("""
280
+ <div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
281
+ <h4 style="color: #228B22; margin-bottom: 10px;">📝 Text-to-Speech Process:</h4>
282
+ <p style="margin: 0;">1. Upload reference voice (person to clone)<br>
283
+ 2. Enter text to convert to speech<br>
284
+ 3. AI generates speech in cloned voice<br>
285
+ 4. Download high-quality audio result</p>
286
+ </div>
287
+ """)
288
 
289
  with gr.Row():
290
  with gr.Column():
291
  text_input = gr.Textbox(
292
  label="Text to Convert to Speech",
293
  placeholder="Enter the text you want to speak in the cloned voice...",
294
+ lines=5,
295
  max_lines=8
296
  )
297
 
 
308
  ("🇯🇵 Japanese", "ja")
309
  ],
310
  value="en",
311
+ label="Speech Language"
312
  )
313
 
314
  text_exaggeration = gr.Slider(
 
320
  )
321
 
322
  text_cfg = gr.Slider(
323
+ minimum=0.1,
324
  maximum=1.0,
325
  step=0.1,
326
  value=0.5,
327
+ label="🎛️ CFG Scale (Accuracy)"
328
  )
329
 
330
  text_clone_btn = gr.Button(
 
332
  variant="secondary",
333
  size="lg"
334
  )
335
+
336
+ with gr.Column():
337
+ text_output_audio = gr.Audio(
338
+ label="Text-to-Speech Result",
339
+ type="filepath"
340
+ )
341
+
342
+ text_status = gr.Textbox(
343
+ label="Text-to-Speech Status",
344
+ lines=6,
345
+ interactive=False
346
+ )
 
347
 
348
  # Examples Section
349
+ with gr.Accordion("💡 Example Texts", open=False):
350
  examples = [
351
+ "Hello, this is a demonstration of AI voice cloning technology using Chatterbox.",
352
  "The weather is beautiful today, perfect for a walk in the park with friends.",
353
+ "Artificial intelligence is revolutionizing the way we create and share content.",
354
  "This advanced voice cloning system can generate natural speech in multiple languages."
355
  ]
356
 
357
  gr.Examples(
358
  examples=examples,
359
  inputs=text_input,
360
+ label="Click to use these example texts:"
361
  )
362
 
363
+ # Event Handlers - BOTH FUNCTIONS CONNECTED
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  voice_clone_btn.click(
365
  fn=voice_to_voice_cloning,
366
  inputs=[reference_audio, input_audio, voice_language, voice_exaggeration, voice_cfg],
367
+ outputs=[voice_output_audio, voice_status],
368
  show_progress=True
369
  )
370
 
371
  text_clone_btn.click(
372
  fn=text_to_voice_cloning,
373
  inputs=[reference_audio, text_input, text_language, text_exaggeration, text_cfg],
374
+ outputs=[text_output_audio, text_status],
375
  show_progress=True
376
  )
377