crackuser committed on
Commit
5280410
·
verified ·
1 Parent(s): 30be8ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +283 -148
app.py CHANGED
@@ -4,82 +4,113 @@ import torchaudio
4
  import tempfile
5
  import os
6
  import logging
7
- import traceback
8
 
9
  # Setup logging
10
  logging.basicConfig(level=logging.INFO)
11
  logger = logging.getLogger(__name__)
12
 
13
- # Device detection with proper fallback
14
  DEVICE = "cpu"
15
  if torch.cuda.is_available():
16
  DEVICE = "cuda"
17
  logger.info("๐Ÿš€ Running on CUDA GPU")
18
- elif torch.backends.mps.is_available():
19
- DEVICE = "cpu" # Force CPU for MPS compatibility
20
- logger.info("๐ŸŽ Apple Silicon detected - using CPU mode for Chatterbox-TTS compatibility")
21
  else:
22
  logger.info("๐Ÿš€ Running on CPU")
23
 
24
  print(f"๐Ÿš€ Running on device: {DEVICE}")
25
 
26
- # Patch torch.load to handle device mapping issues
27
- original_torch_load = torch.load
 
28
 
29
- def patched_torch_load(f, map_location=None, **kwargs):
30
- """Patched torch.load that automatically maps CUDA tensors to CPU/MPS"""
31
- if map_location is None:
32
- map_location = 'cpu' # Default to CPU for compatibility
33
- logger.info(f"๐Ÿ”ง Loading with map_location={map_location}")
34
- return original_torch_load(f, map_location=map_location, **kwargs)
35
-
36
- # Apply the patch
37
- torch.load = patched_torch_load
38
-
39
- # Global model variable
40
- MODEL = None
41
-
42
- def get_or_load_model():
43
- """Loads the ChatterboxTTS model with proper error handling"""
44
- global MODEL
45
- if MODEL is None:
46
- print("๐Ÿ”„ Model not loaded, initializing...")
47
  try:
48
- # Try different import paths for chatterbox
49
- try:
50
- from chatterbox import ChatterboxTTS
51
- MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
52
- print("โœ… Loaded with 'from chatterbox import ChatterboxTTS'")
53
- except ImportError:
54
- try:
55
- from chatterbox.tts import ChatterboxTTS
56
- MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
57
- print("โœ… Loaded with 'from chatterbox.tts import ChatterboxTTS'")
58
- except ImportError:
59
- try:
60
- from chatterbox.src.chatterbox.tts import ChatterboxTTS
61
- MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
62
- print("โœ… Loaded with 'from chatterbox.src.chatterbox.tts import ChatterboxTTS'")
63
- except ImportError as e:
64
- print(f"โŒ All Chatterbox import paths failed: {e}")
65
- return None
66
 
67
- # Ensure model is on correct device
68
- if hasattr(MODEL, 'to') and str(getattr(MODEL, 'device', 'unknown')) != DEVICE:
69
- MODEL = MODEL.to(DEVICE)
70
 
71
- print(f"โœ… Model loaded successfully on device: {getattr(MODEL, 'device', 'N/A')}")
72
- return MODEL
 
 
 
73
 
74
  except Exception as e:
75
- print(f"โŒ Error loading Chatterbox model: {e}")
76
- print(f"๐Ÿ” Full traceback: {traceback.format_exc()}")
77
- return None
78
 
79
- return MODEL
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
- def simple_voice_clone(reference_audio, input_text):
82
- """Simplified voice cloning function with better error handling"""
 
 
83
  try:
84
  if not reference_audio:
85
  return None, "โŒ Please upload reference audio!"
@@ -87,10 +118,8 @@ def simple_voice_clone(reference_audio, input_text):
87
  if not input_text or not input_text.strip():
88
  return None, "โŒ Please enter text to convert!"
89
 
90
- # Try to load model
91
- model = get_or_load_model()
92
- if model is None:
93
- return None, "โŒ Chatterbox model failed to load! Check logs for details."
94
 
95
  print(f"๐ŸŽค Generating speech with Chatterbox...")
96
  print(f"๐Ÿ“ Text: {input_text[:100]}...")
@@ -99,61 +128,60 @@ def simple_voice_clone(reference_audio, input_text):
99
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
100
  output_path = tmp_file.name
101
 
102
- # Generate speech using Chatterbox
103
- try:
 
104
  wav = model.generate(
105
  input_text,
106
  audio_prompt_path=reference_audio,
107
- exaggeration=0.5,
108
- cfg=0.5
109
  )
110
-
111
- # Save generated audio
112
- torchaudio.save(output_path, wav.cpu(), model.sr)
113
-
114
- if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
115
- return output_path, f"โœ… Chatterbox Voice Cloning Complete!\n๐Ÿ“ Generated: '{input_text[:100]}...'"
116
- else:
117
- return None, "โŒ Generated audio file is empty!"
118
-
119
- except Exception as gen_error:
120
- print(f"โŒ Generation error: {gen_error}")
121
- return None, f"โŒ Generation failed: {str(gen_error)}"
 
 
 
 
 
122
 
123
  except Exception as e:
124
- print(f"โŒ Voice cloning error: {e}")
125
- return None, f"โŒ Error: {str(e)}"
126
 
127
- # Attempt to load model at startup with better error reporting
128
  try:
129
- startup_model = get_or_load_model()
130
- if startup_model is not None:
131
- models_loaded = True
132
- startup_message = "โœ… Chatterbox Models Loaded Successfully!"
133
- else:
134
- models_loaded = False
135
- startup_message = "โŒ Failed to Load Chatterbox Models - Check Dependencies"
136
- except Exception as startup_error:
137
  models_loaded = False
138
- startup_message = f"โŒ Startup Error: {str(startup_error)}"
139
- print(f"CRITICAL: {startup_message}")
140
 
141
  # Create Gradio interface
142
  with gr.Blocks(
143
- title="๐ŸŽญ Chatterbox Voice Cloning Studio",
144
  theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
145
  ) as demo:
146
 
147
  # Header
148
  gr.HTML("""
149
  <div style="text-align: center; padding: 20px;">
150
- <h1 style="color: #8B5CF6; margin-bottom: 10px;">๐ŸŽญ Chatterbox Voice Cloning Studio</h1>
151
- <p style="color: #666; font-size: 18px;">Powered by Resemble AI's Chatterbox Model</p>
152
- <p style="color: #888; font-size: 14px;">Fixed version with proper device handling</p>
153
  </div>
154
  """)
155
 
156
- # Model Status Display
157
  status_color = "#d4edda" if models_loaded else "#f8d7da"
158
  gr.HTML(f"""
159
  <div style="text-align: center; padding: 15px; background: {status_color}; border-radius: 10px; margin-bottom: 20px;">
@@ -163,74 +191,181 @@ with gr.Blocks(
163
 
164
  with gr.Row():
165
  with gr.Column():
166
- # Reference Voice
167
- gr.HTML("<h3 style='color: #8B5CF6;'>๐ŸŽค Reference Voice</h3>")
168
  reference_audio = gr.Audio(
169
  label="Upload Reference Audio (5+ seconds)",
170
  type="filepath",
171
  sources=["upload", "microphone"]
172
  )
 
 
 
 
 
 
 
173
 
174
- # Text Input
175
- gr.HTML("<h3 style='color: #8B5CF6;'>๐Ÿ“ Text to Convert</h3>")
176
- text_input = gr.Textbox(
177
- label="Enter Text",
178
- placeholder="Enter the text you want to speak in the cloned voice...",
179
- lines=4,
180
- max_lines=8
181
- )
182
-
183
- # Generate Button
184
- generate_btn = gr.Button(
185
- "๐ŸŽค Generate Voice Clone",
186
- variant="primary",
187
- size="lg"
188
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
- with gr.Column():
191
- # Output
192
- gr.HTML("<h3 style='color: #8B5CF6;'>๐ŸŽต Generated Audio</h3>")
193
- audio_output = gr.Audio(
194
- label="Cloned Voice Result",
195
- type="filepath"
196
- )
197
 
198
- status_output = gr.Textbox(
199
- label="Status & Logs",
200
- lines=6,
201
- interactive=False
202
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
- # Troubleshooting Info
205
- with gr.Accordion("๐Ÿ”ง Troubleshooting", open=False):
206
  gr.Markdown("""
207
- ### Common Issues & Solutions
208
-
209
- **โŒ "Models Not Loaded" Error:**
210
- - Check that `chatterbox-tts` is installed: `pip install chatterbox-tts`
211
- - Verify internet connection for model download
212
- - Try restarting the space if models fail to load
213
-
214
- **๐Ÿ”ง Device Issues:**
215
- - This version forces CPU mode for compatibility
216
- - CUDA tensors are automatically mapped to CPU
217
- - Apple Silicon (MPS) falls back to CPU mode
218
-
219
- **๐Ÿ“ฆ Dependencies:**
220
- - Ensure all requirements are installed correctly
221
- - Check logs for specific import errors
222
- - Model downloads may take several minutes on first run
223
-
224
- **๐ŸŽค Audio Issues:**
225
- - Use clear, high-quality reference audio (5+ seconds)
226
- - Supported formats: WAV, MP3, FLAC, M4A
227
- - Avoid background noise in reference audio
228
  """)
229
 
230
- # Event handler
231
- generate_btn.click(
232
- fn=simple_voice_clone,
233
- inputs=[reference_audio, text_input],
 
 
 
 
 
 
 
234
  outputs=[audio_output, status_output],
235
  show_progress=True
236
  )
 
4
  import tempfile
5
  import os
6
  import logging
 
7
 
8
  # Setup logging
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
+ # Device detection
13
  DEVICE = "cpu"
14
  if torch.cuda.is_available():
15
  DEVICE = "cuda"
16
  logger.info("๐Ÿš€ Running on CUDA GPU")
 
 
 
17
  else:
18
  logger.info("๐Ÿš€ Running on CPU")
19
 
20
  print(f"๐Ÿš€ Running on device: {DEVICE}")
21
 
22
+ # Global models
23
+ ENGLISH_MODEL = None
24
+ MULTILINGUAL_MODEL = None
25
 
26
def load_chatterbox_models():
    """Load the Chatterbox English and multilingual models if needed.

    Each model is stored in its module-level global (``ENGLISH_MODEL`` /
    ``MULTILINGUAL_MODEL``) and is only loaded when that global is still
    ``None``. A model that loaded successfully is therefore never loaded a
    second time, even if the other model failed on an earlier attempt.

    Returns:
        bool: ``True`` when both models are available, ``False`` if importing
        or loading either of them raised an exception (the error is printed).
    """
    global ENGLISH_MODEL, MULTILINGUAL_MODEL

    # Fast path: nothing to do once both models are in memory.
    if ENGLISH_MODEL is not None and MULTILINGUAL_MODEL is not None:
        return True

    try:
        if ENGLISH_MODEL is None:
            # Imported lazily so a missing package is reported as a load
            # failure instead of crashing the module at import time.
            from chatterbox.tts import ChatterboxTTS

            print("🔄 Loading Chatterbox English model...")
            ENGLISH_MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
            print("✅ English model loaded!")

        if MULTILINGUAL_MODEL is None:
            from chatterbox.mtl_tts import ChatterboxMultilingualTTS

            print("🔄 Loading Chatterbox Multilingual model...")
            MULTILINGUAL_MODEL = ChatterboxMultilingualTTS.from_pretrained(device=DEVICE)
            print("✅ Multilingual model loaded!")

    except Exception as e:
        # Top-level boundary for the UI: report the failure and let the
        # caller show an error message rather than propagating.
        print(f"❌ Error loading Chatterbox models: {e}")
        return False

    return True
50
+
51
def _discard_file(path):
    """Best-effort removal of a temporary file; ``None`` and missing files are ignored."""
    if not path:
        return
    try:
        os.remove(path)
    except OSError:
        pass


def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
    """
    Voice-to-Voice Cloning: re-speak the content of the input audio in the
    reference voice.

    The input audio is transcribed with Whisper, and the transcript is then
    synthesised by Chatterbox using ``reference_audio`` as the voice prompt.
    If Whisper cannot be imported or fails, a fixed placeholder sentence is
    synthesised instead.

    Args:
        reference_audio: File path of the voice to clone.
        input_audio: File path of the audio whose spoken content is re-generated.
        language: Language code; ``"en"`` uses the English model, anything
            else uses the multilingual model with this value as ``language_id``.
        exaggeration: Emotion exaggeration passed through to ``generate``.
        cfg: CFG scale, forwarded to Chatterbox as ``cfg_weight``.

    Returns:
        tuple: ``(output_path, status_message)``. ``output_path`` is ``None``
        whenever validation, model loading, or generation fails.
    """
    output_path = None
    try:
        if not reference_audio:
            return None, "❌ Please upload reference audio (voice to clone)!"

        if not input_audio:
            return None, "❌ Please upload input audio (content to transform)!"

        if not load_chatterbox_models():
            return None, "❌ Chatterbox models failed to load!"

        # Extract text from input audio using Whisper (for content)
        try:
            import whisper
            whisper_model = whisper.load_model("base")
            result = whisper_model.transcribe(input_audio)
            extracted_text = result["text"]
            print(f"📝 Extracted text from input audio: {extracted_text}")
        except Exception as e:
            # Deliberate best-effort: keep the demo usable without Whisper.
            print(f"⚠️ Whisper transcription failed: {e}")
            extracted_text = "Voice cloning demonstration using the uploaded audio content."

        # Silence or noise yields an empty transcript; there is nothing to speak.
        extracted_text = extracted_text.strip()
        if not extracted_text:
            return None, "❌ No speech could be detected in the input audio!"

        # Chatterbox's generate() names the CFG parameter `cfg_weight`.
        generate_kwargs = {
            "audio_prompt_path": reference_audio,
            "exaggeration": exaggeration,
            "cfg_weight": cfg,
        }

        # Use appropriate model based on language
        if language == "en":
            model = ENGLISH_MODEL
        else:
            model = MULTILINGUAL_MODEL
            generate_kwargs["language_id"] = language

        wav = model.generate(extracted_text, **generate_kwargs)

        # Create the output file only after generation succeeded so failed
        # runs do not leave empty temp files behind.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        # Save generated audio
        torchaudio.save(output_path, wav.cpu(), model.sr)

        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"✅ Voice-to-Voice Cloning Complete!\n🎤 Reference voice applied to: '{extracted_text[:100]}...'\n🎛️ Settings: Exaggeration={exaggeration}, CFG={cfg}"

        _discard_file(output_path)
        return None, "❌ Generated audio file is empty!"

    except Exception as e:
        _discard_file(output_path)
        return None, f"❌ Voice-to-Voice cloning error: {str(e)}"
109
 
110
+ def text_to_voice_cloning(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5, speed=1.0):
111
+ """
112
+ Text-to-Voice Cloning: Generate speech from text using reference voice
113
+ """
114
  try:
115
  if not reference_audio:
116
  return None, "โŒ Please upload reference audio!"
 
118
  if not input_text or not input_text.strip():
119
  return None, "โŒ Please enter text to convert!"
120
 
121
+ if not load_chatterbox_models():
122
+ return None, "โŒ Chatterbox models failed to load!"
 
 
123
 
124
  print(f"๐ŸŽค Generating speech with Chatterbox...")
125
  print(f"๐Ÿ“ Text: {input_text[:100]}...")
 
128
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
129
  output_path = tmp_file.name
130
 
131
+ # Use appropriate model based on language
132
+ if language == "en":
133
+ model = ENGLISH_MODEL
134
  wav = model.generate(
135
  input_text,
136
  audio_prompt_path=reference_audio,
137
+ exaggeration=exaggeration,
138
+ cfg=cfg
139
  )
140
+ else:
141
+ model = MULTILINGUAL_MODEL
142
+ wav = model.generate(
143
+ input_text,
144
+ audio_prompt_path=reference_audio,
145
+ language_id=language,
146
+ exaggeration=exaggeration,
147
+ cfg=cfg
148
+ )
149
+
150
+ # Save generated audio
151
+ torchaudio.save(output_path, wav.cpu(), model.sr)
152
+
153
+ if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
154
+ return output_path, f"โœ… Text-to-Voice Cloning Complete!\n๐Ÿ“ Generated: '{input_text[:100]}...'\n๐ŸŽ›๏ธ Settings: Exaggeration={exaggeration}, CFG={cfg}"
155
+ else:
156
+ return None, "โŒ Generated audio file is empty!"
157
 
158
  except Exception as e:
159
+ return None, f"โŒ Text-to-Voice cloning error: {str(e)}"
 
160
 
161
+ # Try to load models at startup
162
  try:
163
+ models_loaded = load_chatterbox_models()
164
+ startup_message = "โœ… Chatterbox Models Loaded Successfully!" if models_loaded else "โŒ Failed to Load Chatterbox Models"
165
+ except Exception as e:
 
 
 
 
 
166
  models_loaded = False
167
+ startup_message = f"โŒ Startup Error: {str(e)}"
 
168
 
169
  # Create Gradio interface
170
  with gr.Blocks(
171
+ title="๐ŸŽญ Complete Chatterbox Voice Cloning Studio",
172
  theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
173
  ) as demo:
174
 
175
  # Header
176
  gr.HTML("""
177
  <div style="text-align: center; padding: 20px;">
178
+ <h1 style="color: #8B5CF6; margin-bottom: 10px;">๐ŸŽญ Complete Chatterbox Voice Cloning Studio</h1>
179
+ <p style="color: #666; font-size: 18px;">Voice-to-Voice & Text-to-Speech with Emotion Control</p>
180
+ <p style="color: #888; font-size: 14px;">Powered by Resemble AI's Chatterbox - The Model We Discussed!</p>
181
  </div>
182
  """)
183
 
184
+ # Model Status
185
  status_color = "#d4edda" if models_loaded else "#f8d7da"
186
  gr.HTML(f"""
187
  <div style="text-align: center; padding: 15px; background: {status_color}; border-radius: 10px; margin-bottom: 20px;">
 
191
 
192
  with gr.Row():
193
  with gr.Column():
194
+ # Reference Voice Section
195
+ gr.HTML("<h3 style='color: #8B5CF6;'>๐ŸŽค Reference Voice (Voice to Clone)</h3>")
196
  reference_audio = gr.Audio(
197
  label="Upload Reference Audio (5+ seconds)",
198
  type="filepath",
199
  sources=["upload", "microphone"]
200
  )
201
+ gr.HTML("<p style='color: #666; font-size: 14px;'>๐Ÿ“Œ This is the voice that will be cloned and applied to your content</p>")
202
+
203
+ # Tabs for different input methods
204
+ with gr.Tabs():
205
+ # Tab 1: Voice-to-Voice Cloning
206
+ with gr.TabItem("๐ŸŽต Voice-to-Voice Cloning"):
207
+ gr.HTML("<p style='margin-bottom: 15px;'>Upload audio content and transform it using the reference voice</p>")
208
 
209
+ with gr.Row():
210
+ with gr.Column():
211
+ input_audio = gr.Audio(
212
+ label="Input Audio (Content to Transform)",
213
+ type="filepath",
214
+ sources=["upload", "microphone"]
215
+ )
216
+
217
+ with gr.Row():
218
+ voice_language = gr.Dropdown(
219
+ choices=[
220
+ ("๐Ÿ‡บ๐Ÿ‡ธ English", "en"),
221
+ ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
222
+ ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
223
+ ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
224
+ ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
225
+ ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
226
+ ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
227
+ ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja"),
228
+ ("๐Ÿ‡ฐ๐Ÿ‡ท Korean", "ko"),
229
+ ("๐Ÿ‡ท๐Ÿ‡บ Russian", "ru")
230
+ ],
231
+ value="en",
232
+ label="Language"
233
+ )
234
+
235
+ voice_exaggeration = gr.Slider(
236
+ minimum=0.0,
237
+ maximum=2.0,
238
+ step=0.1,
239
+ value=0.5,
240
+ label="๐ŸŽญ Emotion Exaggeration"
241
+ )
242
+
243
+ voice_cfg = gr.Slider(
244
+ minimum=0.2,
245
+ maximum=1.0,
246
+ step=0.1,
247
+ value=0.5,
248
+ label="๐ŸŽ›๏ธ CFG Scale"
249
+ )
250
+
251
+ voice_clone_btn = gr.Button(
252
+ "๐ŸŽค Transform Voice (Audio โ†’ Cloned Audio)",
253
+ variant="primary",
254
+ size="lg"
255
+ )
256
 
257
+ # Tab 2: Text-to-Voice Cloning
258
+ with gr.TabItem("๐Ÿ“ Text-to-Speech Cloning"):
259
+ gr.HTML("<p style='margin-bottom: 15px;'>Enter text and generate speech using the reference voice</p>")
 
 
 
 
260
 
261
+ with gr.Row():
262
+ with gr.Column():
263
+ text_input = gr.Textbox(
264
+ label="Text to Convert to Speech",
265
+ placeholder="Enter the text you want to speak in the cloned voice...",
266
+ lines=4,
267
+ max_lines=8
268
+ )
269
+
270
+ with gr.Row():
271
+ text_language = gr.Dropdown(
272
+ choices=[
273
+ ("๐Ÿ‡บ๐Ÿ‡ธ English", "en"),
274
+ ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
275
+ ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
276
+ ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
277
+ ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
278
+ ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
279
+ ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
280
+ ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja")
281
+ ],
282
+ value="en",
283
+ label="Language"
284
+ )
285
+
286
+ text_exaggeration = gr.Slider(
287
+ minimum=0.0,
288
+ maximum=2.0,
289
+ step=0.1,
290
+ value=0.5,
291
+ label="๐ŸŽญ Emotion Exaggeration"
292
+ )
293
+
294
+ text_cfg = gr.Slider(
295
+ minimum=0.2,
296
+ maximum=1.0,
297
+ step=0.1,
298
+ value=0.5,
299
+ label="๐ŸŽ›๏ธ CFG Scale"
300
+ )
301
+
302
+ text_clone_btn = gr.Button(
303
+ "๐Ÿ“ Generate Speech (Text โ†’ Cloned Audio)",
304
+ variant="secondary",
305
+ size="lg"
306
+ )
307
+
308
+ # Output Section
309
+ gr.HTML("<h3 style='color: #8B5CF6;'>๐ŸŽต Generated Audio Output</h3>")
310
+ with gr.Row():
311
+ audio_output = gr.Audio(
312
+ label="Cloned Voice Result",
313
+ type="filepath"
314
+ )
315
+ status_output = gr.Textbox(
316
+ label="Processing Status & Details",
317
+ lines=6,
318
+ interactive=False
319
+ )
320
+
321
+ # Examples Section
322
+ with gr.Accordion("๐Ÿ’ก Example Texts for Testing", open=False):
323
+ examples = [
324
+ "Hello, this is a demonstration of real voice cloning technology using Chatterbox.",
325
+ "The weather is beautiful today, perfect for a walk in the park with friends.",
326
+ "Artificial intelligence is revolutionizing how we create and interact with digital content.",
327
+ "This advanced voice cloning system can generate natural speech in multiple languages."
328
+ ]
329
+
330
+ gr.Examples(
331
+ examples=examples,
332
+ inputs=text_input,
333
+ label="Click to try these example texts:"
334
+ )
335
 
336
+ # How It Works Section
337
+ with gr.Accordion("๐Ÿ” How Voice Cloning Works", open=False):
338
  gr.Markdown("""
339
+ ### Voice-to-Voice Cloning Process
340
+ 1. **๐ŸŽค Upload Reference Voice**: The voice you want to clone (5+ seconds)
341
+ 2. **๐Ÿ“ฅ Upload Input Audio**: Audio content you want to transform
342
+ 3. **๐Ÿง  Content Extraction**: AI extracts speech content from input audio
343
+ 4. **๐ŸŽญ Voice Application**: Reference voice characteristics applied to content
344
+ 5. **๐ŸŽต Generate Output**: New audio with original content in cloned voice
345
+
346
+ ### Text-to-Speech Process
347
+ 1. **๐ŸŽค Upload Reference Voice**: The voice you want to clone
348
+ 2. **๐Ÿ“ Enter Text**: Type the content to convert to speech
349
+ 3. **๐ŸŽ›๏ธ Adjust Controls**: Set emotion and speech parameters
350
+ 4. **๐ŸŽต Generate Speech**: Create natural speech in the cloned voice
351
+
352
+ ### Chatterbox Controls
353
+ - **Emotion Exaggeration**: 0.0 = monotone, 2.0 = very expressive
354
+ - **CFG Scale**: 0.2 = creative, 1.0 = accurate to reference
355
+ - **Language Support**: 23+ languages with multilingual model
 
 
 
 
356
  """)
357
 
358
+ # Event Handlers
359
+ voice_clone_btn.click(
360
+ fn=voice_to_voice_cloning,
361
+ inputs=[reference_audio, input_audio, voice_language, voice_exaggeration, voice_cfg],
362
+ outputs=[audio_output, status_output],
363
+ show_progress=True
364
+ )
365
+
366
+ text_clone_btn.click(
367
+ fn=text_to_voice_cloning,
368
+ inputs=[reference_audio, text_input, text_language, text_exaggeration, text_cfg],
369
  outputs=[audio_output, status_output],
370
  show_progress=True
371
  )