crackuser committed on
Commit
6465ea7
ยท
verified ยท
1 Parent(s): 85f91ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -246
app.py CHANGED
@@ -3,47 +3,49 @@ import torch
3
  import torchaudio
4
  import tempfile
5
  import os
6
- import logging
7
-
8
- # Setup logging
9
- logging.basicConfig(level=logging.INFO)
10
- logger = logging.getLogger(__name__)
11
 
12
  # Device detection
13
- DEVICE = "cpu"
14
- if torch.cuda.is_available():
15
- DEVICE = "cuda"
16
- logger.info("๐Ÿš€ Running on CUDA GPU")
17
- else:
18
- logger.info("๐Ÿš€ Running on CPU")
19
-
20
- print(f"๐Ÿš€ Running on device: {DEVICE}")
21
 
22
- # Global model variables
23
- ENGLISH_MODEL = None
24
- MULTILINGUAL_MODEL = None
25
 
26
- def load_chatterbox_models():
27
- """Load Chatterbox models"""
28
- global ENGLISH_MODEL, MULTILINGUAL_MODEL
29
 
30
- try:
31
- from chatterbox import ChatterboxTTS
32
- from chatterbox.tts import ChatterboxMultilingualTTS
33
-
34
- print("๐Ÿ”„ Loading Chatterbox models...")
35
- ENGLISH_MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
36
- MULTILINGUAL_MODEL = ChatterboxMultilingualTTS.from_pretrained(device=DEVICE)
37
- print("โœ… Models loaded successfully!")
38
- return True
39
- except Exception as e:
40
- print(f"โŒ Failed to load Chatterbox models: {e}")
41
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
44
  """
45
- ๐ŸŽค VOICE-TO-VOICE CLONING FUNCTION
46
- Takes input audio content and transforms it using reference voice
47
  """
48
  try:
49
  if not reference_audio:
@@ -52,64 +54,50 @@ def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggera
52
  if not input_audio:
53
  return None, "โŒ Please upload input audio (content to transform)!"
54
 
55
- print("๐Ÿ”„ Starting Voice-to-Voice cloning...")
 
 
 
 
56
 
57
  # Step 1: Extract text from input audio using Whisper
58
- try:
59
- import whisper
60
- print("๐ŸŽค Transcribing input audio...")
61
- whisper_model = whisper.load_model("base")
62
- result = whisper_model.transcribe(input_audio)
63
  extracted_text = result["text"]
64
- print(f"๐Ÿ“ Extracted text: {extracted_text}")
65
- except Exception as e:
66
- print(f"โš ๏ธ Whisper failed: {e}")
67
  extracted_text = "Voice cloning demonstration using uploaded audio content."
 
68
 
69
- # Step 2: Load Chatterbox models if not loaded
70
- if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
71
- if not load_chatterbox_models():
72
- return None, "โŒ Chatterbox models failed to load!"
73
 
74
- # Step 3: Generate voice using Chatterbox
75
- print("๐ŸŽญ Generating cloned voice...")
76
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
77
  output_path = tmp_file.name
78
 
79
- # Use appropriate model based on language
80
- if language == "en":
81
- model = ENGLISH_MODEL
82
- wav = model.generate(
83
- extracted_text,
84
- audio_prompt_path=reference_audio,
85
- exaggeration=exaggeration,
86
- cfg=cfg
87
- )
88
- else:
89
- model = MULTILINGUAL_MODEL
90
- wav = model.generate(
91
- extracted_text,
92
- audio_prompt_path=reference_audio,
93
- language_id=language,
94
- exaggeration=exaggeration,
95
- cfg=cfg
96
- )
97
-
98
- # Step 4: Save generated audio
99
- torchaudio.save(output_path, wav.cpu(), model.sr)
100
 
 
101
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
102
- return output_path, f"โœ… Voice-to-Voice Cloning Complete!\n๐ŸŽค Transformed audio content: '{extracted_text[:100]}...'\n๐ŸŽ›๏ธ Settings: Emotion={exaggeration}, CFG={cfg}\n๐Ÿ“Š Language: {language}"
103
  else:
104
  return None, "โŒ Generated audio file is empty!"
105
 
106
  except Exception as e:
107
- return None, f"โŒ Voice-to-Voice cloning error: {str(e)}"
 
 
108
 
109
- def text_to_voice_cloning(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5):
110
  """
111
- ๐Ÿ“ TEXT-TO-VOICE CLONING FUNCTION
112
- Generates speech from text using reference voice
113
  """
114
  try:
115
  if not reference_audio:
@@ -118,99 +106,79 @@ def text_to_voice_cloning(reference_audio, input_text, language="en", exaggerati
118
  if not input_text or not input_text.strip():
119
  return None, "โŒ Please enter text to convert!"
120
 
121
- print("๐Ÿ”„ Starting Text-to-Voice cloning...")
122
- print(f"๐Ÿ“ Text to convert: {input_text}")
 
123
 
124
- # Load Chatterbox models if not loaded
125
- if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
126
- if not load_chatterbox_models():
127
- return None, "โŒ Chatterbox models failed to load!"
128
 
129
- # Generate speech using Chatterbox
130
- print("๐ŸŽญ Generating speech...")
131
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
132
  output_path = tmp_file.name
133
 
134
- # Use appropriate model based on language
135
- if language == "en":
136
- model = ENGLISH_MODEL
137
- wav = model.generate(
138
- input_text,
139
- audio_prompt_path=reference_audio,
140
- exaggeration=exaggeration,
141
- cfg=cfg
142
- )
143
- else:
144
- model = MULTILINGUAL_MODEL
145
- wav = model.generate(
146
- input_text,
147
- audio_prompt_path=reference_audio,
148
- language_id=language,
149
- exaggeration=exaggeration,
150
- cfg=cfg
151
- )
152
-
153
- # Save generated audio
154
- torchaudio.save(output_path, wav.cpu(), model.sr)
155
 
 
156
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
157
- return output_path, f"โœ… Text-to-Voice Complete!\n๐Ÿ“ Generated speech: '{input_text[:100]}...'\n๐ŸŽ›๏ธ Settings: Emotion={exaggeration}, CFG={cfg}\n๐Ÿ“Š Language: {language}"
158
  else:
159
  return None, "โŒ Generated audio file is empty!"
160
 
161
  except Exception as e:
162
- return None, f"โŒ Text-to-Voice error: {str(e)}"
 
 
163
 
164
- # Try to load models at startup
165
- try:
166
- models_loaded = load_chatterbox_models()
167
- startup_message = "โœ… Chatterbox Models Ready!" if models_loaded else "โš ๏ธ Models will load on first use"
168
- except Exception as e:
169
- models_loaded = False
170
- startup_message = f"โš ๏ธ Model loading will be attempted on first use: {str(e)}"
171
 
172
- # Create Gradio interface with tabs
173
  with gr.Blocks(
174
- title="๐ŸŽญ Complete Voice Cloning Studio",
175
- theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
176
  ) as demo:
177
 
178
- # Header
179
  gr.HTML("""
180
  <div style="text-align: center; padding: 20px;">
181
- <h1 style="color: #8B5CF6; margin-bottom: 10px;">๐ŸŽญ Complete Voice Cloning Studio</h1>
182
- <p style="color: #666; font-size: 18px;">Voice-to-Voice & Text-to-Speech with Chatterbox AI</p>
183
- <p style="color: #888; font-size: 14px;">Both functionalities included - Choose your input method below</p>
184
  </div>
185
  """)
186
 
187
- # Model Status
 
188
  gr.HTML(f"""
189
- <div style="text-align: center; padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 20px;">
190
- <strong>๐Ÿค– Chatterbox Status:</strong> {startup_message}
191
  </div>
192
  """)
193
 
194
- # Reference Voice (shared across both tabs)
195
- gr.HTML("<h3 style='color: #8B5CF6; text-align: center;'>๐ŸŽค Reference Voice (Voice to Clone)</h3>")
196
  reference_audio = gr.Audio(
197
- label="Upload Reference Audio (5+ seconds of clear speech)",
198
  type="filepath",
199
  sources=["upload", "microphone"]
200
  )
201
- gr.HTML("<p style='color: #666; text-align: center; margin-bottom: 20px;'>๐Ÿ“Œ This voice will be cloned and applied to your content</p>")
202
 
203
- # Tabs for different input methods
204
  with gr.Tabs():
205
- # TAB 1: VOICE-TO-VOICE CLONING
206
  with gr.TabItem("๐ŸŽต Voice-to-Voice Cloning"):
207
  gr.HTML("""
208
- <div style="padding: 15px; background: #f0f8ff; border-radius: 10px; margin-bottom: 15px;">
209
- <h4 style="color: #4169E1; margin-bottom: 10px;">๐ŸŽค Voice-to-Voice Process:</h4>
210
- <p style="margin: 0;">1. Upload reference voice (person to clone)<br>
211
- 2. Upload input audio (content to transform)<br>
212
- 3. AI extracts speech content from input<br>
213
- 4. Reference voice applied to extracted content</p>
214
  </div>
215
  """)
216
 
@@ -222,162 +190,109 @@ with gr.Blocks(
222
  sources=["upload", "microphone"]
223
  )
224
 
225
- with gr.Row():
226
- voice_language = gr.Dropdown(
227
- choices=[
228
- ("๐Ÿ‡บ๐Ÿ‡ธ English", "en"),
229
- ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
230
- ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
231
- ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
232
- ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
233
- ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
234
- ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
235
- ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja"),
236
- ("๐Ÿ‡ฐ๐Ÿ‡ท Korean", "ko"),
237
- ("๐Ÿ‡ท๐Ÿ‡บ Russian", "ru")
238
- ],
239
- value="en",
240
- label="Output Language"
241
- )
242
-
243
- voice_exaggeration = gr.Slider(
244
- minimum=0.0,
245
- maximum=2.0,
246
- step=0.1,
247
- value=0.5,
248
- label="๐ŸŽญ Emotion Exaggeration"
249
- )
250
-
251
- voice_cfg = gr.Slider(
252
- minimum=0.1,
253
- maximum=1.0,
254
- step=0.1,
255
- value=0.5,
256
- label="๐ŸŽ›๏ธ CFG Scale (Accuracy)"
257
- )
258
 
259
- voice_clone_btn = gr.Button(
260
  "๐ŸŽค Transform Voice (Audio โ†’ Cloned Audio)",
261
  variant="primary",
262
  size="lg"
263
  )
264
 
265
  with gr.Column():
266
- voice_output_audio = gr.Audio(
267
- label="Voice-to-Voice Result",
268
- type="filepath"
269
- )
270
-
271
  voice_status = gr.Textbox(
272
  label="Voice-to-Voice Status",
273
  lines=6,
274
  interactive=False
275
  )
276
 
277
- # TAB 2: TEXT-TO-VOICE CLONING
278
  with gr.TabItem("๐Ÿ“ Text-to-Speech Cloning"):
279
  gr.HTML("""
280
  <div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
281
- <h4 style="color: #228B22; margin-bottom: 10px;">๐Ÿ“ Text-to-Speech Process:</h4>
282
- <p style="margin: 0;">1. Upload reference voice (person to clone)<br>
283
- 2. Enter text to convert to speech<br>
284
- 3. AI generates speech in cloned voice<br>
285
- 4. Download high-quality audio result</p>
286
  </div>
287
  """)
288
 
289
  with gr.Row():
290
  with gr.Column():
291
  text_input = gr.Textbox(
292
- label="Text to Convert to Speech",
293
- placeholder="Enter the text you want to speak in the cloned voice...",
294
- lines=5,
295
- max_lines=8
296
  )
297
 
298
- with gr.Row():
299
- text_language = gr.Dropdown(
300
- choices=[
301
- ("๐Ÿ‡บ๐Ÿ‡ธ English", "en"),
302
- ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
303
- ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
304
- ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
305
- ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
306
- ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
307
- ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
308
- ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja")
309
- ],
310
- value="en",
311
- label="Speech Language"
312
- )
313
-
314
- text_exaggeration = gr.Slider(
315
- minimum=0.0,
316
- maximum=2.0,
317
- step=0.1,
318
- value=0.5,
319
- label="๐ŸŽญ Emotion Exaggeration"
320
- )
321
-
322
- text_cfg = gr.Slider(
323
- minimum=0.1,
324
- maximum=1.0,
325
- step=0.1,
326
- value=0.5,
327
- label="๐ŸŽ›๏ธ CFG Scale (Accuracy)"
328
- )
329
 
330
- text_clone_btn = gr.Button(
331
  "๐Ÿ“ Generate Speech (Text โ†’ Cloned Audio)",
332
  variant="secondary",
333
  size="lg"
334
  )
335
 
336
  with gr.Column():
337
- text_output_audio = gr.Audio(
338
- label="Text-to-Speech Result",
339
- type="filepath"
340
- )
341
-
342
  text_status = gr.Textbox(
343
  label="Text-to-Speech Status",
344
  lines=6,
345
  interactive=False
346
  )
347
 
348
- # Examples Section
349
  with gr.Accordion("๐Ÿ’ก Example Texts", open=False):
350
  examples = [
351
- "Hello, this is a demonstration of AI voice cloning technology using Chatterbox.",
352
- "The weather is beautiful today, perfect for a walk in the park with friends.",
353
- "Artificial intelligence is revolutionizing the way we create and share content.",
354
- "This advanced voice cloning system can generate natural speech in multiple languages."
355
  ]
356
-
357
- gr.Examples(
358
- examples=examples,
359
- inputs=text_input,
360
- label="Click to use these example texts:"
361
- )
362
 
363
- # Event Handlers - BOTH FUNCTIONS CONNECTED
364
- voice_clone_btn.click(
365
- fn=voice_to_voice_cloning,
366
- inputs=[reference_audio, input_audio, voice_language, voice_exaggeration, voice_cfg],
367
- outputs=[voice_output_audio, voice_status],
368
  show_progress=True
369
  )
370
 
371
- text_clone_btn.click(
372
- fn=text_to_voice_cloning,
373
- inputs=[reference_audio, text_input, text_language, text_exaggeration, text_cfg],
374
- outputs=[text_output_audio, text_status],
375
  show_progress=True
376
  )
377
 
378
  if __name__ == "__main__":
379
- demo.launch(
380
- server_name="0.0.0.0",
381
- server_port=7860,
382
- share=False
383
- )
 
3
  import torchaudio
4
  import tempfile
5
  import os
 
 
 
 
 
6
 
7
  # Device detection
8
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
9
+ print(f"๐Ÿš€ Using device: {DEVICE}")
 
 
 
 
 
 
10
 
11
+ # Global models
12
+ TTS_MODEL = None
13
+ WHISPER_MODEL = None
14
 
15
+ def load_models():
16
+ """Load TTS models with proper error handling"""
17
+ global TTS_MODEL, WHISPER_MODEL
18
 
19
+ print("๐Ÿ”„ Loading models...")
20
+
21
+ # Load XTTS-v2 (most reliable for voice cloning)
22
+ if TTS_MODEL is None:
23
+ try:
24
+ from TTS.api import TTS
25
+ os.environ["COQUI_TOS_AGREED"] = "1"
26
+ print("๐Ÿ“ฆ Loading XTTS-v2...")
27
+ TTS_MODEL = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(DEVICE)
28
+ print("โœ… XTTS-v2 loaded successfully!")
29
+ except Exception as e:
30
+ print(f"โŒ XTTS-v2 failed: {e}")
31
+ return False
32
+
33
+ # Load Whisper for voice-to-voice
34
+ if WHISPER_MODEL is None:
35
+ try:
36
+ import whisper
37
+ print("๐Ÿ“ฆ Loading Whisper...")
38
+ WHISPER_MODEL = whisper.load_model("base")
39
+ print("โœ… Whisper loaded successfully!")
40
+ except Exception as e:
41
+ print(f"โŒ Whisper failed: {e}")
42
+
43
+ return TTS_MODEL is not None
44
 
45
+ def voice_to_voice_clone(reference_audio, input_audio, language="en"):
46
  """
47
+ ๐ŸŽค VOICE-TO-VOICE CLONING - Real Implementation
48
+ Transform input audio content using reference voice characteristics
49
  """
50
  try:
51
  if not reference_audio:
 
54
  if not input_audio:
55
  return None, "โŒ Please upload input audio (content to transform)!"
56
 
57
+ # Load models
58
+ if not load_models():
59
+ return None, "โŒ XTTS-v2 model failed to load!"
60
+
61
+ print("๐ŸŽค Starting Voice-to-Voice Cloning...")
62
 
63
  # Step 1: Extract text from input audio using Whisper
64
+ if WHISPER_MODEL:
65
+ print("๐Ÿ“ Transcribing input audio...")
66
+ result = WHISPER_MODEL.transcribe(input_audio)
 
 
67
  extracted_text = result["text"]
68
+ print(f"โœ… Extracted: {extracted_text[:100]}...")
69
+ else:
 
70
  extracted_text = "Voice cloning demonstration using uploaded audio content."
71
+ print("โš ๏ธ Using fallback text")
72
 
73
+ # Step 2: Generate new audio with reference voice using XTTS-v2
74
+ print("๐ŸŽญ Generating speech with cloned voice...")
 
 
75
 
 
 
76
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
77
  output_path = tmp_file.name
78
 
79
+ # Use XTTS-v2 for voice cloning
80
+ TTS_MODEL.tts_to_file(
81
+ text=extracted_text,
82
+ speaker_wav=reference_audio,
83
+ language=language,
84
+ file_path=output_path
85
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
+ # Verify output
88
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
89
+ return output_path, f"โœ… Voice-to-Voice Cloning Complete!\n๐ŸŽค Original content: '{extracted_text[:100]}...'\n๐ŸŽญ Applied reference voice characteristics\n๐Ÿ“Š Language: {language}\n๐Ÿค– Model: XTTS-v2"
90
  else:
91
  return None, "โŒ Generated audio file is empty!"
92
 
93
  except Exception as e:
94
+ error_msg = f"โŒ Voice-to-Voice Error: {str(e)}"
95
+ print(error_msg)
96
+ return None, error_msg
97
 
98
+ def text_to_voice_clone(reference_audio, input_text, language="en"):
99
  """
100
+ ๐Ÿ“ TEXT-TO-VOICE CLONING - Real Implementation
 
101
  """
102
  try:
103
  if not reference_audio:
 
106
  if not input_text or not input_text.strip():
107
  return None, "โŒ Please enter text to convert!"
108
 
109
+ # Load models
110
+ if not load_models():
111
+ return None, "โŒ XTTS-v2 model failed to load!"
112
 
113
+ print("๐Ÿ“ Starting Text-to-Voice Cloning...")
 
 
 
114
 
 
 
115
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
116
  output_path = tmp_file.name
117
 
118
+ # Generate speech using XTTS-v2
119
+ TTS_MODEL.tts_to_file(
120
+ text=input_text,
121
+ speaker_wav=reference_audio,
122
+ language=language,
123
+ file_path=output_path
124
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
+ # Verify output
127
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
128
+ return output_path, f"โœ… Text-to-Voice Complete!\n๐Ÿ“ Generated: '{input_text[:100]}...'\n๐ŸŽญ Using reference voice characteristics\n๐Ÿ“Š Language: {language}\n๐Ÿค– Model: XTTS-v2"
129
  else:
130
  return None, "โŒ Generated audio file is empty!"
131
 
132
  except Exception as e:
133
+ error_msg = f"โŒ Text-to-Voice Error: {str(e)}"
134
+ print(error_msg)
135
+ return None, error_msg
136
 
137
+ # Try loading models at startup
138
+ startup_success = load_models()
139
+ startup_msg = "โœ… XTTS-v2 Ready for Voice Cloning!" if startup_success else "โš ๏ธ Models will load on first use"
 
 
 
 
140
 
141
+ # Create Gradio interface with BOTH functionalities
142
  with gr.Blocks(
143
+ title="๐ŸŽญ Voice Cloning Studio - XTTS-v2",
144
+ theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
145
  ) as demo:
146
 
 
147
  gr.HTML("""
148
  <div style="text-align: center; padding: 20px;">
149
+ <h1 style="color: #2E86AB;">๐ŸŽญ Voice Cloning Studio</h1>
150
+ <p style="color: #666; font-size: 18px;">Real Voice-to-Voice & Text-to-Speech Cloning</p>
151
+ <p style="color: #888; font-size: 14px;">Powered by XTTS-v2 - Production Ready Open Source Model</p>
152
  </div>
153
  """)
154
 
155
+ # Status
156
+ status_color = "#d4edda" if startup_success else "#fff3cd"
157
  gr.HTML(f"""
158
+ <div style="text-align: center; padding: 15px; background: {status_color}; border-radius: 10px; margin-bottom: 20px;">
159
+ <strong>๐Ÿค– Model Status:</strong> {startup_msg}
160
  </div>
161
  """)
162
 
163
+ # Reference Voice (shared)
164
+ gr.HTML("<h3 style='color: #2E86AB; text-align: center;'>๐ŸŽค Reference Voice (Voice to Clone)</h3>")
165
  reference_audio = gr.Audio(
166
+ label="Upload Reference Audio (6+ seconds recommended)",
167
  type="filepath",
168
  sources=["upload", "microphone"]
169
  )
 
170
 
171
+ # Tabs for different modes
172
  with gr.Tabs():
173
+ # VOICE-TO-VOICE CLONING TAB
174
  with gr.TabItem("๐ŸŽต Voice-to-Voice Cloning"):
175
  gr.HTML("""
176
+ <div style="padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 15px;">
177
+ <h4>๐ŸŽค Voice-to-Voice Process:</h4>
178
+ <p><strong>1.</strong> Upload reference voice (person to clone)<br>
179
+ <strong>2.</strong> Upload input audio (speech content to transform)<br>
180
+ <strong>3.</strong> AI extracts text from input audio using Whisper<br>
181
+ <strong>4.</strong> XTTS-v2 generates new audio with reference voice + extracted content</p>
182
  </div>
183
  """)
184
 
 
190
  sources=["upload", "microphone"]
191
  )
192
 
193
+ voice_lang = gr.Dropdown(
194
+ choices=[
195
+ ("๐Ÿ‡บ๐Ÿ‡ธ English", "en"),
196
+ ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
197
+ ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
198
+ ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
199
+ ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
200
+ ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
201
+ ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
202
+ ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja"),
203
+ ("๐Ÿ‡ฐ๐Ÿ‡ท Korean", "ko"),
204
+ ("๐Ÿ‡ท๐Ÿ‡บ Russian", "ru")
205
+ ],
206
+ value="en",
207
+ label="Language"
208
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
+ voice_btn = gr.Button(
211
  "๐ŸŽค Transform Voice (Audio โ†’ Cloned Audio)",
212
  variant="primary",
213
  size="lg"
214
  )
215
 
216
  with gr.Column():
217
+ voice_output = gr.Audio(label="Voice-to-Voice Result")
 
 
 
 
218
  voice_status = gr.Textbox(
219
  label="Voice-to-Voice Status",
220
  lines=6,
221
  interactive=False
222
  )
223
 
224
+ # TEXT-TO-VOICE CLONING TAB
225
  with gr.TabItem("๐Ÿ“ Text-to-Speech Cloning"):
226
  gr.HTML("""
227
  <div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
228
+ <h4>๐Ÿ“ Text-to-Speech Process:</h4>
229
+ <p><strong>1.</strong> Upload reference voice (person to clone)<br>
230
+ <strong>2.</strong> Enter text to convert to speech<br>
231
+ <strong>3.</strong> XTTS-v2 generates speech directly in the cloned voice<br>
232
+ <strong>4.</strong> Download high-quality result</p>
233
  </div>
234
  """)
235
 
236
  with gr.Row():
237
  with gr.Column():
238
  text_input = gr.Textbox(
239
+ label="Text to Convert",
240
+ placeholder="Enter text to speak in the cloned voice...",
241
+ lines=5
 
242
  )
243
 
244
+ text_lang = gr.Dropdown(
245
+ choices=[
246
+ ("๐Ÿ‡บ๐Ÿ‡ธ English", "en"),
247
+ ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
248
+ ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
249
+ ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
250
+ ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
251
+ ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
252
+ ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
253
+ ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja")
254
+ ],
255
+ value="en",
256
+ label="Language"
257
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
+ text_btn = gr.Button(
260
  "๐Ÿ“ Generate Speech (Text โ†’ Cloned Audio)",
261
  variant="secondary",
262
  size="lg"
263
  )
264
 
265
  with gr.Column():
266
+ text_output = gr.Audio(label="Text-to-Speech Result")
 
 
 
 
267
  text_status = gr.Textbox(
268
  label="Text-to-Speech Status",
269
  lines=6,
270
  interactive=False
271
  )
272
 
273
+ # Examples
274
  with gr.Accordion("๐Ÿ’ก Example Texts", open=False):
275
  examples = [
276
+ "Hello, this is a demonstration of AI voice cloning using XTTS-v2.",
277
+ "The weather today is absolutely beautiful, perfect for a walk in the park.",
278
+ "Artificial intelligence continues to revolutionize how we create and share content."
 
279
  ]
280
+ gr.Examples(examples=examples, inputs=text_input)
 
 
 
 
 
281
 
282
+ # Connect both functions - VOICE-TO-VOICE AND TEXT-TO-SPEECH
283
+ voice_btn.click(
284
+ fn=voice_to_voice_clone,
285
+ inputs=[reference_audio, input_audio, voice_lang],
286
+ outputs=[voice_output, voice_status],
287
  show_progress=True
288
  )
289
 
290
+ text_btn.click(
291
+ fn=text_to_voice_clone,
292
+ inputs=[reference_audio, text_input, text_lang],
293
+ outputs=[text_output, text_status],
294
  show_progress=True
295
  )
296
 
297
  if __name__ == "__main__":
298
+ demo.launch()