crackuser committed on
Commit
b3986a9
·
verified ·
1 Parent(s): 27e1662

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +153 -189
app.py CHANGED
@@ -3,280 +3,237 @@ import torch
3
  import torchaudio
4
  import tempfile
5
  import os
6
- import sys
7
- import traceback
8
 
9
- # Fix COQUI Terms of Service issue
10
  os.environ["COQUI_TOS_AGREED"] = "1"
11
  os.environ["COQUI_TOS"] = "1"
12
 
13
- # Device detection with fallbacks
14
- def get_device():
15
- if torch.cuda.is_available():
16
- return "cuda"
17
- elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
18
- return "cpu" # Force CPU for MPS compatibility issues
19
- else:
20
- return "cpu"
21
-
22
- DEVICE = get_device()
23
  print(f"๐Ÿš€ Using device: {DEVICE}")
24
 
25
  # Global models
26
  TTS_MODEL = None
27
  WHISPER_MODEL = None
28
- MODEL_TYPE = None
29
 
30
- def load_tts_models():
31
- """Load TTS models with comprehensive error handling and multiple fallbacks"""
32
- global TTS_MODEL, WHISPER_MODEL, MODEL_TYPE
 
 
 
33
 
34
- print("๐Ÿ”„ Starting model loading process...")
35
 
36
- # Method 1: Try XTTS-v2 (Primary)
37
- if TTS_MODEL is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  try:
39
- print("๐Ÿ“ฆ Attempting XTTS-v2 (Method 1: Direct API)...")
40
- from TTS.api import TTS
 
 
41
 
42
- # Force download and load
43
- TTS_MODEL = TTS(
44
- model_name="tts_models/multilingual/multi-dataset/xtts_v2",
45
- progress_bar=True,
46
- gpu=False if DEVICE == "cpu" else True
47
- ).to(DEVICE)
48
 
49
- MODEL_TYPE = "XTTS-v2"
50
- print("โœ… XTTS-v2 loaded successfully!")
 
 
 
51
 
52
- except Exception as e1:
53
- print(f"โŒ XTTS-v2 Method 1 failed: {e1}")
54
 
55
- # Method 2: Try manual XTTS loading
56
- try:
57
- print("๐Ÿ“ฆ Attempting XTTS-v2 (Method 2: Manual loading)...")
58
- from TTS.tts.configs.xtts_config import XttsConfig
59
- from TTS.tts.models.xtts import Xtts
60
-
61
- config = XttsConfig()
62
- config.load_json("https://huggingface.co/coqui/XTTS-v2/resolve/main/config.json")
63
- TTS_MODEL = Xtts.init_from_config(config)
64
- TTS_MODEL.load_checkpoint(
65
- config,
66
- checkpoint_path="https://huggingface.co/coqui/XTTS-v2/resolve/main/model.pth",
67
- eval=True
68
- )
69
- TTS_MODEL.to(DEVICE)
70
- MODEL_TYPE = "XTTS-v2-Manual"
71
- print("โœ… XTTS-v2 manual loading successful!")
72
-
73
- except Exception as e2:
74
- print(f"โŒ XTTS-v2 Method 2 failed: {e2}")
75
-
76
- # Method 3: Try fallback TTS model
77
- try:
78
- print("๐Ÿ“ฆ Attempting fallback TTS model...")
79
- from TTS.api import TTS
80
- TTS_MODEL = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=True).to(DEVICE)
81
- MODEL_TYPE = "Tacotron2-Fallback"
82
- print("โœ… Fallback TTS model loaded!")
83
-
84
- except Exception as e3:
85
- print(f"โŒ All TTS methods failed: {e3}")
86
- return False
87
 
88
  # Load Whisper for voice-to-voice
89
  if WHISPER_MODEL is None:
90
  try:
91
- print("๐Ÿ“ฆ Loading Whisper for voice-to-voice...")
92
  import whisper
93
  WHISPER_MODEL = whisper.load_model("base")
94
- print("โœ… Whisper loaded successfully!")
95
  except Exception as e:
96
- print(f"โš ๏ธ Whisper failed: {e}")
97
- print("๐Ÿ”„ Voice-to-voice will use fallback text")
98
 
99
- return TTS_MODEL is not None
100
 
101
- def voice_to_voice_clone(reference_audio, input_audio, language="en"):
102
  """
103
- ๐ŸŽค VOICE-TO-VOICE CLONING with robust error handling
104
  """
105
  try:
106
  if not reference_audio:
107
- return None, "โŒ Please upload reference audio (voice to clone)!"
108
 
109
  if not input_audio:
110
- return None, "โŒ Please upload input audio (content to transform)!"
111
 
112
  # Load models
113
- if not load_tts_models():
114
- return None, "โŒ All TTS models failed to load! Check your internet connection and try again."
115
 
116
- print("๐ŸŽค Starting Voice-to-Voice Cloning...")
117
 
118
- # Step 1: Extract text from input audio
119
  extracted_text = ""
120
  if WHISPER_MODEL:
121
  try:
122
  print("๐Ÿ“ Transcribing input audio with Whisper...")
123
  result = WHISPER_MODEL.transcribe(input_audio)
124
  extracted_text = result["text"].strip()
125
- print(f"โœ… Extracted: {extracted_text[:100]}...")
126
  except Exception as e:
127
  print(f"โš ๏ธ Whisper transcription failed: {e}")
128
- extracted_text = "Voice cloning demonstration using uploaded audio content."
129
  else:
130
- extracted_text = "Voice cloning demonstration using uploaded audio content."
131
- print("โš ๏ธ Using fallback text (Whisper not available)")
132
 
133
- if not extracted_text:
134
  extracted_text = "Hello, this is a voice cloning demonstration."
135
 
136
- # Step 2: Generate speech with reference voice
137
- print(f"๐ŸŽญ Generating speech with {MODEL_TYPE}...")
138
 
139
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
140
  output_path = tmp_file.name
141
 
142
- # Use appropriate TTS method based on model type
143
- if MODEL_TYPE == "XTTS-v2":
144
- TTS_MODEL.tts_to_file(
145
- text=extracted_text,
146
- speaker_wav=reference_audio,
147
- language=language,
148
- file_path=output_path
149
- )
150
- elif MODEL_TYPE == "XTTS-v2-Manual":
151
- # Manual XTTS inference
152
- gpt_cond_latent, speaker_embedding = TTS_MODEL.get_conditioning_latents(audio_path=[reference_audio])
153
- out = TTS_MODEL.inference(
154
- extracted_text,
155
- language,
156
- gpt_cond_latent,
157
- speaker_embedding,
158
- temperature=0.7
159
- )
160
- torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
161
- else:
162
- # Fallback model (limited voice cloning)
163
- TTS_MODEL.tts_to_file(text=extracted_text, file_path=output_path)
164
 
165
  # Verify output
166
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
167
- return output_path, f"โœ… Voice-to-Voice Cloning Complete!\n๐ŸŽค Original: '{extracted_text[:100]}...'\n๐ŸŽญ Model: {MODEL_TYPE}\n๐Ÿ“Š Language: {language}\n๐Ÿ”Š Voice characteristics applied from reference audio"
168
  else:
169
  return None, "โŒ Generated audio file is empty!"
170
 
171
  except Exception as e:
172
- error_msg = f"โŒ Voice-to-Voice Error: {str(e)}\n๐Ÿ” Model: {MODEL_TYPE}\n๐Ÿ“‹ Traceback:\n{traceback.format_exc()}"
173
- print(error_msg)
174
- return None, error_msg
175
 
176
- def text_to_voice_clone(reference_audio, input_text, language="en"):
177
  """
178
- ๐Ÿ“ TEXT-TO-VOICE CLONING with robust error handling
179
  """
180
  try:
181
  if not reference_audio:
182
- return None, "โŒ Please upload reference audio!"
183
 
184
  if not input_text or not input_text.strip():
185
- return None, "โŒ Please enter text to convert!"
186
 
187
  # Load models
188
- if not load_tts_models():
189
- return None, "โŒ All TTS models failed to load! Check your internet connection and try again."
190
 
191
  print("๐Ÿ“ Starting Text-to-Voice Cloning...")
192
 
193
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
194
  output_path = tmp_file.name
195
 
196
- # Generate speech using appropriate method
197
- if MODEL_TYPE == "XTTS-v2":
198
- TTS_MODEL.tts_to_file(
199
- text=input_text,
200
- speaker_wav=reference_audio,
201
- language=language,
202
- file_path=output_path
203
- )
204
- elif MODEL_TYPE == "XTTS-v2-Manual":
205
- # Manual XTTS inference
206
- gpt_cond_latent, speaker_embedding = TTS_MODEL.get_conditioning_latents(audio_path=[reference_audio])
207
- out = TTS_MODEL.inference(
208
- input_text,
209
- language,
210
- gpt_cond_latent,
211
- speaker_embedding,
212
- temperature=0.7
213
- )
214
- torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
215
- else:
216
- # Fallback model
217
- TTS_MODEL.tts_to_file(text=input_text, file_path=output_path)
218
 
219
  # Verify output
220
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
221
- return output_path, f"โœ… Text-to-Voice Complete!\n๐Ÿ“ Generated: '{input_text[:100]}...'\n๐ŸŽญ Model: {MODEL_TYPE}\n๐Ÿ“Š Language: {language}\n๐Ÿ”Š Voice characteristics applied from reference audio"
222
  else:
223
  return None, "โŒ Generated audio file is empty!"
224
 
225
  except Exception as e:
226
- error_msg = f"โŒ Text-to-Voice Error: {str(e)}\n๐Ÿ” Model: {MODEL_TYPE}\n๐Ÿ“‹ Traceback:\n{traceback.format_exc()}"
227
- print(error_msg)
228
- return None, error_msg
229
 
230
- # Try loading models at startup
231
- print("๐Ÿ”„ Initializing models at startup...")
232
- startup_success = load_tts_models()
233
- if startup_success:
234
- startup_msg = f"โœ… {MODEL_TYPE} Ready for Voice Cloning!"
235
- startup_color = "#d4edda"
236
- else:
237
- startup_msg = "โš ๏ธ Models will load on first use (may take 2-3 minutes)"
238
- startup_color = "#fff3cd"
239
 
240
- # Create Gradio interface
241
  with gr.Blocks(
242
- title="๐ŸŽญ Voice Cloning Studio - Production Ready",
243
  theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
244
  ) as demo:
245
 
246
  gr.HTML("""
247
  <div style="text-align: center; padding: 20px;">
248
- <h1 style="color: #2E86AB;">๐ŸŽญ Voice Cloning Studio</h1>
249
  <p style="color: #666; font-size: 18px;">Professional Voice-to-Voice & Text-to-Speech Cloning</p>
250
- <p style="color: #888; font-size: 14px;">Multi-Model Support: XTTS-v2 + Fallbacks | Production Ready</p>
251
  </div>
252
  """)
253
 
254
- # Dynamic status
255
  gr.HTML(f"""
256
- <div style="text-align: center; padding: 15px; background: {startup_color}; border-radius: 10px; margin-bottom: 20px;">
257
- <strong>๐Ÿค– Model Status:</strong> {startup_msg}
258
  </div>
259
  """)
260
 
261
- # Reference Voice (shared)
262
  gr.HTML("<h3 style='color: #2E86AB; text-align: center;'>๐ŸŽค Reference Voice (Voice to Clone)</h3>")
263
  reference_audio = gr.Audio(
264
  label="Upload Reference Audio (6+ seconds of clear speech)",
265
  type="filepath",
266
  sources=["upload", "microphone"]
267
  )
 
268
 
269
- # Tabs for different modes
270
  with gr.Tabs():
271
  # VOICE-TO-VOICE CLONING TAB
272
  with gr.TabItem("๐ŸŽต Voice-to-Voice Cloning"):
273
  gr.HTML("""
274
- <div style="padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 15px;">
275
- <h4 style="color: #1e40af;">๐ŸŽค Voice-to-Voice Process:</h4>
276
- <p><strong>Step 1:</strong> Upload reference voice (person to clone)<br>
277
- <strong>Step 2:</strong> Upload input audio (speech content to transform)<br>
278
- <strong>Step 3:</strong> AI extracts text from input using Whisper<br>
279
- <strong>Step 4:</strong> Generate new audio with reference voice + extracted content</p>
 
 
280
  </div>
281
  """)
282
 
@@ -297,7 +254,9 @@ with gr.Blocks(
297
  ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
298
  ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
299
  ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
300
- ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja")
 
 
301
  ],
302
  value="en",
303
  label="Language"
@@ -312,7 +271,7 @@ with gr.Blocks(
312
  with gr.Column():
313
  voice_output = gr.Audio(label="Voice-to-Voice Result")
314
  voice_status = gr.Textbox(
315
- label="Voice-to-Voice Status",
316
  lines=8,
317
  interactive=False
318
  )
@@ -320,12 +279,14 @@ with gr.Blocks(
320
  # TEXT-TO-VOICE CLONING TAB
321
  with gr.TabItem("๐Ÿ“ Text-to-Speech Cloning"):
322
  gr.HTML("""
323
- <div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
324
- <h4 style="color: #16a34a;">๐Ÿ“ Text-to-Speech Process:</h4>
325
- <p><strong>Step 1:</strong> Upload reference voice (person to clone)<br>
326
- <strong>Step 2:</strong> Enter text to convert to speech<br>
327
- <strong>Step 3:</strong> AI generates speech in the cloned voice<br>
328
- <strong>Step 4:</strong> Download high-quality result</p>
 
 
329
  </div>
330
  """)
331
 
@@ -334,7 +295,8 @@ with gr.Blocks(
334
  text_input = gr.Textbox(
335
  label="Text to Convert to Speech",
336
  placeholder="Enter text to speak in the cloned voice...",
337
- lines=5
 
338
  )
339
 
340
  text_lang = gr.Dropdown(
@@ -361,36 +323,38 @@ with gr.Blocks(
361
  with gr.Column():
362
  text_output = gr.Audio(label="Text-to-Speech Result")
363
  text_status = gr.Textbox(
364
- label="Text-to-Speech Status",
365
  lines=8,
366
  interactive=False
367
  )
368
 
369
  # Examples and Help
370
- with gr.Accordion("๐Ÿ’ก Example Texts & Troubleshooting", open=False):
371
  gr.Markdown("""
372
- ### Example Texts
373
- - "Hello, this is a demonstration of AI voice cloning using advanced models."
374
- - "The weather today is absolutely beautiful, perfect for a walk in the park."
375
- - "Artificial intelligence continues to revolutionize how we create content."
376
 
377
- ### Troubleshooting
378
- - **Model Loading Issues**: Wait 2-3 minutes on first use for model download
379
- - **Voice Quality**: Use clear, 6+ second reference audio with minimal background noise
380
- - **Language Support**: XTTS-v2 supports 16+ languages with cross-lingual cloning
381
- - **Processing Time**: Voice cloning takes 10-60 seconds depending on text length
 
 
382
  """)
383
 
384
- # Event handlers - BOTH FUNCTIONALITIES CONNECTED
385
  voice_btn.click(
386
- fn=voice_to_voice_clone,
387
  inputs=[reference_audio, input_audio, voice_lang],
388
  outputs=[voice_output, voice_status],
389
  show_progress=True
390
  )
391
 
392
  text_btn.click(
393
- fn=text_to_voice_clone,
394
  inputs=[reference_audio, text_input, text_lang],
395
  outputs=[text_output, text_status],
396
  show_progress=True
 
3
  import torchaudio
4
  import tempfile
5
  import os
6
+ import warnings
7
+ warnings.filterwarnings("ignore")
8
 
9
+ # CRITICAL: Set COQUI Terms of Service agreement
10
  os.environ["COQUI_TOS_AGREED"] = "1"
11
  os.environ["COQUI_TOS"] = "1"
12
 
13
+ # Device setup
14
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
 
 
15
  print(f"๐Ÿš€ Using device: {DEVICE}")
16
 
17
  # Global models
18
  TTS_MODEL = None
19
  WHISPER_MODEL = None
20
+ MODEL_LOADED = False
21
 
22
def load_xtts_model():
    """Load the XTTS-v2 voice-cloning model (plus Whisper for transcription).

    Tries two strategies in order:
      1. The high-level ``TTS.api.TTS`` loader (most reliable).
      2. Manual config/checkpoint loading from the local model cache,
         downloading the files first if the cache directory is missing.

    On success sets the module-level ``TTS_MODEL`` and the ``MODEL_LOADED``
    flag. Also lazily loads ``WHISPER_MODEL`` (best-effort — a Whisper
    failure does not block TTS).

    Returns:
        bool: True when a TTS model is ready, False otherwise.
    """
    global TTS_MODEL, WHISPER_MODEL, MODEL_LOADED

    # Fast path: model already loaded in this process.
    if MODEL_LOADED and TTS_MODEL is not None:
        return True

    print("🔄 Loading XTTS-v2 model...")

    try:
        # Method 1: Direct TTS API (Most Reliable)
        print("📦 Attempting direct TTS API loading...")
        from TTS.api import TTS

        TTS_MODEL = TTS(
            model_name="tts_models/multilingual/multi-dataset/xtts_v2",
            progress_bar=True,
            gpu=(DEVICE == "cuda")
        )

        if DEVICE == "cuda":
            TTS_MODEL = TTS_MODEL.to("cuda")

        print("✅ XTTS-v2 loaded successfully via TTS API!")
        MODEL_LOADED = True

    except Exception as e1:
        print(f"❌ Direct API failed: {e1}")

        try:
            # Method 2: Manual Configuration Loading
            print("📦 Attempting manual XTTS configuration...")
            from TTS.tts.configs.xtts_config import XttsConfig
            from TTS.tts.models.xtts import Xtts
            # BUGFIX: re-import TTS here — the Method-1 import above is local
            # to its try block, so if that *import* (not just the load) failed,
            # the name `TTS` would be unbound and the download below would
            # raise NameError instead of failing cleanly.
            from TTS.api import TTS

            # Load config from the default Coqui model cache location.
            config = XttsConfig()
            model_path = os.path.expanduser(
                "~/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2"
            )

            if not os.path.exists(model_path):
                print("🔄 Downloading XTTS-v2 model files...")
                # Force a download into the cache via the API, then discard.
                temp_tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=True)
                del temp_tts

            config_path = os.path.join(model_path, "config.json")
            config.load_json(config_path)

            # Initialize the model skeleton and load weights from the cache.
            TTS_MODEL = Xtts.init_from_config(config)
            TTS_MODEL.load_checkpoint(config, checkpoint_dir=model_path, eval=True)
            TTS_MODEL.to(DEVICE)

            print("✅ XTTS-v2 loaded via manual configuration!")
            MODEL_LOADED = True

        except Exception as e2:
            print(f"❌ Manual loading failed: {e2}")
            return False

    # Load Whisper for voice-to-voice transcription (best-effort; failure
    # just means the voice-to-voice tab falls back to canned text).
    if WHISPER_MODEL is None:
        try:
            print("📦 Loading Whisper for audio transcription...")
            import whisper
            WHISPER_MODEL = whisper.load_model("base")
            print("✅ Whisper loaded!")
        except Exception as e:
            print(f"⚠️ Whisper loading failed: {e}")

    return MODEL_LOADED
93
 
94
def voice_to_voice_cloning(reference_audio, input_audio, language="en"):
    """Clone a reference voice onto the spoken content of another recording.

    Transcribes ``input_audio`` with Whisper (falling back to canned text if
    transcription is unavailable or fails), then re-synthesizes that text in
    the voice of ``reference_audio`` using XTTS-v2.

    Returns:
        tuple: (output_wav_path, status_message); the path is None on failure.
    """
    try:
        # Guard clauses for missing inputs.
        if not reference_audio:
            return None, "❌ Upload reference audio (voice to clone)!"
        if not input_audio:
            return None, "❌ Upload input audio (content to transform)!"

        # Make sure the models are available before doing any work.
        if not load_xtts_model():
            return None, "❌ XTTS-v2 failed to load! Check your internet connection and try restarting the space."

        print("🎤 Starting Voice-to-Voice Cloning Process...")

        # Step 1: recover the spoken text from the input recording.
        fallback_text = "This is a voice cloning demonstration using the uploaded audio content."
        spoken_text = ""
        if WHISPER_MODEL:
            try:
                print("📝 Transcribing input audio with Whisper...")
                transcription = WHISPER_MODEL.transcribe(input_audio)
                spoken_text = transcription["text"].strip()
                print(f"✅ Extracted text: {spoken_text[:100]}...")
            except Exception as e:
                print(f"⚠️ Whisper transcription failed: {e}")
                spoken_text = fallback_text
        else:
            spoken_text = fallback_text

        # Guard against empty or degenerate transcriptions.
        if not spoken_text or len(spoken_text) < 3:
            spoken_text = "Hello, this is a voice cloning demonstration."

        # Step 2: speak the recovered text in the reference voice.
        print("🎭 Generating speech with cloned voice...")

        # Reserve a temp .wav path; delete=False so it survives the `with`.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        TTS_MODEL.tts_to_file(
            text=spoken_text,
            speaker_wav=reference_audio,
            language=language,
            file_path=output_path,
        )

        # Confirm that synthesis actually produced audio.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"✅ Voice-to-Voice Cloning Complete!\n\n🎤 Original content: '{spoken_text[:150]}...'\n\n🎭 Applied reference voice characteristics\n📊 Language: {language}\n🤖 Model: XTTS-v2\n⏱️ Processing completed successfully"
        return None, "❌ Generated audio file is empty!"

    except Exception as e:
        return None, f"❌ Voice-to-Voice Error: {str(e)}"
 
 
150
 
151
def text_to_voice_cloning(reference_audio, input_text, language="en"):
    """Synthesize arbitrary text in the voice of the reference recording.

    Uses XTTS-v2 speaker conditioning: the generated speech carries the
    vocal characteristics of ``reference_audio``.

    Returns:
        tuple: (output_wav_path, status_message); the path is None on failure.
    """
    try:
        # Guard clauses for missing inputs.
        if not reference_audio:
            return None, "❌ Upload reference audio!"
        if not input_text or not input_text.strip():
            return None, "❌ Enter text to convert!"

        # Ensure XTTS-v2 is ready (may trigger a download on first use).
        if not load_xtts_model():
            return None, "❌ XTTS-v2 failed to load! Check your internet connection and try restarting the space."

        print("📝 Starting Text-to-Voice Cloning...")

        # Reserve a temp .wav path; delete=False so it survives the `with`.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        # XTTS-v2 voice cloning conditioned on the reference speaker.
        TTS_MODEL.tts_to_file(
            text=input_text,
            speaker_wav=reference_audio,
            language=language,
            file_path=output_path,
        )

        # Confirm that synthesis actually produced audio.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"✅ Text-to-Voice Complete!\n\n📝 Generated: '{input_text[:150]}...'\n\n🎭 Using reference voice characteristics\n📊 Language: {language}\n🤖 Model: XTTS-v2\n⏱️ Processing completed successfully"
        return None, "❌ Generated audio file is empty!"

    except Exception as e:
        return None, f"❌ Text-to-Voice Error: {str(e)}"
 
 
187
 
188
# Initialize models at startup so the first user request is fast. Failure is
# tolerated: load_xtts_model() is called again lazily inside each handler.
print("🔄 Initializing XTTS-v2 at startup...")
startup_success = load_xtts_model()
# Status banner text and background color for the Gradio UI
# (green #d4edda = ready now, yellow #fff3cd = will lazy-load on first use).
status_msg = "✅ XTTS-v2 Ready!" if startup_success else "⚠️ XTTS-v2 will load on first use (2-3 minutes)"
status_color = "#d4edda" if startup_success else "#fff3cd"
 
 
 
 
193
 
194
+ # Create Gradio Interface
195
  with gr.Blocks(
196
+ title="๐ŸŽญ XTTS-v2 Voice Cloning Studio",
197
  theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
198
  ) as demo:
199
 
200
  gr.HTML("""
201
  <div style="text-align: center; padding: 20px;">
202
+ <h1 style="color: #2E86AB;">๐ŸŽญ XTTS-v2 Voice Cloning Studio</h1>
203
  <p style="color: #666; font-size: 18px;">Professional Voice-to-Voice & Text-to-Speech Cloning</p>
204
+ <p style="color: #888; font-size: 14px;">Powered by Coqui XTTS-v2 - Production Ready Open Source</p>
205
  </div>
206
  """)
207
 
208
+ # Dynamic Status Display
209
  gr.HTML(f"""
210
+ <div style="text-align: center; padding: 15px; background: {status_color}; border-radius: 10px; margin-bottom: 20px;">
211
+ <strong>๐Ÿค– XTTS-v2 Status:</strong> {status_msg}
212
  </div>
213
  """)
214
 
215
+ # Shared Reference Voice
216
  gr.HTML("<h3 style='color: #2E86AB; text-align: center;'>๐ŸŽค Reference Voice (Voice to Clone)</h3>")
217
  reference_audio = gr.Audio(
218
  label="Upload Reference Audio (6+ seconds of clear speech)",
219
  type="filepath",
220
  sources=["upload", "microphone"]
221
  )
222
+ gr.HTML("<p style='color: #666; text-align: center; margin-bottom: 20px;'>๐Ÿ“Œ This voice will be cloned and applied to your content</p>")
223
 
224
+ # Main Functionality Tabs
225
  with gr.Tabs():
226
  # VOICE-TO-VOICE CLONING TAB
227
  with gr.TabItem("๐ŸŽต Voice-to-Voice Cloning"):
228
  gr.HTML("""
229
+ <div style="padding: 20px; background: #e8f4fd; border-radius: 10px; margin-bottom: 20px;">
230
+ <h4 style="color: #1e40af; margin-bottom: 10px;">๐ŸŽค Voice-to-Voice Process:</h4>
231
+ <ul style="margin: 0; padding-left: 20px;">
232
+ <li><strong>Step 1:</strong> Upload reference voice (person to clone)</li>
233
+ <li><strong>Step 2:</strong> Upload input audio (speech content to transform)</li>
234
+ <li><strong>Step 3:</strong> Whisper AI extracts text content from input</li>
235
+ <li><strong>Step 4:</strong> XTTS-v2 generates new audio with reference voice + extracted content</li>
236
+ </ul>
237
  </div>
238
  """)
239
 
 
254
  ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
255
  ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
256
  ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
257
+ ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja"),
258
+ ("๐Ÿ‡ฐ๐Ÿ‡ท Korean", "ko"),
259
+ ("๐Ÿ‡ท๐Ÿ‡บ Russian", "ru")
260
  ],
261
  value="en",
262
  label="Language"
 
271
  with gr.Column():
272
  voice_output = gr.Audio(label="Voice-to-Voice Result")
273
  voice_status = gr.Textbox(
274
+ label="Voice-to-Voice Status & Details",
275
  lines=8,
276
  interactive=False
277
  )
 
279
  # TEXT-TO-VOICE CLONING TAB
280
  with gr.TabItem("๐Ÿ“ Text-to-Speech Cloning"):
281
  gr.HTML("""
282
+ <div style="padding: 20px; background: #f0fff0; border-radius: 10px; margin-bottom: 20px;">
283
+ <h4 style="color: #16a34a; margin-bottom: 10px;">๐Ÿ“ Text-to-Speech Process:</h4>
284
+ <ul style="margin: 0; padding-left: 20px;">
285
+ <li><strong>Step 1:</strong> Upload reference voice (person to clone)</li>
286
+ <li><strong>Step 2:</strong> Enter text to convert to speech</li>
287
+ <li><strong>Step 3:</strong> XTTS-v2 generates speech in the cloned voice</li>
288
+ <li><strong>Step 4:</strong> Download high-quality audio result</li>
289
+ </ul>
290
  </div>
291
  """)
292
 
 
295
  text_input = gr.Textbox(
296
  label="Text to Convert to Speech",
297
  placeholder="Enter text to speak in the cloned voice...",
298
+ lines=6,
299
+ max_lines=10
300
  )
301
 
302
  text_lang = gr.Dropdown(
 
323
  with gr.Column():
324
  text_output = gr.Audio(label="Text-to-Speech Result")
325
  text_status = gr.Textbox(
326
+ label="Text-to-Speech Status & Details",
327
  lines=8,
328
  interactive=False
329
  )
330
 
331
  # Examples and Help
332
+ with gr.Accordion("๐Ÿ’ก Examples & Troubleshooting", open=False):
333
  gr.Markdown("""
334
+ ### ๐Ÿ“ Example Texts to Try
335
+ - "Hello, this is a demonstration of AI voice cloning using XTTS-v2 technology."
336
+ - "The weather today is absolutely beautiful, perfect for a relaxing walk in the park."
337
+ - "Artificial intelligence continues to revolutionize how we create and share digital content."
338
 
339
+ ### ๐Ÿ”ง Troubleshooting Guide
340
+ - **First Use**: Model loading takes 2-3 minutes for initial download
341
+ - **Reference Audio**: Use 6+ seconds of clear, single-speaker audio
342
+ - **Audio Quality**: Minimize background noise for best results
343
+ - **Languages**: XTTS-v2 supports 16+ languages with cross-lingual cloning
344
+ - **Processing Time**: Voice cloning takes 15-90 seconds depending on text length
345
+ - **Restart**: If models fail to load, restart the space and try again
346
  """)
347
 
348
+ # Event Handlers - Connect Both Functions
349
  voice_btn.click(
350
+ fn=voice_to_voice_cloning,
351
  inputs=[reference_audio, input_audio, voice_lang],
352
  outputs=[voice_output, voice_status],
353
  show_progress=True
354
  )
355
 
356
  text_btn.click(
357
+ fn=text_to_voice_cloning,
358
  inputs=[reference_audio, text_input, text_lang],
359
  outputs=[text_output, text_status],
360
  show_progress=True