crackuser commited on
Commit
825c475
Β·
verified Β·
1 Parent(s): 82bac76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -66
app.py CHANGED
@@ -5,6 +5,7 @@ import tempfile
5
  import os
6
  import warnings
7
  from contextlib import contextmanager
 
8
 
9
  warnings.filterwarnings("ignore")
10
 
@@ -27,17 +28,8 @@ def patch_torch_load():
27
  finally:
28
  torch.load = original_load
29
 
30
- # Device setup with safety
31
- def get_device():
32
- if torch.cuda.is_available():
33
- try:
34
- torch.cuda.init()
35
- return "cuda"
36
- except:
37
- return "cpu"
38
- return "cpu"
39
-
40
- DEVICE = get_device()
41
  print(f"πŸš€ Using device: {DEVICE}")
42
 
43
  # Global variables
@@ -46,7 +38,7 @@ WHISPER_MODEL = None
46
  MODEL_STATUS = "Not Loaded"
47
 
48
  def load_models():
49
- """Load models with comprehensive error handling"""
50
  global TTS_MODEL, WHISPER_MODEL, MODEL_STATUS
51
 
52
  print("πŸ”„ Loading models...")
@@ -58,7 +50,6 @@ def load_models():
58
  from TTS.api import TTS
59
  print("πŸ“¦ Loading XTTS-v2...")
60
 
61
- # CORRECT model name
62
  TTS_MODEL = TTS(
63
  model_name="tts_models/multilingual/multi-dataset/xtts_v2",
64
  progress_bar=True,
@@ -68,6 +59,12 @@ def load_models():
68
  MODEL_STATUS = "XTTS-v2 Ready"
69
  print("βœ… XTTS-v2 loaded successfully!")
70
 
 
 
 
 
 
 
71
  except Exception as e:
72
  print(f"❌ XTTS-v2 loading failed: {e}")
73
  MODEL_STATUS = f"XTTS Load Failed: {str(e)}"
@@ -86,7 +83,9 @@ def load_models():
86
  return TTS_MODEL is not None
87
 
88
  def voice_to_voice_clone(reference_audio, input_audio, language="en"):
89
- """Voice-to-voice cloning with robust error handling"""
 
 
90
  try:
91
  # Input validation
92
  if not reference_audio:
@@ -118,39 +117,73 @@ def voice_to_voice_clone(reference_audio, input_audio, language="en"):
118
  except Exception as e:
119
  print(f"⚠️ Whisper transcription failed: {e}")
120
 
121
- # Generate speech with reference voice
122
- print("🎭 Generating speech with cloned voice...")
123
-
124
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
125
- output_path = tmp_file.name
126
 
127
  try:
128
- # Use XTTS API with error handling
129
  with patch_torch_load():
130
- TTS_MODEL.tts_to_file(
 
131
  text=extracted_text,
132
  speaker_wav=reference_audio,
133
- language=language,
134
- file_path=output_path
135
  )
136
-
137
- # Verify output
138
- if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
139
- return output_path, f"βœ… Voice-to-Voice Complete!\n\n🎀 Content: '{extracted_text[:150]}...'\n🎭 Applied reference voice\nπŸ“Š Language: {language}\nπŸ€– Model: {MODEL_STATUS}"
140
- else:
141
- return None, "❌ Generated audio file is empty!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  except Exception as gen_error:
144
- # Clean up file on error
145
- if os.path.exists(output_path):
146
- os.unlink(output_path)
147
- return None, f"❌ Generation failed: {str(gen_error)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
  except Exception as e:
150
  return None, f"❌ Voice-to-Voice Error: {str(e)}"
151
 
152
  def text_to_voice_clone(reference_audio, input_text, language="en"):
153
- """Text-to-voice cloning with robust error handling"""
 
 
154
  try:
155
  # Input validation
156
  if not reference_audio:
@@ -165,37 +198,68 @@ def text_to_voice_clone(reference_audio, input_text, language="en"):
165
  if not load_models():
166
  return None, f"❌ Model loading failed!\nStatus: {MODEL_STATUS}"
167
 
168
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
169
- output_path = tmp_file.name
170
 
171
  try:
172
- print(f"🎭 Generating speech: '{input_text[:100]}...'")
173
-
174
- # Generate speech
175
  with patch_torch_load():
176
- TTS_MODEL.tts_to_file(
 
177
  text=input_text,
178
  speaker_wav=reference_audio,
179
- language=language,
180
- file_path=output_path
181
  )
182
-
183
- # Verify output
184
- if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
185
- return output_path, f"βœ… Text-to-Voice Complete!\n\nπŸ“ Generated: '{input_text[:150]}...'\n🎭 Using reference voice\nπŸ“Š Language: {language}\nπŸ€– Model: {MODEL_STATUS}"
186
- else:
187
- return None, "❌ Generated audio file is empty!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
  except Exception as gen_error:
190
- # Clean up file on error
191
- if os.path.exists(output_path):
192
- os.unlink(output_path)
193
- return None, f"❌ Generation failed: {str(gen_error)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
  except Exception as e:
196
  return None, f"❌ Text-to-Voice Error: {str(e)}"
197
 
198
- # Initialize at startup with error handling
199
  print("πŸ”„ Initializing models at startup...")
200
  try:
201
  startup_success = load_models()
@@ -203,7 +267,7 @@ try:
203
  startup_msg = f"βœ… {MODEL_STATUS}!"
204
  startup_color = "#d4edda"
205
  else:
206
- startup_msg = f"⚠️ Models will load on first use - Status: {MODEL_STATUS}"
207
  startup_color = "#fff3cd"
208
  except Exception as e:
209
  startup_success = False
@@ -214,7 +278,7 @@ print(f"Startup status: {startup_msg}")
214
 
215
  # Create Gradio Interface
216
  with gr.Blocks(
217
- title="🎭 Voice Cloning Studio",
218
  theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
219
  ) as demo:
220
 
@@ -222,7 +286,7 @@ with gr.Blocks(
222
  <div style="text-align: center; padding: 20px;">
223
  <h1 style="color: #2E86AB;">🎭 Voice Cloning Studio</h1>
224
  <p style="color: #666; font-size: 18px;">Real Voice-to-Voice & Text-to-Speech Cloning</p>
225
- <p style="color: #888; font-size: 14px;">Production Ready - Error-Free Implementation</p>
226
  </div>
227
  """)
228
 
@@ -247,12 +311,12 @@ with gr.Blocks(
247
  with gr.TabItem("🎡 Voice-to-Voice Cloning"):
248
  gr.HTML("""
249
  <div style="padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 15px;">
250
- <h4 style="color: #1e40af;">🎀 How it works:</h4>
251
- <ol style="margin: 5px 0; padding-left: 20px;">
252
- <li>Upload reference voice (person to clone)</li>
253
- <li>Upload input audio (content to transform)</li>
254
- <li>AI extracts text and applies reference voice</li>
255
- </ol>
256
  </div>
257
  """)
258
 
@@ -276,7 +340,7 @@ with gr.Blocks(
276
  )
277
 
278
  voice_btn = gr.Button(
279
- "🎀 Clone Voice",
280
  variant="primary",
281
  size="lg"
282
  )
@@ -285,7 +349,7 @@ with gr.Blocks(
285
  voice_output = gr.Audio(label="Cloned Voice Result")
286
  voice_status = gr.Textbox(
287
  label="Status",
288
- lines=6,
289
  interactive=False
290
  )
291
 
@@ -311,7 +375,7 @@ with gr.Blocks(
311
  )
312
 
313
  text_btn = gr.Button(
314
- "πŸ“ Generate Speech",
315
  variant="secondary",
316
  size="lg"
317
  )
@@ -320,10 +384,39 @@ with gr.Blocks(
320
  text_output = gr.Audio(label="Generated Speech")
321
  text_status = gr.Textbox(
322
  label="Status",
323
- lines=6,
324
  interactive=False
325
  )
326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
  # Event handlers
328
  voice_btn.click(
329
  fn=voice_to_voice_clone,
 
5
  import os
6
  import warnings
7
  from contextlib import contextmanager
8
+ import numpy as np
9
 
10
  warnings.filterwarnings("ignore")
11
 
 
28
  finally:
29
  torch.load = original_load
30
 
31
+ # Device setup
32
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
 
 
 
33
  print(f"πŸš€ Using device: {DEVICE}")
34
 
35
  # Global variables
 
38
  MODEL_STATUS = "Not Loaded"
39
 
40
  def load_models():
41
+ """Load models with correct error handling"""
42
  global TTS_MODEL, WHISPER_MODEL, MODEL_STATUS
43
 
44
  print("πŸ”„ Loading models...")
 
50
  from TTS.api import TTS
51
  print("πŸ“¦ Loading XTTS-v2...")
52
 
 
53
  TTS_MODEL = TTS(
54
  model_name="tts_models/multilingual/multi-dataset/xtts_v2",
55
  progress_bar=True,
 
59
  MODEL_STATUS = "XTTS-v2 Ready"
60
  print("βœ… XTTS-v2 loaded successfully!")
61
 
62
+ # CRITICAL: Verify the model has the correct methods
63
+ if hasattr(TTS_MODEL, 'tts') and hasattr(TTS_MODEL, 'tts_to_file'):
64
+ print("βœ… Verified: TTS model has correct API methods")
65
+ else:
66
+ print("❌ Warning: TTS model missing expected methods")
67
+
68
  except Exception as e:
69
  print(f"❌ XTTS-v2 loading failed: {e}")
70
  MODEL_STATUS = f"XTTS Load Failed: {str(e)}"
 
83
  return TTS_MODEL is not None
84
 
85
  def voice_to_voice_clone(reference_audio, input_audio, language="en"):
86
+ """
87
+ CORRECTED: Uses tts() method instead of generate()
88
+ """
89
  try:
90
  # Input validation
91
  if not reference_audio:
 
117
  except Exception as e:
118
  print(f"⚠️ Whisper transcription failed: {e}")
119
 
120
+ # CRITICAL FIX: Use tts() method, not generate()
121
+ print("🎭 Generating speech with CORRECT XTTS API...")
 
 
 
122
 
123
  try:
 
124
  with patch_torch_load():
125
+ # METHOD 1: Use tts() method that returns numpy array
126
+ wav_array = TTS_MODEL.tts(
127
  text=extracted_text,
128
  speaker_wav=reference_audio,
129
+ language=language
 
130
  )
131
+
132
+ print(f"βœ… Generated audio array with shape: {np.array(wav_array).shape}")
133
+
134
+ # Convert numpy array to tensor and save
135
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
136
+ output_path = tmp_file.name
137
+
138
+ # Convert to tensor and save
139
+ if isinstance(wav_array, np.ndarray):
140
+ wav_tensor = torch.tensor(wav_array, dtype=torch.float32).unsqueeze(0)
141
+ else:
142
+ wav_tensor = torch.tensor(wav_array, dtype=torch.float32)
143
+ if wav_tensor.dim() == 1:
144
+ wav_tensor = wav_tensor.unsqueeze(0)
145
+
146
+ # Save with correct sample rate
147
+ sample_rate = getattr(TTS_MODEL, 'synthesizer', {}).get('output_sample_rate', 24000) or 24000
148
+ torchaudio.save(output_path, wav_tensor, sample_rate)
149
+
150
+ # Verify output
151
+ if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
152
+ return output_path, f"βœ… Voice-to-Voice Complete!\n\n🎀 Content: '{extracted_text[:150]}...'\n🎭 Applied reference voice\nπŸ“Š Language: {language}\nπŸ€– Model: {MODEL_STATUS}\nπŸ”§ Used: tts() method (CORRECT API)"
153
+ else:
154
+ return None, "❌ Generated audio file is empty!"
155
 
156
  except Exception as gen_error:
157
+ # Fallback: Try tts_to_file method
158
+ try:
159
+ print("πŸ”„ Trying fallback method: tts_to_file()...")
160
+
161
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
162
+ output_path = tmp_file.name
163
+
164
+ with patch_torch_load():
165
+ TTS_MODEL.tts_to_file(
166
+ text=extracted_text,
167
+ speaker_wav=reference_audio,
168
+ language=language,
169
+ file_path=output_path
170
+ )
171
+
172
+ if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
173
+ return output_path, f"βœ… Voice-to-Voice Complete (Fallback)!\n\n🎀 Content: '{extracted_text[:150]}...'\n🎭 Applied reference voice\nπŸ“Š Language: {language}\nπŸ€– Model: {MODEL_STATUS}\nπŸ”§ Used: tts_to_file() method"
174
+ else:
175
+ return None, "❌ Generated audio file is empty!"
176
+
177
+ except Exception as fallback_error:
178
+ return None, f"❌ Generation failed:\nPrimary error: {str(gen_error)}\nFallback error: {str(fallback_error)}\n\nTip: The model doesn't have a 'generate()' method. Use 'tts()' or 'tts_to_file()' instead."
179
 
180
  except Exception as e:
181
  return None, f"❌ Voice-to-Voice Error: {str(e)}"
182
 
183
  def text_to_voice_clone(reference_audio, input_text, language="en"):
184
+ """
185
+ CORRECTED: Uses tts() method instead of generate()
186
+ """
187
  try:
188
  # Input validation
189
  if not reference_audio:
 
198
  if not load_models():
199
  return None, f"❌ Model loading failed!\nStatus: {MODEL_STATUS}"
200
 
201
+ print(f"🎭 Generating speech: '{input_text[:100]}...'")
 
202
 
203
  try:
 
 
 
204
  with patch_torch_load():
205
+ # METHOD 1: Use tts() method that returns numpy array
206
+ wav_array = TTS_MODEL.tts(
207
  text=input_text,
208
  speaker_wav=reference_audio,
209
+ language=language
 
210
  )
211
+
212
+ print(f"βœ… Generated audio array with shape: {np.array(wav_array).shape}")
213
+
214
+ # Convert numpy array to tensor and save
215
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
216
+ output_path = tmp_file.name
217
+
218
+ # Convert to tensor and save
219
+ if isinstance(wav_array, np.ndarray):
220
+ wav_tensor = torch.tensor(wav_array, dtype=torch.float32).unsqueeze(0)
221
+ else:
222
+ wav_tensor = torch.tensor(wav_array, dtype=torch.float32)
223
+ if wav_tensor.dim() == 1:
224
+ wav_tensor = wav_tensor.unsqueeze(0)
225
+
226
+ # Save with correct sample rate
227
+ sample_rate = getattr(TTS_MODEL, 'synthesizer', {}).get('output_sample_rate', 24000) or 24000
228
+ torchaudio.save(output_path, wav_tensor, sample_rate)
229
+
230
+ if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
231
+ return output_path, f"βœ… Text-to-Voice Complete!\n\nπŸ“ Generated: '{input_text[:150]}...'\n🎭 Using reference voice\nπŸ“Š Language: {language}\nπŸ€– Model: {MODEL_STATUS}\nπŸ”§ Used: tts() method (CORRECT API)"
232
+ else:
233
+ return None, "❌ Generated audio file is empty!"
234
 
235
  except Exception as gen_error:
236
+ # Fallback: Try tts_to_file method
237
+ try:
238
+ print("πŸ”„ Trying fallback method: tts_to_file()...")
239
+
240
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
241
+ output_path = tmp_file.name
242
+
243
+ with patch_torch_load():
244
+ TTS_MODEL.tts_to_file(
245
+ text=input_text,
246
+ speaker_wav=reference_audio,
247
+ language=language,
248
+ file_path=output_path
249
+ )
250
+
251
+ if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
252
+ return output_path, f"βœ… Text-to-Voice Complete (Fallback)!\n\nπŸ“ Generated: '{input_text[:150]}...'\n🎭 Using reference voice\nπŸ“Š Language: {language}\nπŸ€– Model: {MODEL_STATUS}\nπŸ”§ Used: tts_to_file() method"
253
+ else:
254
+ return None, "❌ Generated audio file is empty!"
255
+
256
+ except Exception as fallback_error:
257
+ return None, f"❌ Generation failed:\nPrimary error: {str(gen_error)}\nFallback error: {str(fallback_error)}\n\nTip: The model doesn't have a 'generate()' method. Use 'tts()' or 'tts_to_file()' instead."
258
 
259
  except Exception as e:
260
  return None, f"❌ Text-to-Voice Error: {str(e)}"
261
 
262
+ # Initialize at startup
263
  print("πŸ”„ Initializing models at startup...")
264
  try:
265
  startup_success = load_models()
 
267
  startup_msg = f"βœ… {MODEL_STATUS}!"
268
  startup_color = "#d4edda"
269
  else:
270
+ startup_msg = f"⚠️ Models will load on first use - {MODEL_STATUS}"
271
  startup_color = "#fff3cd"
272
  except Exception as e:
273
  startup_success = False
 
278
 
279
  # Create Gradio Interface
280
  with gr.Blocks(
281
+ title="🎭 Voice Cloning Studio - API Fixed",
282
  theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
283
  ) as demo:
284
 
 
286
  <div style="text-align: center; padding: 20px;">
287
  <h1 style="color: #2E86AB;">🎭 Voice Cloning Studio</h1>
288
  <p style="color: #666; font-size: 18px;">Real Voice-to-Voice & Text-to-Speech Cloning</p>
289
+ <p style="color: #888; font-size: 14px;">Fixed: Uses tts() method instead of generate() - No More API Errors!</p>
290
  </div>
291
  """)
292
 
 
311
  with gr.TabItem("🎡 Voice-to-Voice Cloning"):
312
  gr.HTML("""
313
  <div style="padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 15px;">
314
+ <h4 style="color: #1e40af;">🎀 API Fixed - Now Uses Correct Methods:</h4>
315
+ <ul style="margin: 5px 0; padding-left: 20px;">
316
+ <li>βœ… Uses <code>model.tts()</code> method (correct)</li>
317
+ <li>❌ No longer tries <code>model.generate()</code> (doesn't exist)</li>
318
+ <li>πŸ”„ Fallback to <code>model.tts_to_file()</code> if needed</li>
319
+ </ul>
320
  </div>
321
  """)
322
 
 
340
  )
341
 
342
  voice_btn = gr.Button(
343
+ "🎀 Clone Voice (API Fixed)",
344
  variant="primary",
345
  size="lg"
346
  )
 
349
  voice_output = gr.Audio(label="Cloned Voice Result")
350
  voice_status = gr.Textbox(
351
  label="Status",
352
+ lines=8,
353
  interactive=False
354
  )
355
 
 
375
  )
376
 
377
  text_btn = gr.Button(
378
+ "πŸ“ Generate Speech (API Fixed)",
379
  variant="secondary",
380
  size="lg"
381
  )
 
384
  text_output = gr.Audio(label="Generated Speech")
385
  text_status = gr.Textbox(
386
  label="Status",
387
+ lines=8,
388
  interactive=False
389
  )
390
 
391
+ # Help section
392
+ with gr.Accordion("πŸ”§ API Fix Explanation", open=False):
393
+ gr.Markdown("""
394
+ ### βœ… What Was Fixed
395
+ **The Problem:** Your code was trying to call `model.generate()` which doesn't exist on XTTS models.
396
+
397
+ **The Solution:**
398
+ - **Primary Method:** `model.tts()` - Returns numpy array that we convert and save
399
+ - **Fallback Method:** `model.tts_to_file()` - Saves directly to file
400
+ - **Removed:** All calls to `model.generate()` (doesn't exist)
401
+
402
+ ### πŸ“‹ XTTS API Reference
403
+ ```
404
+ # βœ… CORRECT - What we now use:
405
+ wav = model.tts(text=text, speaker_wav=reference_audio, language=language)
406
+
407
+ # βœ… ALTERNATIVE - Also works:
408
+ model.tts_to_file(text=text, speaker_wav=reference_audio, language=language, file_path=output)
409
+
410
+ # ❌ WRONG - What was causing the error:
411
+ model.generate() # This method doesn't exist!
412
+ ```
413
+
414
+ ### πŸš€ Expected Results
415
+ - **No More API Errors:** `'GPT2InferenceModel' object has no attribute 'generate'` is fixed
416
+ - **Working Voice Cloning:** Real audio transformation using correct XTTS methods
417
+ - **Robust Fallbacks:** If primary method fails, tries alternative approach
418
+ """)
419
+
420
  # Event handlers
421
  voice_btn.click(
422
  fn=voice_to_voice_clone,