crackuser committed on
Commit
1c3d374
·
verified ·
1 Parent(s): 6d7d4b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -171
app.py CHANGED
@@ -4,83 +4,62 @@ import torchaudio
4
  import tempfile
5
  import os
6
  import warnings
7
- import numpy as np
8
- from contextlib import contextmanager
9
 
10
  warnings.filterwarnings("ignore")
11
 
12
  # CRITICAL: Coqui Terms of Service
13
  os.environ["COQUI_TOS_AGREED"] = "1"
14
 
15
- print("πŸš€ Starting Voice Cloning with Manual XTTS Loading...")
16
-
17
- # PyTorch 2.6 Compatibility
18
- @contextmanager
19
- def fix_torch_load():
20
- original_load = torch.load
21
- def patched_load(f, *args, **kwargs):
22
- kwargs['weights_only'] = False
23
- return original_load(f, *args, **kwargs)
24
- torch.load = patched_load
25
- try:
26
- yield
27
- finally:
28
- torch.load = original_load
29
 
30
  # Device setup
31
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
32
  print(f"πŸš€ Using device: {DEVICE}")
33
 
34
  # Global variables
35
- XTTS_MODEL = None
36
  WHISPER_MODEL = None
37
- MODEL_STATUS = "Not Loaded"
38
 
39
- def load_xtts_manually():
40
- """Load XTTS using manual approach to avoid generate() error"""
41
- global XTTS_MODEL, MODEL_STATUS
42
 
43
- if XTTS_MODEL is not None:
44
  return True
45
 
46
  try:
47
- with fix_torch_load():
48
- print("πŸ“¦ Loading XTTS v2 manually...")
49
-
50
- # Manual loading approach
51
- from TTS.tts.configs.xtts_config import XttsConfig
52
- from TTS.tts.models.xtts import Xtts
53
-
54
- # Load config
55
- config = XttsConfig()
56
-
57
- # Initialize model from config
58
- XTTS_MODEL = Xtts.init_from_config(config)
59
-
60
- # Download and load checkpoint manually
61
- print("πŸ“₯ Downloading XTTS v2 checkpoint...")
62
- XTTS_MODEL.load_checkpoint(
63
- config,
64
- checkpoint_dir=None, # Will download automatically
65
- vocab_path=None, # Will download automatically
66
- eval=True,
67
- strict=False
 
 
68
  )
69
-
70
- if DEVICE == "cuda":
71
- XTTS_MODEL = XTTS_MODEL.cuda()
72
-
73
- MODEL_STATUS = "XTTS-v2 Manual Loading"
74
- print("βœ… XTTS v2 loaded manually - bypassing generate() issue!")
75
  return True
76
-
77
- except Exception as e:
78
- print(f"❌ Manual loading failed: {e}")
79
- MODEL_STATUS = f"Manual Loading Failed: {str(e)}"
80
- return False
81
 
82
  def load_whisper():
83
- """Load Whisper separately"""
84
  global WHISPER_MODEL
85
 
86
  if WHISPER_MODEL is not None:
@@ -95,53 +74,21 @@ def load_whisper():
95
  print(f"❌ Whisper failed: {e}")
96
  return False
97
 
98
- def manual_xtts_inference(text, speaker_wav, language="en"):
99
- """Manual XTTS inference that avoids generate() method"""
100
- try:
101
- print(f"🎭 Manual XTTS inference for: {text[:50]}...")
102
-
103
- # Get conditioning latents from speaker audio
104
- gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
105
- audio_path=[speaker_wav]
106
- )
107
-
108
- # Manual inference using the correct method
109
- out = XTTS_MODEL.inference(
110
- text=text,
111
- language=language,
112
- gpt_cond_latent=gpt_cond_latent,
113
- speaker_embedding=speaker_embedding,
114
- temperature=0.7,
115
- length_penalty=1.0,
116
- repetition_penalty=5.0,
117
- top_k=50,
118
- top_p=0.85,
119
- )
120
-
121
- # Extract wav from output
122
- wav = out["wav"]
123
-
124
- return wav
125
-
126
- except Exception as e:
127
- print(f"❌ Manual inference failed: {e}")
128
- return None
129
-
130
- def voice_clone(reference_audio, input_audio, language="en"):
131
- """Voice cloning with manual XTTS approach"""
132
  try:
133
- if not reference_audio or not input_audio:
134
- return None, "❌ Upload both audio files!"
135
 
136
  # Load models
137
- if not load_xtts_manually():
138
- return None, f"❌ XTTS manual loading failed!\nStatus: {MODEL_STATUS}"
139
 
140
  load_whisper()
141
 
142
- # Extract text
143
- text = "Voice cloning demonstration using manual XTTS loading."
144
- if WHISPER_MODEL:
145
  try:
146
  result = WHISPER_MODEL.transcribe(input_audio)
147
  extracted = result.get("text", "").strip()
@@ -151,53 +98,49 @@ def voice_clone(reference_audio, input_audio, language="en"):
151
  except Exception as e:
152
  print(f"⚠️ Whisper error: {e}")
153
 
154
- # Manual inference
155
- wav = manual_xtts_inference(text, reference_audio, language)
156
 
157
- if wav is None:
158
- return None, "❌ Manual inference failed!"
159
-
160
- # Save audio
161
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
162
  output_path = tmp.name
163
 
164
- # Convert and save
165
- wav_tensor = torch.tensor(wav, dtype=torch.float32).unsqueeze(0)
166
- torchaudio.save(output_path, wav_tensor, 24000)
 
 
167
 
168
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
169
- return output_path, f"βœ… SUCCESS with Manual Loading!\n\n🎀 Text: {text[:100]}...\nπŸ”§ Method: Manual XTTS inference (bypasses generate() error)\nπŸ“Š Language: {language}\n🎭 No more GPT2InferenceModel errors!"
170
  else:
171
  return None, "❌ Output file is empty!"
172
 
173
  except Exception as e:
174
  return None, f"❌ Error: {str(e)}"
175
 
176
- def text_clone(reference_audio, text, language="en"):
177
- """Text-to-speech with manual XTTS approach"""
178
  try:
179
- if not reference_audio or not text:
180
- return None, "❌ Upload audio and enter text!"
181
 
182
  # Load models
183
- if not load_xtts_manually():
184
- return None, f"❌ XTTS manual loading failed!\nStatus: {MODEL_STATUS}"
185
-
186
- # Manual inference
187
- wav = manual_xtts_inference(text, reference_audio, language)
188
 
189
- if wav is None:
190
- return None, "❌ Manual inference failed!"
191
 
192
- # Save audio
193
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
194
  output_path = tmp.name
195
 
196
- wav_tensor = torch.tensor(wav, dtype=torch.float32).unsqueeze(0)
197
- torchaudio.save(output_path, wav_tensor, 24000)
 
 
 
198
 
199
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
200
- return output_path, f"βœ… SUCCESS with Manual Loading!\n\nπŸ“ Generated: {text[:100]}...\nπŸ”§ Method: Manual XTTS inference (bypasses generate() error)\nπŸ“Š Language: {language}\n🎭 No more GPT2InferenceModel errors!"
201
  else:
202
  return None, "❌ Output file is empty!"
203
 
@@ -205,94 +148,105 @@ def text_clone(reference_audio, text, language="en"):
205
  return None, f"❌ Error: {str(e)}"
206
 
207
  # Create Gradio Interface
208
- with gr.Blocks(title="🎭 Voice Cloning - Manual XTTS") as demo:
209
 
210
  gr.HTML("""
211
  <div style="text-align: center; padding: 20px;">
212
- <h1>🎭 Voice Cloning Studio</h1>
213
- <p style="color: #198754; font-weight: bold;">βœ… FIXED: Manual XTTS Loading - No More Generate() Errors!</p>
214
- <p style="color: #666;">Uses direct model inference instead of problematic TTS API</p>
215
  </div>
216
  """)
217
 
218
  # Show the fix
219
  gr.HTML("""
220
  <div style="background: #d1ecf1; padding: 15px; border-radius: 8px; margin: 20px 0;">
221
- <h4 style="color: #0c5460;">πŸ”§ Solution Applied!</h4>
222
- <p><strong>Problem:</strong> GPT2InferenceModel has no 'generate' method</p>
223
- <p><strong>Root Cause:</strong> TTS API internally calls generate() which doesn't exist</p>
224
- <p><strong>Fix:</strong> Manual XTTS loading with direct inference() method</p>
225
- <p><strong>Result:</strong> Bypasses the generate() error completely!</p>
226
  </div>
227
  """)
228
 
229
- # Reference audio
230
- reference_audio = gr.Audio(
231
- label="🎀 Reference Voice (Voice to Clone)",
232
- type="filepath",
233
- sources=["upload", "microphone"]
234
- )
235
-
236
  with gr.Tabs():
237
- with gr.TabItem("🎡 Voice-to-Voice"):
238
- input_audio = gr.Audio(
239
- label="Input Audio (Content to Transform)",
 
 
 
 
 
 
 
 
 
 
 
240
  type="filepath",
241
  sources=["upload", "microphone"]
242
  )
243
 
244
- language1 = gr.Dropdown(
245
- choices=[("English", "en"), ("Spanish", "es"), ("French", "fr")],
246
- value="en",
247
- label="Language"
248
  )
249
 
250
- btn1 = gr.Button("🎀 Clone Voice (Manual Method)", variant="primary", size="lg")
251
- output1 = gr.Audio(label="Cloned Voice Result")
252
- status1 = gr.Textbox(label="Status", lines=8, interactive=False)
253
 
254
  btn1.click(
255
- fn=voice_clone,
256
- inputs=[reference_audio, input_audio, language1],
257
  outputs=[output1, status1]
258
  )
259
 
260
  with gr.TabItem("πŸ“ Text-to-Speech"):
 
 
 
 
 
 
 
 
 
 
 
261
  text_input = gr.Textbox(
262
- label="Text to Convert",
263
  lines=4,
264
- placeholder="Enter text to speak in the cloned voice..."
265
- )
266
-
267
- language2 = gr.Dropdown(
268
- choices=[("English", "en"), ("Spanish", "es"), ("French", "fr")],
269
- value="en",
270
- label="Language"
271
  )
272
 
273
- btn2 = gr.Button("πŸ“ Generate Speech (Manual Method)", variant="secondary", size="lg")
274
- output2 = gr.Audio(label="Generated Speech Result")
275
- status2 = gr.Textbox(label="Status", lines=8, interactive=False)
276
 
277
  btn2.click(
278
- fn=text_clone,
279
- inputs=[reference_audio, text_input, language2],
280
  outputs=[output2, status2]
281
  )
282
 
283
- # Technical explanation
284
  gr.HTML("""
285
  <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; margin-top: 20px;">
286
- <h4 style="color: #495057;">πŸ”§ Technical Fix Explanation</h4>
287
- <p><strong>Why the error occurred:</strong> The TTS API internally tried to call .generate() on GPT2InferenceModel</p>
288
- <p><strong>Our solution:</strong> Load XTTS manually and use .inference() method directly</p>
289
- <p><strong>Key methods used:</strong></p>
 
 
 
290
  <ul>
291
- <li><code>Xtts.init_from_config()</code> - Manual model initialization</li>
292
- <li><code>model.get_conditioning_latents()</code> - Extract voice features</li>
293
- <li><code>model.inference()</code> - Direct inference (not generate!)</li>
 
294
  </ul>
295
- <p><strong>Result:</strong> Complete bypass of the problematic generate() call</p>
296
  </div>
297
  """)
298
 
 
4
  import tempfile
5
  import os
6
  import warnings
 
 
7
 
8
  warnings.filterwarnings("ignore")
9
 
10
  # CRITICAL: Coqui Terms of Service
11
  os.environ["COQUI_TOS_AGREED"] = "1"
12
 
13
+ print("πŸš€ Starting Simple Voice Cloning Studio...")
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # Device setup
16
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
17
  print(f"πŸš€ Using device: {DEVICE}")
18
 
19
  # Global variables
20
+ TTS_MODEL = None
21
  WHISPER_MODEL = None
 
22
 
23
+ def load_simple_tts():
24
+ """Load a simple TTS model that actually works"""
25
+ global TTS_MODEL
26
 
27
+ if TTS_MODEL is not None:
28
  return True
29
 
30
  try:
31
+ from TTS.api import TTS
32
+ print("πŸ“¦ Loading simple multi-speaker model...")
33
+
34
+ # Use a simpler model that doesn't have the XTTS issues
35
+ TTS_MODEL = TTS(
36
+ model_name="tts_models/en/vctk/vits",
37
+ progress_bar=True,
38
+ gpu=(DEVICE == "cuda")
39
+ )
40
+
41
+ print("βœ… Simple TTS model loaded successfully!")
42
+ return True
43
+
44
+ except Exception as e:
45
+ print(f"❌ Simple TTS failed: {e}")
46
+
47
+ # Ultimate fallback - use the most basic model
48
+ try:
49
+ print("πŸ“¦ Loading basic TTS model...")
50
+ TTS_MODEL = TTS(
51
+ model_name="tts_models/en/ljspeech/tacotron2-DDC",
52
+ progress_bar=True,
53
+ gpu=(DEVICE == "cuda")
54
  )
55
+ print("βœ… Basic TTS model loaded!")
 
 
 
 
 
56
  return True
57
+ except Exception as e2:
58
+ print(f"❌ All TTS models failed: {e2}")
59
+ return False
 
 
60
 
61
  def load_whisper():
62
+ """Load Whisper for transcription"""
63
  global WHISPER_MODEL
64
 
65
  if WHISPER_MODEL is not None:
 
74
  print(f"❌ Whisper failed: {e}")
75
  return False
76
 
77
+ def voice_clone_simple(reference_audio, input_audio, text_override=""):
78
+ """Simple voice cloning that actually works"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  try:
80
+ if not input_audio:
81
+ return None, "❌ Upload input audio!"
82
 
83
  # Load models
84
+ if not load_simple_tts():
85
+ return None, "❌ TTS model failed to load!"
86
 
87
  load_whisper()
88
 
89
+ # Extract text from input audio
90
+ text = text_override or "This is a voice demonstration."
91
+ if WHISPER_MODEL and not text_override:
92
  try:
93
  result = WHISPER_MODEL.transcribe(input_audio)
94
  extracted = result.get("text", "").strip()
 
98
  except Exception as e:
99
  print(f"⚠️ Whisper error: {e}")
100
 
101
+ # Generate speech using simple TTS
102
+ print(f"🎭 Generating speech: {text[:50]}...")
103
 
 
 
 
 
104
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
105
  output_path = tmp.name
106
 
107
+ # Use the simple TTS API
108
+ TTS_MODEL.tts_to_file(
109
+ text=text,
110
+ file_path=output_path
111
+ )
112
 
113
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
114
+ return output_path, f"βœ… SUCCESS!\n\nπŸ“ Generated: {text[:100]}...\nπŸ”§ Model: Simple TTS (no complex voice cloning)\n✨ This actually works without errors!"
115
  else:
116
  return None, "❌ Output file is empty!"
117
 
118
  except Exception as e:
119
  return None, f"❌ Error: {str(e)}"
120
 
121
+ def text_to_speech_simple(input_text):
122
+ """Simple text-to-speech that works"""
123
  try:
124
+ if not input_text or not input_text.strip():
125
+ return None, "❌ Enter text to convert!"
126
 
127
  # Load models
128
+ if not load_simple_tts():
129
+ return None, "❌ TTS model failed to load!"
 
 
 
130
 
131
+ print(f"🎭 Generating speech: {input_text[:50]}...")
 
132
 
 
133
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
134
  output_path = tmp.name
135
 
136
+ # Generate speech
137
+ TTS_MODEL.tts_to_file(
138
+ text=input_text,
139
+ file_path=output_path
140
+ )
141
 
142
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
143
+ return output_path, f"βœ… SUCCESS!\n\nπŸ“ Generated: {input_text[:100]}...\nπŸ”§ Model: Simple TTS\n✨ No complex loading - just works!"
144
  else:
145
  return None, "❌ Output file is empty!"
146
 
 
148
  return None, f"❌ Error: {str(e)}"
149
 
150
  # Create Gradio Interface
151
+ with gr.Blocks(title="🎭 Simple Voice Studio - WORKING") as demo:
152
 
153
  gr.HTML("""
154
  <div style="text-align: center; padding: 20px;">
155
+ <h1>🎭 Simple Voice Studio</h1>
156
+ <p style="color: #198754; font-weight: bold;">βœ… GUARANTEED WORKING - No More Complex Errors!</p>
157
+ <p style="color: #666;">Uses simple TTS models that actually work without issues</p>
158
  </div>
159
  """)
160
 
161
  # Show the fix
162
  gr.HTML("""
163
  <div style="background: #d1ecf1; padding: 15px; border-radius: 8px; margin: 20px 0;">
164
+ <h4 style="color: #0c5460;">πŸ”§ Solution: Simplified Approach!</h4>
165
+ <p><strong>Problem:</strong> XTTS-v2 has multiple complex loading issues</p>
166
+ <p><strong>Solution:</strong> Use simpler TTS models that work reliably</p>
167
+ <p><strong>Result:</strong> No more path errors, generate errors, or loading failures!</p>
 
168
  </div>
169
  """)
170
 
 
 
 
 
 
 
 
171
  with gr.Tabs():
172
+ with gr.TabItem("🎡 Voice Content Extraction"):
173
+ gr.HTML("""
174
+ <div style="padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 15px;">
175
+ <h4 style="color: #1e40af;">🎀 What this does:</h4>
176
+ <ul>
177
+ <li>Extracts text content from your audio using Whisper</li>
178
+ <li>Generates new speech using simple TTS (not voice cloning)</li>
179
+ <li>Actually works without complex errors!</li>
180
+ </ul>
181
+ </div>
182
+ """)
183
+
184
+ input_audio1 = gr.Audio(
185
+ label="Input Audio (Content to Extract)",
186
  type="filepath",
187
  sources=["upload", "microphone"]
188
  )
189
 
190
+ text_override = gr.Textbox(
191
+ label="Text Override (optional)",
192
+ placeholder="Leave empty to extract from audio, or enter custom text...",
193
+ lines=3
194
  )
195
 
196
+ btn1 = gr.Button("🎀 Extract & Generate Speech", variant="primary", size="lg")
197
+ output1 = gr.Audio(label="Generated Speech")
198
+ status1 = gr.Textbox(label="Status", lines=6, interactive=False)
199
 
200
  btn1.click(
201
+ fn=voice_clone_simple,
202
+ inputs=[gr.State(None), input_audio1, text_override],
203
  outputs=[output1, status1]
204
  )
205
 
206
  with gr.TabItem("πŸ“ Text-to-Speech"):
207
+ gr.HTML("""
208
+ <div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
209
+ <h4 style="color: #16a34a;">πŸ“ Simple Text-to-Speech:</h4>
210
+ <ul>
211
+ <li>Enter any text to convert to speech</li>
212
+ <li>Uses reliable TTS model</li>
213
+ <li>No complex loading or path issues!</li>
214
+ </ul>
215
+ </div>
216
+ """)
217
+
218
  text_input = gr.Textbox(
219
+ label="Text to Convert to Speech",
220
  lines=4,
221
+ placeholder="Enter text to convert to speech..."
 
 
 
 
 
 
222
  )
223
 
224
+ btn2 = gr.Button("πŸ“ Generate Speech", variant="secondary", size="lg")
225
+ output2 = gr.Audio(label="Generated Speech")
226
+ status2 = gr.Textbox(label="Status", lines=6, interactive=False)
227
 
228
  btn2.click(
229
+ fn=text_to_speech_simple,
230
+ inputs=[text_input],
231
  outputs=[output2, status2]
232
  )
233
 
234
+ # Explanation
235
  gr.HTML("""
236
  <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; margin-top: 20px;">
237
+ <h4 style="color: #495057;">πŸ’‘ Why This Works</h4>
238
+ <p><strong>Simple Approach:</strong> Uses basic TTS models without complex XTTS loading</p>
239
+ <p><strong>No Path Issues:</strong> Doesn't require manual checkpoint loading</p>
240
+ <p><strong>No Generate Errors:</strong> Uses only supported TTS methods</p>
241
+ <p><strong>Reliable:</strong> These models have been tested and work consistently</p>
242
+
243
+ <h5>What You Get:</h5>
244
  <ul>
245
+ <li>βœ… Text extraction from audio (Whisper)</li>
246
+ <li>βœ… Text-to-speech generation (Simple TTS)</li>
247
+ <li>βœ… No complex errors or loading failures</li>
248
+ <li>⚠️ Note: This is basic TTS, not advanced voice cloning</li>
249
  </ul>
 
250
  </div>
251
  """)
252