crackuser committed on
Commit
75fb8ef
·
verified ·
1 Parent(s): 4904fc5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -274
app.py CHANGED
@@ -5,279 +5,110 @@ import tempfile
5
  import os
6
  import warnings
7
  from contextlib import contextmanager
8
- import time
9
 
10
  warnings.filterwarnings("ignore")
11
-
12
- # CRITICAL: Coqui Terms of Service
13
  os.environ["COQUI_TOS_AGREED"] = "1"
14
 
15
- print("πŸš€ Starting OPTIMIZED Voice Cloning Studio...")
16
 
17
- # PyTorch Optimizations
18
  @contextmanager
19
- def optimized_torch():
20
- """Apply PyTorch optimizations for speed"""
21
  original_load = torch.load
22
-
23
- def fast_load(f, *args, **kwargs):
24
  kwargs['weights_only'] = False
25
- kwargs['map_location'] = 'cuda' if torch.cuda.is_available() else 'cpu'
26
  return original_load(f, *args, **kwargs)
27
-
28
- torch.load = fast_load
29
-
30
- # Enable optimizations
31
- if torch.cuda.is_available():
32
- torch.backends.cudnn.benchmark = True
33
- torch.backends.cuda.matmul.allow_tf32 = True
34
- torch.backends.cudnn.allow_tf32 = True
35
-
36
  try:
37
  yield
38
  finally:
39
  torch.load = original_load
40
 
41
- # Device setup with optimization
42
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
43
- print(f"πŸš€ Using device: {DEVICE}")
44
-
45
- if DEVICE == "cuda":
46
- print(f"βœ… GPU: {torch.cuda.get_device_name()}")
47
- print(f"βœ… VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
48
- else:
49
- print("⚠ WARNING: Using CPU - expect VERY slow processing (10+ minutes)")
50
-
51
- # Global models (kept in memory for speed)
52
  TTS_MODEL = None
53
  WHISPER_MODEL = None
54
- SPEAKER_EMBEDDINGS_CACHE = {}
55
 
56
- def load_optimized_models():
57
- """Load models with speed optimizations"""
58
- global TTS_MODEL, WHISPER_MODEL
59
-
60
- if TTS_MODEL is not None and WHISPER_MODEL is not None:
61
  return True
62
-
63
- start_time = time.time()
64
- print("πŸ”„ Loading OPTIMIZED models...")
65
-
66
- # Load XTTS with optimizations
67
- if TTS_MODEL is None:
68
- try:
69
- with optimized_torch():
70
- from TTS.api import TTS
71
- print("πŸ“¦ Loading XTTS with optimizations...")
72
-
73
- TTS_MODEL = TTS(
74
- model_name="tts_models/multilingual/multi-dataset/xtts_v2",
75
- progress_bar=True,
76
- gpu=(DEVICE == "cuda")
77
- )
78
-
79
- # Apply model optimizations
80
- if DEVICE == "cuda":
81
- TTS_MODEL.synthesizer.tts_model.half() # Use FP16 for speed
82
- TTS_MODEL.synthesizer.tts_model.eval() # Evaluation mode
83
-
84
- print("βœ… XTTS loaded with optimizations!")
85
-
86
- except Exception as e:
87
- print(f"❌ XTTS loading failed: {e}")
88
- return False
89
-
90
- # Load Whisper with optimizations
91
- if WHISPER_MODEL is None:
92
- try:
93
- import whisper
94
- print("πŸ“¦ Loading optimized Whisper...")
95
- WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
96
- print("βœ… Whisper loaded!")
97
- except Exception as e:
98
- print(f"❌ Whisper failed: {e}")
99
- return False
100
-
101
- load_time = time.time() - start_time
102
- print(f"βœ… Models loaded in {load_time:.1f} seconds")
103
- return True
104
 
105
- def get_speaker_embedding(reference_audio):
106
- """Cache speaker embeddings for faster repeated use"""
107
- audio_hash = str(hash(reference_audio))
108
-
109
- if audio_hash in SPEAKER_EMBEDDINGS_CACHE:
110
- print("βœ… Using cached speaker embedding (faster!)")
111
- return SPEAKER_EMBEDDINGS_CACHE[audio_hash]
112
-
113
  try:
114
- print("🎭 Computing speaker embedding...")
115
-
116
- # Get conditioning latents for voice cloning
117
- gpt_cond_latent, speaker_embedding = TTS_MODEL.synthesizer.tts_model.get_conditioning_latents(
118
- audio_path=[reference_audio],
119
- gpt_cond_len=TTS_MODEL.synthesizer.tts_config.gpt_cond_len,
120
- max_ref_length=TTS_MODEL.synthesizer.tts_config.max_ref_len
121
- )
122
-
123
- # Cache for future use
124
- embeddings = (gpt_cond_latent, speaker_embedding)
125
- SPEAKER_EMBEDDINGS_CACHE[audio_hash] = embeddings
126
-
127
- return embeddings
128
-
129
  except Exception as e:
130
- print(f"❌ Speaker embedding failed: {e}")
131
- return None, None
132
 
133
- def fast_voice_clone(reference_audio, input_audio, language="en"):
134
- """OPTIMIZED voice cloning for faster processing"""
135
-
136
- start_total = time.time()
137
-
138
  try:
139
  if not reference_audio or not input_audio:
140
- return None, "❌ Please upload both audio files!"
141
-
142
- print("🎀 Starting OPTIMIZED Voice Cloning...")
143
-
144
- # Step 1: Load models (only once)
145
- if not load_optimized_models():
146
- return None, "❌ Model loading failed!"
147
-
148
- step1_time = time.time()
149
-
150
- # Step 2: Extract text (optimized)
151
- print("πŸ“ Extracting text with optimized Whisper...")
152
  extracted_text = "Voice cloning demonstration."
153
-
154
  if WHISPER_MODEL:
155
  try:
156
- result = WHISPER_MODEL.transcribe(
157
- input_audio,
158
- fp16=(DEVICE == "cuda"), # Use FP16 on GPU for speed
159
- language=language if language != "auto" else None
160
- )
161
  text = result.get("text", "").strip()
162
  if text and len(text) > 3:
163
- # Truncate very long text for faster processing
164
- extracted_text = text[:500] + ("..." if len(text) > 500 else "")
165
-
166
  print(f"βœ… Extracted: '{extracted_text[:100]}...'")
167
  except Exception as e:
168
- print(f"⚠ Whisper error: {e}")
169
-
170
- step2_time = time.time()
171
-
172
- # Step 3: Get speaker embeddings (cached)
173
- print("🎭 Getting speaker embeddings...")
174
- gpt_cond_latent, speaker_embedding = get_speaker_embedding(reference_audio)
175
-
176
- if gpt_cond_latent is None:
177
- return None, "❌ Speaker embedding extraction failed!"
178
-
179
- step3_time = time.time()
180
-
181
- # Step 4: Generate speech (optimized)
182
- print("🎡 Generating speech with optimizations...")
183
-
184
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
185
  output_path = tmp_file.name
186
-
187
- # Use optimized inference
188
- with optimized_torch():
189
- wav = TTS_MODEL.synthesizer.tts_model.inference(
190
  text=extracted_text,
 
191
  language=language,
192
- gpt_cond_latent=gpt_cond_latent,
193
- speaker_embedding=speaker_embedding,
194
- temperature=0.7, # Balanced quality/speed
195
- length_penalty=1.0,
196
- repetition_penalty=5.0,
197
- top_k=50,
198
- top_p=0.85,
199
- speed=1.0
200
  )
201
-
202
- # Save audio
203
- wav_tensor = torch.tensor(wav["wav"], dtype=torch.float32).unsqueeze(0)
204
- torchaudio.save(output_path, wav_tensor, 24000)
205
-
206
- step4_time = time.time()
207
-
208
- # Calculate timing breakdown
209
- total_time = step4_time - start_total
210
- transcribe_time = step2_time - step1_time
211
- embedding_time = step3_time - step2_time
212
- synthesis_time = step4_time - step3_time
213
-
214
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
215
- return output_path, f"""βœ… OPTIMIZED VOICE CLONING SUCCESS!
216
-
217
- πŸš€ *Speed Optimizations Applied:*
218
- β€’ Mixed precision (FP16) inference
219
- β€’ Cached speaker embeddings
220
- β€’ Optimized model loading
221
- β€’ GPU acceleration enabled
222
-
223
- ⏱ *Timing Breakdown:*
224
- β€’ Total time: {total_time:.1f}s (vs previous 744s!)
225
- β€’ Text extraction: {transcribe_time:.1f}s
226
- β€’ Speaker embedding: {embedding_time:.1f}s
227
- β€’ Voice synthesis: {synthesis_time:.1f}s
228
 
229
- πŸ“ *Content:* '{extracted_text[:150]}...'
230
- 🎭 *Device:* {DEVICE}
231
- πŸ”§ *Status:* Much faster processing achieved!"""
 
232
  else:
233
  return None, "❌ Generated audio file is empty!"
234
-
235
  except Exception as e:
236
- return None, f"❌ Optimized cloning error: {str(e)}"
237
 
238
- # Pre-load models at startup
239
- print("πŸ”„ Pre-loading models for faster inference...")
240
- startup_success = load_optimized_models()
241
-
242
- # Create Gradio Interface
243
- with gr.Blocks(title="πŸš€ OPTIMIZED Voice Cloning - Much Faster!") as demo:
244
-
245
  gr.HTML("""
246
  <div style="text-align: center; padding: 25px;">
247
- <h1 style="color: #2E86AB;">πŸš€ OPTIMIZED Voice Cloning Studio</h1>
248
- <p style="color: #198754; font-size: 1.2em; font-weight: bold;">⚑ SPEED OPTIMIZED - 10x+ Faster Processing!</p>
249
- <p style="color: #666;">From 744+ seconds β†’ 30-60 seconds on GPU</p>
250
- </div>
251
- """)
252
-
253
- # Speed optimization info
254
- gr.HTML(f"""
255
- <div style="padding: 20px; background: {'#d4edda' if DEVICE == 'cuda' else '#fff3cd'}; border-radius: 10px; margin-bottom: 25px;">
256
- <h4 style="color: {'#155724' if DEVICE == 'cuda' else '#856404'};">⚑ Speed Optimizations Active:</h4>
257
- <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
258
- <div>
259
- <h5>πŸ”§ Applied Optimizations:</h5>
260
- <ul>
261
- <li><strong>Device:</strong> {DEVICE.upper()}</li>
262
- <li><strong>Mixed Precision:</strong> {'βœ… FP16 Enabled' if DEVICE == 'cuda' else '❌ CPU Only'}</li>
263
- <li><strong>Model Caching:</strong> βœ… Enabled</li>
264
- <li><strong>Speaker Embeddings:</strong> βœ… Cached</li>
265
- </ul>
266
- </div>
267
- <div>
268
- <h5>⏱ Expected Processing Times:</h5>
269
- <ul>
270
- <li><strong>GPU (RTX 3060+):</strong> 20-60 seconds</li>
271
- <li><strong>GPU (GTX 1060):</strong> 60-120 seconds</li>
272
- <li><strong>CPU:</strong> 300-600 seconds</li>
273
- <li><strong>Previous:</strong> <span style="color: red;">744+ seconds</span></li>
274
- </ul>
275
- </div>
276
- </div>
277
  </div>
278
  """)
279
-
280
- # Main interface
281
  with gr.Row():
282
  with gr.Column():
283
  reference_audio = gr.Audio(
@@ -285,72 +116,36 @@ with gr.Blocks(title="πŸš€ OPTIMIZED Voice Cloning - Much Faster!") as demo:
285
  type="filepath",
286
  sources=["upload", "microphone"]
287
  )
288
-
289
  input_audio = gr.Audio(
290
  label="🎡 Input Audio (Content to Transform)",
291
  type="filepath",
292
  sources=["upload", "microphone"]
293
  )
294
-
295
  language = gr.Dropdown(
296
  choices=[
297
- ("πŸ‡ΊπŸ‡Έ English", "en"),
298
- ("πŸ‡ͺπŸ‡Έ Spanish", "es"),
299
- ("πŸ‡«πŸ‡· French", "fr"),
300
- ("πŸ‡©πŸ‡ͺ German", "de")
301
  ],
302
  value="en",
303
  label="Language"
304
  )
305
-
306
- clone_btn = gr.Button(
307
- "πŸš€ OPTIMIZED Voice Clone (Much Faster!)",
308
- variant="primary",
309
- size="lg"
310
- )
311
-
312
  with gr.Column():
313
- output_audio = gr.Audio(label="⚑ Fast Cloned Voice Result")
314
  status_output = gr.Textbox(
315
- label="Speed & Processing Status",
316
- lines=15,
317
  interactive=False
318
  )
319
-
320
- # Speed tips
321
- gr.HTML("""
322
- <div style="padding: 20px; background: #f8f9fa; border-radius: 10px; margin-top: 20px;">
323
- <h4 style="color: #495057;">πŸš€ Speed Optimization Tips:</h4>
324
- <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
325
- <div>
326
- <h5>⚑ For Faster Processing:</h5>
327
- <ul>
328
- <li>Use <strong>shorter audio clips</strong> (10-30 seconds)</li>
329
- <li>Keep <strong>text under 500 characters</strong></li>
330
- <li><strong>Reuse reference audio</strong> (embeddings cached)</li>
331
- <li>Use <strong>clear, single-speaker audio</strong></li>
332
- </ul>
333
- </div>
334
- <div>
335
- <h5>🎯 Expected Results:</h5>
336
- <ul>
337
- <li><strong>GPU:</strong> 90%+ speed improvement</li>
338
- <li><strong>CPU:</strong> 50-70% speed improvement</li>
339
- <li><strong>Quality:</strong> Same high quality output</li>
340
- <li><strong>Memory:</strong> More efficient usage</li>
341
- </ul>
342
- </div>
343
- </div>
344
- </div>
345
- """)
346
-
347
- # Event handler
348
  clone_btn.click(
349
- fn=fast_voice_clone,
350
  inputs=[reference_audio, input_audio, language],
351
  outputs=[output_audio, status_output],
352
  show_progress=True
353
  )
354
 
355
- if _name_ == "_main_":
356
- demo.launch()
 
5
  import os
6
  import warnings
7
  from contextlib import contextmanager
 
8
 
9
  warnings.filterwarnings("ignore")
 
 
10
  os.environ["COQUI_TOS_AGREED"] = "1"
11
 
12
+ print("πŸš€ Starting Voice Cloning Studio...")
13
 
 
14
  @contextmanager
15
+ def patch_torch_load():
 
16
  original_load = torch.load
17
+ def patched_load(f, *args, **kwargs):
 
18
  kwargs['weights_only'] = False
 
19
  return original_load(f, *args, **kwargs)
20
+ torch.load = patched_load
 
 
 
 
 
 
 
 
21
  try:
22
  yield
23
  finally:
24
  torch.load = original_load
25
 
 
26
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
 
 
 
27
  TTS_MODEL = None
28
  WHISPER_MODEL = None
29
+ MODEL_STATUS = "Not Loaded"
30
 
31
+ def load_xtts_manual():
32
+ global TTS_MODEL, MODEL_STATUS
33
+ if TTS_MODEL is not None:
 
 
34
  return True
35
+ try:
36
+ with patch_torch_load():
37
+ from TTS.api import TTS
38
+ print("πŸ“¦ Loading XTTS...")
39
+ TTS_MODEL = TTS(
40
+ model_name="tts_models/multilingual/multi-dataset/xtts_v2",
41
+ progress_bar=True,
42
+ gpu=(DEVICE == "cuda")
43
+ )
44
+ MODEL_STATUS = "XTTS-v2 Ready"
45
+ print("βœ… XTTS loaded!")
46
+ return True
47
+ except Exception as e:
48
+ print(f"❌ XTTS loading failed: {e}")
49
+ MODEL_STATUS = f"Manual Failed: {str(e)}"
50
+ return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
+ def load_whisper():
53
+ global WHISPER_MODEL
54
+ if WHISPER_MODEL is not None:
55
+ return True
 
 
 
 
56
  try:
57
+ import whisper
58
+ WHISPER_MODEL = whisper.load_model("base")
59
+ print("βœ… Whisper loaded!")
60
+ return True
 
 
 
 
 
 
 
 
 
 
 
61
  except Exception as e:
62
+ print(f"❌ Whisper failed: {e}")
63
+ return False
64
 
65
+ def voice_to_voice_clone(reference_audio, input_audio, language="en"):
 
 
 
 
66
  try:
67
  if not reference_audio or not input_audio:
68
+ return None, "❌ Please upload both reference and input audio files!"
69
+ if not load_xtts_manual():
70
+ return None, f"❌ XTTS loading failed!\nStatus: {MODEL_STATUS}"
71
+ load_whisper()
 
 
 
 
 
 
 
 
72
  extracted_text = "Voice cloning demonstration."
 
73
  if WHISPER_MODEL:
74
  try:
75
+ result = WHISPER_MODEL.transcribe(input_audio)
 
 
 
 
76
  text = result.get("text", "").strip()
77
  if text and len(text) > 3:
78
+ extracted_text = text
 
 
79
  print(f"βœ… Extracted: '{extracted_text[:100]}...'")
80
  except Exception as e:
81
+ print(f"⚠️ Whisper error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
83
  output_path = tmp_file.name
84
+ with patch_torch_load():
85
+ TTS_MODEL.tts_to_file(
 
 
86
  text=extracted_text,
87
+ speaker_wav=reference_audio,
88
  language=language,
89
+ file_path=output_path
 
 
 
 
 
 
 
90
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
92
+ return output_path, f"""βœ… VOICE-TO-VOICE CLONING SUCCESS!
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
+ πŸ“ Content: '{extracted_text[:150]}...'
95
+ 🎭 Device: {DEVICE}
96
+ πŸ”§ Status: {MODEL_STATUS}
97
+ """
98
  else:
99
  return None, "❌ Generated audio file is empty!"
 
100
  except Exception as e:
101
+ return None, f"❌ Voice cloning error: {str(e)}\nModel: {MODEL_STATUS}"
102
 
103
+ # Gradio Interface
104
+ with gr.Blocks(title="Voice Cloning Studio") as demo:
 
 
 
 
 
105
  gr.HTML("""
106
  <div style="text-align: center; padding: 25px;">
107
+ <h1>🎭 REAL Voice Cloning Studio</h1>
108
+ <p>Status: Models load on first use</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  </div>
110
  """)
111
+
 
112
  with gr.Row():
113
  with gr.Column():
114
  reference_audio = gr.Audio(
 
116
  type="filepath",
117
  sources=["upload", "microphone"]
118
  )
 
119
  input_audio = gr.Audio(
120
  label="🎡 Input Audio (Content to Transform)",
121
  type="filepath",
122
  sources=["upload", "microphone"]
123
  )
 
124
  language = gr.Dropdown(
125
  choices=[
126
+ ("English", "en"),
127
+ ("Spanish", "es"),
128
+ ("French", "fr"),
129
+ ("German", "de")
130
  ],
131
  value="en",
132
  label="Language"
133
  )
134
+ clone_btn = gr.Button("Clone Voice", variant="primary", size="lg")
 
 
 
 
 
 
135
  with gr.Column():
136
+ output_audio = gr.Audio(label="Cloned Voice Result")
137
  status_output = gr.Textbox(
138
+ label="Status",
139
+ lines=12,
140
  interactive=False
141
  )
142
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  clone_btn.click(
144
+ fn=voice_to_voice_clone,
145
  inputs=[reference_audio, input_audio, language],
146
  outputs=[output_audio, status_output],
147
  show_progress=True
148
  )
149
 
150
+ if __name__ == "__main__":
151
+ demo.launch()