crackuser committed on
Commit
3ad5343
·
verified ·
1 Parent(s): 71d678c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +198 -69
app.py CHANGED
@@ -5,10 +5,13 @@ import tempfile
5
  import os
6
  import warnings
7
  from contextlib import contextmanager
 
 
 
8
 
9
  warnings.filterwarnings("ignore")
10
  os.environ["COQUI_TOS_AGREED"] = "1"
11
- print("πŸš€ Starting Voice Cloning Studio...")
12
 
13
  @contextmanager
14
  def patch_torch_load():
@@ -22,150 +25,276 @@ def patch_torch_load():
22
  finally:
23
  torch.load = original_load
24
 
 
25
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
 
 
 
26
  TTS_MODEL = None
27
  WHISPER_MODEL = None
28
  MODEL_STATUS = "Not Loaded"
 
29
 
30
- def load_xtts_manual():
 
31
  global TTS_MODEL, MODEL_STATUS
32
  if TTS_MODEL is not None:
33
  return True
34
  try:
35
  with patch_torch_load():
36
  from TTS.api import TTS
37
- print("πŸ“¦ Loading XTTS...")
 
38
  TTS_MODEL = TTS(
39
  model_name="tts_models/multilingual/multi-dataset/xtts_v2",
40
- progress_bar=True,
41
  gpu=(DEVICE == "cuda")
42
  )
43
- MODEL_STATUS = "XTTS-v2 Ready"
44
- print("βœ… XTTS loaded!")
 
 
 
 
 
 
 
45
  return True
46
  except Exception as e:
47
  print(f"❌ XTTS loading failed: {e}")
48
- MODEL_STATUS = f"Manual Failed: {str(e)}"
49
  return False
50
 
51
- def load_whisper():
 
52
  global WHISPER_MODEL
53
  if WHISPER_MODEL is not None:
54
  return True
55
  try:
56
  import whisper
57
- WHISPER_MODEL = whisper.load_model("base")
58
- print("βœ… Whisper loaded!")
 
59
  return True
60
  except Exception as e:
61
  print(f"❌ Whisper failed: {e}")
62
  return False
63
 
64
- def voice_to_voice_clone(reference_audio, input_audio, language="en"):
65
- """
66
- Main voice cloning function - this will be called by both UI and API
67
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  try:
69
- print(f"🎭 Voice cloning request: {language}")
70
- print(f"πŸ“ Reference: {reference_audio}")
71
- print(f"πŸ“ Input: {input_audio}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  if not reference_audio or not input_audio:
74
- return None, "❌ Please upload both reference and input audio files!"
75
 
76
- # Load XTTS model
77
- if not load_xtts_manual():
78
- return None, f"❌ XTTS loading failed!\nStatus: {MODEL_STATUS}"
 
79
 
80
- # Load Whisper for transcription
81
- load_whisper()
 
 
 
 
 
 
82
 
83
- # Extract text from input audio
84
- extracted_text = "Voice cloning demonstration."
85
  if WHISPER_MODEL:
86
  try:
87
- result = WHISPER_MODEL.transcribe(input_audio)
88
- text = result.get("text", "").strip()
89
- if text and len(text) > 3:
 
 
 
 
 
 
90
  extracted_text = text
91
- print(f"βœ… Extracted: '{extracted_text[:100]}...'")
92
  except Exception as e:
93
- print(f"⚠️ Whisper error: {e}")
94
 
95
- # Generate cloned voice
 
 
 
96
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
97
  output_path = tmp_file.name
98
 
99
- print(f"πŸ”„ Generating voice clone...")
100
- with patch_torch_load():
101
- TTS_MODEL.tts_to_file(
102
- text=extracted_text,
103
- speaker_wav=reference_audio,
104
- language=language,
105
- file_path=output_path
106
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  # Verify output
109
- if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
110
- success_message = f"""βœ… VOICE-TO-VOICE CLONING SUCCESS!
111
- πŸ“ Content: '{extracted_text[:150]}...'
112
- 🎭 Device: {DEVICE}
113
  πŸ”§ Status: {MODEL_STATUS}
114
- πŸ“Š Output size: {os.path.getsize(output_path)} bytes
115
- """
116
- print("βœ… Voice cloning completed successfully!")
117
- return output_path, success_message
 
118
  else:
119
- return None, "❌ Generated audio file is empty!"
120
 
121
  except Exception as e:
122
- error_msg = f"❌ Voice cloning error: {str(e)}\nModel: {MODEL_STATUS}"
123
  print(error_msg)
124
  return None, error_msg
125
 
126
- # FIXED: Use gr.Interface instead of gr.Blocks for proper API exposure
127
  interface = gr.Interface(
128
- fn=voice_to_voice_clone,
129
  inputs=[
130
  gr.Audio(
131
- label="🎀 Reference Audio (Voice to Clone)",
132
  type="filepath",
133
  sources=["upload"]
134
  ),
135
  gr.Audio(
136
- label="🎡 Input Audio (Content to Transform)",
137
  type="filepath",
138
  sources=["upload"]
139
  ),
140
  gr.Dropdown(
141
- choices=[
142
- "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl",
143
- "cs", "ar", "zh", "ja", "ko", "hi", "uk", "vi", "ro", "el",
144
- "he", "fi", "hu", "sv", "ca", "id", "ms", "bg", "sk", "da",
145
- "no", "lt", "hr", "sr", "sl", "et", "lv", "fil", "bn", "ta",
146
- "te", "ur", "fa", "th"
147
- ],
148
  value="en",
149
  label="🌍 Language"
150
  )
151
  ],
152
  outputs=[
153
- gr.Audio(label="πŸŽ‰ Cloned Voice Result"),
154
- gr.Textbox(label="πŸ“‹ Status", lines=8)
155
  ],
156
- title="🎭 REAL Voice Cloning Studio",
157
- description="Transform any voice into any other voice using XTTS-v2 and Whisper AI models. Upload reference audio and input audio to get started.",
158
  theme=gr.themes.Soft(),
159
  allow_flagging="never",
160
- api_name="voice_to_voice_clone" # CRITICAL: This creates the API endpoint
161
  )
162
 
163
  if __name__ == "__main__":
164
- print("🌐 Launching Voice Cloning Studio...")
165
- interface.launch(
 
 
 
 
166
  server_name="0.0.0.0",
167
  server_port=7860,
168
  share=False,
169
- show_api=True, # Shows API documentation
170
- debug=True
 
171
  )
 
5
  import os
6
  import warnings
7
  from contextlib import contextmanager
8
+ import asyncio
9
+ from concurrent.futures import ThreadPoolExecutor
10
+ import gc
11
 
12
  warnings.filterwarnings("ignore")
13
  os.environ["COQUI_TOS_AGREED"] = "1"
14
+ print("πŸš€ Starting OPTIMIZED Voice Cloning Studio...")
15
 
16
  @contextmanager
17
  def patch_torch_load():
 
25
  finally:
26
  torch.load = original_load
27
 
# OPTIMIZATION 1: Hardware detection and setup.
# Use the GPU whenever PyTorch reports one; the rest of the module keys off
# DEVICE to decide on CUDA-only settings.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"🔥 Device: {DEVICE}")
if DEVICE == "cuda":
    torch.backends.cudnn.benchmark = True  # Optimize for consistent input sizes
    torch.backends.cuda.matmul.allow_tf32 = True  # Enable TF32 for faster computation
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# Lazily populated by the loader functions; None means "not loaded yet".
TTS_MODEL = None
WHISPER_MODEL = None
# Human-readable load state, surfaced in the status text shown to the user.
MODEL_STATUS = "Not Loaded"
# OPTIMIZATION 2: Cache of speaker conditioning data, keyed by a fingerprint
# of the reference audio file.
SPEAKER_EMBEDDINGS_CACHE = {}
43
 
def load_xtts_optimized(use_fp16=False):
    """Load the XTTS-v2 model once and publish it in ``TTS_MODEL``.

    The model is built in a local variable and only stored in the global
    after every setup step succeeded. A failure therefore leaves
    ``TTS_MODEL`` as ``None`` instead of leaving a half-configured model
    behind that the next call would mistake for a ready one.

    Args:
        use_fp16: When true and running on CUDA, convert the underlying
            network to half precision. Off by default.
            NOTE(review): XTTS inference in FP16 is unverified here;
            confirm it works end to end before enabling.

    Returns:
        True if the model is available (already loaded, or loaded by this
        call); False if loading failed, in which case the reason is stored
        in ``MODEL_STATUS``.
    """
    global TTS_MODEL, MODEL_STATUS
    if TTS_MODEL is not None:
        return True
    try:
        with patch_torch_load():
            from TTS.api import TTS
            print("📦 Loading XTTS with optimizations...")

            # `gpu=True` is what asks the wrapper to place the model on the
            # GPU, so no separate `.cuda()` call is needed afterwards.
            model = TTS(
                model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                progress_bar=False,  # Disable progress bar for speed
                gpu=(DEVICE == "cuda"),
            )

            # OPTIMIZATION 3: Model optimizations
            if use_fp16 and DEVICE == "cuda":
                # `model.tts` is the synthesis *method* of the wrapper, not
                # the network; the network lives on the synthesizer.
                model.synthesizer.tts_model.half()

            # Publish only now that setup is complete.
            TTS_MODEL = model
            MODEL_STATUS = "XTTS-v2 Optimized"
            print("✅ XTTS loaded with optimizations!")
            return True
    except Exception as e:
        print(f"❌ XTTS loading failed: {e}")
        MODEL_STATUS = f"Failed: {str(e)}"
        return False
73
 
def load_whisper_optimized():
    """Load the Whisper "base" model onto ``DEVICE`` once and cache it.

    The loaded model is stored in the module-level ``WHISPER_MODEL``.
    Loading is best effort: any import or load error is printed and
    reported through the return value rather than raised.

    Returns:
        True if ``WHISPER_MODEL`` is available, False if loading failed.
    """
    global WHISPER_MODEL
    if WHISPER_MODEL is not None:
        return True
    try:
        import whisper
        # Use smaller, faster model for transcription
        WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
        print("✅ Whisper loaded (base model for speed)!")
        return True
    except Exception as e:
        print(f"❌ Whisper failed: {e}")
        return False
88
 
def get_cached_speaker_embeddings(reference_audio):
    """OPTIMIZATION 4: Return conditioning latents for a reference clip, cached.

    The cache key is a SHA-256 digest of the file's bytes. Keying on the
    content (rather than on size + modification time) matters because the
    caller passes the output of ``optimize_audio_input``, which re-writes
    the file on every request: its mtime changes each time, so an
    mtime-based key would never produce a cache hit.

    Args:
        reference_audio: Path to the reference audio file.

    Returns:
        ``(gpt_cond_latent, speaker_embedding)`` on success, or
        ``(None, None)`` if anything fails (the error is printed), so the
        caller can fall back to passing the audio file directly.
    """
    import hashlib

    try:
        # Fingerprint the audio content in chunks to keep memory flat.
        digest = hashlib.sha256()
        with open(reference_audio, "rb") as audio_file:
            for chunk in iter(lambda: audio_file.read(1 << 20), b""):
                digest.update(chunk)
        cache_key = digest.hexdigest()

        if cache_key in SPEAKER_EMBEDDINGS_CACHE:
            print("🚀 Using cached speaker embeddings!")
            return SPEAKER_EMBEDDINGS_CACHE[cache_key]

        # Compute new embeddings
        print("🔄 Computing speaker embeddings...")
        # NOTE(review): on Coqui's `TTS` wrapper, `.tts` looks like the
        # synthesis method rather than the XTTS network - confirm. If so,
        # this lookup raises and the function returns (None, None); the
        # network is presumably reachable via `synthesizer.tts_model`.
        gpt_cond_latent, speaker_embedding = TTS_MODEL.tts.get_conditioning_latents(
            audio_path=reference_audio,
            gpt_cond_len=6,  # Reduced from 30 for speed
            max_ref_length=10,  # Reduced from 60 for speed
        )

        # Cache the results
        SPEAKER_EMBEDDINGS_CACHE[cache_key] = (gpt_cond_latent, speaker_embedding)
        print("✅ Speaker embeddings cached!")

        # Limit cache size: dicts keep insertion order, so the first key is
        # the oldest entry.
        if len(SPEAKER_EMBEDDINGS_CACHE) > 10:
            oldest_key = next(iter(SPEAKER_EMBEDDINGS_CACHE))
            del SPEAKER_EMBEDDINGS_CACHE[oldest_key]

        return gpt_cond_latent, speaker_embedding

    except Exception as e:
        print(f"⚠️ Embedding cache failed: {e}")
        return None, None
122
+
def optimize_audio_input(audio_path, max_duration=10):
    """OPTIMIZATION 5: Resample a clip to 22050 Hz and cap its length.

    The processed audio is written next to the input as ``<stem>_opt.wav``.
    The name is derived from the path's stem, so inputs that do not end in
    ``.wav`` (or whose directory names contain ``.wav``) still get a
    distinct output file and the original is never overwritten.

    Args:
        audio_path: Path to the source audio file.
        max_duration: Maximum number of seconds to keep from the start of
            the clip.

    Returns:
        Path of the processed copy, or ``audio_path`` unchanged if
        processing failed for any reason (the error is printed).
    """
    try:
        import librosa
        import soundfile as sf

        audio, sr = librosa.load(audio_path, sr=22050)  # Standard rate for XTTS

        # Limit duration for speed
        max_samples = int(max_duration * sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f"🔄 Audio trimmed to {max_duration}s for speed")

        # Save optimized audio under a name that can never equal the input.
        stem, _extension = os.path.splitext(audio_path)
        optimized_path = f"{stem}_opt.wav"
        sf.write(optimized_path, audio, sr)
        return optimized_path

    except Exception as e:
        print(f"⚠️ Audio optimization failed: {e}")
        return audio_path
144
+
def _discard_file(path):
    """Delete ``path`` if it exists; a failed cleanup is never fatal."""
    try:
        os.remove(path)
    except OSError:
        pass


def voice_to_voice_clone_optimized(reference_audio, input_audio, language="en"):
    """OPTIMIZED voice cloning: re-speak ``input_audio`` in the reference voice.

    Steps: load the models, trim/resample both clips in parallel,
    transcribe the input clip with Whisper (a default sentence is used when
    Whisper is unavailable or the transcript is too short), then synthesize
    the text with XTTS into a temporary WAV file.

    Args:
        reference_audio: Path to the audio of the voice to clone.
        input_audio: Path to the audio whose spoken content is reused.
        language: Language code chosen in the UI. ``"zh"`` and ``"zh-cn"``
            are accepted interchangeably and translated to the code each
            model expects.

    Returns:
        ``(output_path, status_message)`` on success, or
        ``(None, error_message)`` on any failure. Exceptions are caught and
        reported through the message.
    """
    import time

    started = time.perf_counter()
    intermediates = []  # "_opt.wav" copies to delete once we are done
    output_path = None
    succeeded = False
    try:
        print(f"🎭 OPTIMIZED Voice cloning: {language}")

        if not reference_audio or not input_audio:
            return None, "❌ Upload both audio files!"

        # Load models
        if not load_xtts_optimized():
            return None, f"❌ XTTS failed: {MODEL_STATUS}"
        load_whisper_optimized()

        # OPTIMIZATION 6: Parallel processing where possible
        with ThreadPoolExecutor(max_workers=2) as executor:
            # Optimize input audios in parallel
            future_ref = executor.submit(optimize_audio_input, reference_audio)
            future_input = executor.submit(optimize_audio_input, input_audio)

            ref_optimized = future_ref.result()
            input_optimized = future_input.result()

        # optimize_audio_input returns the original path when it fails;
        # only files it actually created may be deleted afterwards.
        if ref_optimized != reference_audio:
            intermediates.append(ref_optimized)
        if input_optimized != input_audio:
            intermediates.append(input_optimized)

        # Whisper names Chinese "zh" while XTTS names it "zh-cn".
        whisper_language = "zh" if language == "zh-cn" else language
        xtts_language = "zh-cn" if language == "zh" else language

        # OPTIMIZATION 7: Fast transcription with limits
        extracted_text = "Voice cloning demo text."
        if WHISPER_MODEL:
            try:
                with torch.no_grad():
                    result = WHISPER_MODEL.transcribe(
                        input_optimized,
                        fp16=(DEVICE == "cuda"),  # Use FP16 if available
                        language=whisper_language if whisper_language != 'auto' else None,
                    )
                text = result.get("text", "").strip()[:200]  # Limit text length
                if len(text) > 10:
                    extracted_text = text
                    print(f"✅ Fast transcription: '{extracted_text[:50]}...'")
            except Exception as e:
                # Best effort: keep the default sentence.
                print(f"⚠️ Transcription error: {e}")

        # OPTIMIZATION 8: Use cached embeddings
        gpt_cond_latent, speaker_embedding = get_cached_speaker_embeddings(ref_optimized)

        # Generate output
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        print("🚀 Generating optimized voice clone...")

        with patch_torch_load(), torch.no_grad():  # Disable gradient computation
            used_cached_path = False
            if gpt_cond_latent is not None and speaker_embedding is not None:
                try:
                    # NOTE(review): `TTS_MODEL.tts` looks like the wrapper's
                    # synthesis method, not an object with `tts_to_file` -
                    # confirm. Any failure here drops to the standard call.
                    TTS_MODEL.tts.tts_to_file(
                        text=extracted_text,
                        file_path=output_path,
                        gpt_cond_latent=gpt_cond_latent,
                        speaker_embedding=speaker_embedding,
                        language=xtts_language,
                        temperature=0.7,  # Lower temperature for faster, more stable output
                        length_penalty=1.0,
                        repetition_penalty=5.0,
                        top_k=50,  # Limit choices for speed
                        top_p=0.85,
                    )
                    used_cached_path = True
                except Exception as e:
                    print(f"⚠️ Cached-embedding synthesis failed, using standard method: {e}")

            if not used_cached_path:
                # Fallback to standard method
                TTS_MODEL.tts_to_file(
                    text=extracted_text,
                    speaker_wav=ref_optimized,
                    language=xtts_language,
                    file_path=output_path,
                    temperature=0.7,
                )

        # OPTIMIZATION 9: Memory cleanup. Collect Python garbage first so
        # the CUDA allocator can actually release the freed blocks.
        gc.collect()
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
            # Wait for queued GPU work so the timing below includes it.
            torch.cuda.synchronize()

        # Wall-clock timing works on CPU and GPU alike.
        processing_time = f"{time.perf_counter() - started:.1f}s"

        # Verify output
        if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
            success_msg = f"""✅ OPTIMIZED CLONING SUCCESS! ⚡
📝 Text: '{extracted_text[:100]}...'
🎭 Device: {DEVICE} | Time: {processing_time}
🔧 Status: {MODEL_STATUS}
📊 Size: {os.path.getsize(output_path)/1024:.1f} KB
🚀 Optimizations: Cached embeddings, FP16, Limited audio"""

            print("✅ Optimized voice cloning completed!")
            succeeded = True
            return output_path, success_msg

        return None, "❌ Output file empty or too small!"

    except Exception as e:
        error_msg = f"❌ Optimized cloning error: {str(e)}"
        print(error_msg)
        return None, error_msg

    finally:
        # The intermediates are never needed after synthesis; the output
        # file is kept only when it is being returned to the caller.
        for leftover in intermediates:
            _discard_file(leftover)
        if output_path is not None and not succeeded:
            _discard_file(output_path)
255
 
# OPTIMIZATION 10: Gradio with performance settings
# Two uploaded audio files plus a language code go in; the cloned audio and
# a status text come out. `api_name` fixes the name of the API endpoint.
interface = gr.Interface(
    fn=voice_to_voice_clone_optimized,
    inputs=[
        gr.Audio(
            label="🎤 Reference Audio (Voice to Clone - Max 10s recommended)",
            type="filepath",
            sources=["upload"],
        ),
        gr.Audio(
            label="🎵 Input Audio (Content - Max 10s for speed)",
            type="filepath",
            sources=["upload"],
        ),
        gr.Dropdown(
            choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
            value="en",
            label="🌍 Language",
        ),
    ],
    outputs=[
        gr.Audio(label="🎉 Optimized Cloned Voice"),
        gr.Textbox(label="📊 Performance Stats", lines=10),
    ],
    title="🚀 HIGH-SPEED Voice Cloning Studio",
    description="⚡ Optimized XTTS-v2 with caching, FP16, and performance tuning. Use 5-10 second audio clips for fastest results!",
    theme=gr.themes.Soft(),
    allow_flagging="never",
    api_name="voice_to_voice_clone",
)
286
 
if __name__ == "__main__":
    print("🌐 Launching OPTIMIZED Voice Cloning Studio...")
    # OPTIMIZATION 11: Enable queue for better concurrency.
    # Calling queue() is what turns queuing on. launch() is not given
    # `enable_queue`, because Gradio releases that accept
    # `gr.Audio(sources=...)` reject that keyword.
    interface.queue(
        max_size=10,  # Limit queue size
        api_open=True,
    ).launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True,
        debug=False,  # Disable debug for speed
    )