crackuser committed
Commit 3e9e2ab · verified · parent: 3ad5343

Update app.py

Files changed (1):
  1. app.py +44 -125
app.py CHANGED
@@ -5,8 +5,6 @@ import tempfile
 import os
 import warnings
 from contextlib import contextmanager
-import asyncio
-from concurrent.futures import ThreadPoolExecutor
 import gc
 
 warnings.filterwarnings("ignore")
@@ -28,21 +26,17 @@ def patch_torch_load():
 # OPTIMIZATION 1: Hardware Detection and Setup
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 if DEVICE == "cuda":
-    torch.backends.cudnn.benchmark = True  # Optimize for consistent input sizes
-    torch.backends.cuda.matmul.allow_tf32 = True  # Enable TF32 for faster computation
+    torch.backends.cudnn.benchmark = True
+    torch.backends.cuda.matmul.allow_tf32 = True
 
 print(f"🔥 Device: {DEVICE}")
-if DEVICE == "cuda":
-    print(f"GPU: {torch.cuda.get_device_name(0)}")
-    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
 
 TTS_MODEL = None
 WHISPER_MODEL = None
 MODEL_STATUS = "Not Loaded"
-SPEAKER_EMBEDDINGS_CACHE = {}  # OPTIMIZATION 2: Cache embeddings
+SPEAKER_EMBEDDINGS_CACHE = {}
 
 def load_xtts_optimized():
-    """Optimized XTTS loading with performance settings"""
     global TTS_MODEL, MODEL_STATUS
     if TTS_MODEL is not None:
         return True
@@ -53,16 +47,10 @@ def load_xtts_optimized():
 
         TTS_MODEL = TTS(
             model_name="tts_models/multilingual/multi-dataset/xtts_v2",
-            progress_bar=False,  # Disable progress bar for speed
+            progress_bar=False,
             gpu=(DEVICE == "cuda")
         )
 
-        # OPTIMIZATION 3: Model optimizations
-        if DEVICE == "cuda":
-            TTS_MODEL.tts.cuda()
-            # Enable mixed precision for faster inference
-            TTS_MODEL.tts.half()  # Use FP16 for speed
-
         MODEL_STATUS = "XTTS-v2 Optimized"
         print("✅ XTTS loaded with optimizations!")
         return True
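Note: the block removed above was almost certainly broken rather than slow. On Coqui's high-level API, TTS.tts is the synthesis method, not the underlying model object, so TTS_MODEL.tts.cuda() and TTS_MODEL.tts.half() raise AttributeError; full-FP16 XTTS inference has also been reported to be numerically unstable. If explicit GPU placement were still wanted, a minimal sketch, assuming the Coqui layout that exposes the model through the synthesizer:

    # Sketch only (not part of this commit); synthesizer.tts_model is an
    # assumption about the Coqui TTS object layout.
    xtts_model = TTS_MODEL.synthesizer.tts_model
    xtts_model.cuda()    # move weights to the GPU
    # xtts_model.half()  # FP16 is risky with XTTS; benchmark before enabling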
@@ -72,13 +60,11 @@ def load_xtts_optimized():
         return False
 
 def load_whisper_optimized():
-    """Optimized Whisper loading"""
     global WHISPER_MODEL
     if WHISPER_MODEL is not None:
         return True
     try:
         import whisper
-        # Use smaller, faster model for transcription
         WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
         print("✅ Whisper loaded (base model for speed)!")
         return True
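Note: whisper.load_model("base", ...) selects the second-smallest openai-whisper checkpoint, a reasonable accuracy/speed trade-off for short clips. For reference, a standalone usage sketch (the file name is illustrative):

    import whisper

    # Checkpoints: tiny, base, small, medium, large (larger = slower, more accurate)
    model = whisper.load_model("base", device="cuda")
    result = model.transcribe("clip.wav", fp16=True, language="en")
    print(result["text"])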
@@ -86,45 +72,13 @@ def load_whisper_optimized():
         print(f"❌ Whisper failed: {e}")
         return False
 
-def get_cached_speaker_embeddings(reference_audio):
-    """OPTIMIZATION 4: Cache speaker embeddings to avoid recomputation"""
-    # Create cache key from file size and modification time
-    try:
-        stat = os.stat(reference_audio)
-        cache_key = f"{stat.st_size}_{stat.st_mtime}"
-
-        if cache_key in SPEAKER_EMBEDDINGS_CACHE:
-            print("🚀 Using cached speaker embeddings!")
-            return SPEAKER_EMBEDDINGS_CACHE[cache_key]
-
-        # Compute new embeddings
-        print("🔄 Computing speaker embeddings...")
-        gpt_cond_latent, speaker_embedding = TTS_MODEL.tts.get_conditioning_latents(
-            audio_path=reference_audio,
-            gpt_cond_len=6,  # Reduced from 30 for speed
-            max_ref_length=10  # Reduced from 60 for speed
-        )
-
-        # Cache the results
-        SPEAKER_EMBEDDINGS_CACHE[cache_key] = (gpt_cond_latent, speaker_embedding)
-        print("✅ Speaker embeddings cached!")
-
-        # Limit cache size
-        if len(SPEAKER_EMBEDDINGS_CACHE) > 10:
-            oldest_key = list(SPEAKER_EMBEDDINGS_CACHE.keys())[0]
-            del SPEAKER_EMBEDDINGS_CACHE[oldest_key]
-
-        return gpt_cond_latent, speaker_embedding
-
-    except Exception as e:
-        print(f"⚠️ Embedding cache failed: {e}")
-        return None, None
-
-def optimize_audio_input(audio_path, max_duration=10):
-    """OPTIMIZATION 5: Limit audio length for faster processing"""
+def optimize_audio_input(audio_path, max_duration=15):
+    """Limit audio length for faster processing"""
     try:
         import librosa
-        audio, sr = librosa.load(audio_path, sr=22050)  # Standard rate for XTTS
+        import soundfile as sf
+
+        audio, sr = librosa.load(audio_path, sr=22050)
 
         # Limit duration for speed
         max_samples = int(max_duration * sr)
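Note: the cache deleted here called TTS_MODEL.tts.get_conditioning_latents(...), which fails for the same reason noted above (TTS.tts is a method). Should latent caching return later, a hedged sketch under the same object-layout assumption:

    # Sketch only: compute XTTS conditioning latents once, then reuse them.
    # synthesizer.tts_model and the inference() signature are assumptions
    # about the Coqui TTS API, not code from this commit.
    xtts_model = TTS_MODEL.synthesizer.tts_model
    gpt_cond_latent, speaker_embedding = xtts_model.get_conditioning_latents(
        audio_path=[reference_audio]  # one or more reference clips
    )
    out = xtts_model.inference(
        text=extracted_text,
        language="en",
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
    )  # expected to return a dict with the waveform under "wav"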
@@ -134,7 +88,6 @@ def optimize_audio_input(audio_path, max_duration=10):
 
         # Save optimized audio
         optimized_path = audio_path.replace('.wav', '_opt.wav')
-        import soundfile as sf
         sf.write(optimized_path, audio, sr)
         return optimized_path
 
@@ -145,12 +98,6 @@
 def voice_to_voice_clone_optimized(reference_audio, input_audio, language="en"):
     """OPTIMIZED voice cloning with performance improvements"""
     try:
-        start_time = torch.cuda.Event(enable_timing=True) if DEVICE == "cuda" else None
-        end_time = torch.cuda.Event(enable_timing=True) if DEVICE == "cuda" else None
-
-        if start_time:
-            start_time.record()
-
         print(f"🎭 OPTIMIZED Voice cloning: {language}")
 
         if not reference_audio or not input_audio:
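Note: dropping the torch.cuda.Event pair also drops the "Time:" field from the success message (see the hunk below). If timing is wanted back without CUDA-only code, a device-agnostic sketch:

    import time

    t0 = time.perf_counter()
    # ... transcription and synthesis ...
    elapsed = f"{time.perf_counter() - t0:.1f}s"  # wall-clock seconds

For GPU work, calling torch.cuda.synchronize() before reading the clock keeps the number honest.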
@@ -161,109 +108,79 @@ def voice_to_voice_clone_optimized(reference_audio, input_audio, language="en"):
             return None, f"❌ XTTS failed: {MODEL_STATUS}"
         load_whisper_optimized()
 
-        # OPTIMIZATION 6: Parallel processing where possible
-        with ThreadPoolExecutor(max_workers=2) as executor:
-            # Optimize input audios in parallel
-            future_ref = executor.submit(optimize_audio_input, reference_audio)
-            future_input = executor.submit(optimize_audio_input, input_audio)
-
-            ref_optimized = future_ref.result()
-            input_optimized = future_input.result()
+        # Optimize input audios for speed
+        ref_optimized = optimize_audio_input(reference_audio, max_duration=15)
+        input_optimized = optimize_audio_input(input_audio, max_duration=20)
 
-        # OPTIMIZATION 7: Fast transcription with limits
-        extracted_text = "Voice cloning demo text."
+        # Fast transcription with limits
+        extracted_text = "Voice cloning demonstration."
         if WHISPER_MODEL:
             try:
-                # Limit transcription time
                 with torch.no_grad():
                     result = WHISPER_MODEL.transcribe(
-                        input_optimized,
-                        fp16=(DEVICE == "cuda"),  # Use FP16 if available
+                        input_optimized,
+                        fp16=(DEVICE == "cuda"),
                         language=language if language != 'auto' else None
                     )
-                text = result.get("text", "").strip()[:200]  # Limit text length
+                text = result.get("text", "").strip()[:300]  # Limit text length
                 if text and len(text) > 10:
                     extracted_text = text
-                    print(f"✅ Fast transcription: '{extracted_text[:50]}...'")
+                    print(f"✅ Extracted: '{extracted_text[:50]}...'")
             except Exception as e:
                 print(f"⚠️ Transcription error: {e}")
 
-        # OPTIMIZATION 8: Use cached embeddings
-        gpt_cond_latent, speaker_embedding = get_cached_speaker_embeddings(ref_optimized)
-
         # Generate output
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
 
         print("🚀 Generating optimized voice clone...")
 
-        with patch_torch_load(), torch.no_grad():  # Disable gradient computation
-            if gpt_cond_latent is not None and speaker_embedding is not None:
-                # Use cached embeddings for faster inference
-                TTS_MODEL.tts.tts_to_file(
-                    text=extracted_text,
-                    file_path=output_path,
-                    gpt_cond_latent=gpt_cond_latent,
-                    speaker_embedding=speaker_embedding,
-                    language=language,
-                    temperature=0.7,  # Lower temperature for faster, more stable output
-                    length_penalty=1.0,
-                    repetition_penalty=5.0,
-                    top_k=50,  # Limit choices for speed
-                    top_p=0.85
-                )
-            else:
-                # Fallback to standard method
-                TTS_MODEL.tts_to_file(
-                    text=extracted_text,
-                    speaker_wav=ref_optimized,
-                    language=language,
-                    file_path=output_path,
-                    temperature=0.7
-                )
+        with patch_torch_load(), torch.no_grad():
+            TTS_MODEL.tts_to_file(
+                text=extracted_text,
+                speaker_wav=ref_optimized,
+                language=language,
+                file_path=output_path,
+                temperature=0.7,
+                length_penalty=1.0,
+                repetition_penalty=5.0
+            )
 
-        # OPTIMIZATION 9: Memory cleanup
+        # Memory cleanup
         if DEVICE == "cuda":
             torch.cuda.empty_cache()
         gc.collect()
 
-        # Calculate timing
-        processing_time = "N/A"
-        if start_time and end_time:
-            end_time.record()
-            torch.cuda.synchronize()
-            processing_time = f"{start_time.elapsed_time(end_time)/1000:.1f}s"
-
         # Verify output
         if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
             success_msg = f"""✅ OPTIMIZED CLONING SUCCESS! ⚡
 📝 Text: '{extracted_text[:100]}...'
-🎭 Device: {DEVICE} | Time: {processing_time}
+🎭 Device: {DEVICE}
 🔧 Status: {MODEL_STATUS}
 📊 Size: {os.path.getsize(output_path)/1024:.1f} KB
-🚀 Optimizations: Cached embeddings, FP16, Limited audio"""
+🚀 Optimizations: Limited audio, FP16, Memory cleanup"""
 
             print("✅ Optimized voice cloning completed!")
             return output_path, success_msg
         else:
-            return None, "❌ Output file empty or too small!"
+            return None, "❌ Output file empty!"
 
     except Exception as e:
         error_msg = f"❌ Optimized cloning error: {str(e)}"
         print(error_msg)
         return None, error_msg
 
-# OPTIMIZATION 10: Gradio with performance settings
+# Create Gradio interface
 interface = gr.Interface(
     fn=voice_to_voice_clone_optimized,
     inputs=[
         gr.Audio(
-            label="🎤 Reference Audio (Voice to Clone - Max 10s recommended)",
+            label="🎤 Reference Audio (Voice to Clone - Max 15s recommended)",
             type="filepath",
             sources=["upload"]
         ),
         gr.Audio(
-            label="🎵 Input Audio (Content - Max 10s for speed)",
+            label="🎵 Input Audio (Content - Max 20s for speed)",
             type="filepath",
             sources=["upload"]
         ),
@@ -275,10 +192,10 @@ interface = gr.Interface(
     ],
     outputs=[
         gr.Audio(label="🎉 Optimized Cloned Voice"),
-        gr.Textbox(label="📊 Performance Stats", lines=10)
+        gr.Textbox(label="📊 Performance Stats", lines=8)
     ],
     title="🚀 HIGH-SPEED Voice Cloning Studio",
-    description="⚡ Optimized XTTS-v2 with caching, FP16, and performance tuning. Use 5-10 second audio clips for fastest results!",
+    description="⚡ Optimized XTTS-v2 with performance tuning. Use 10-20 second audio clips for fastest results (30-120 seconds processing time)!",
     theme=gr.themes.Soft(),
     allow_flagging="never",
     api_name="voice_to_voice_clone"
@@ -286,15 +203,17 @@ interface = gr.Interface(
 
 if __name__ == "__main__":
     print("🌐 Launching OPTIMIZED Voice Cloning Studio...")
-    # OPTIMIZATION 11: Enable queue for better concurrency
+
+    # FIXED: Correct queue configuration
     interface.queue(
-        max_size=10,  # Limit queue size
-        api_open=True
+        max_size=5,  # Limit queue size to prevent overload
+        api_open=True,  # Allow API access
+        default_concurrency_limit=1  # Process one request at a time for stability
     ).launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
         show_api=True,
-        debug=False,  # Disable debug for speed
-        enable_queue=True
+        debug=False  # Disable debug for speed
+        # REMOVED: enable_queue=True (this was causing the error)
     )
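Note on the queue fix: Gradio 4.x removed enable_queue from launch(), so passing it raises a TypeError, and per-event concurrency moved onto queue(), where default_concurrency_limit replaced the 3.x concurrency_count. A version-tolerant sketch (the per-version kwarg names are the assumption here):

    import gradio as gr

    queue_kwargs = {"max_size": 5, "api_open": True}
    if gr.__version__.startswith("3."):
        queue_kwargs["concurrency_count"] = 1          # Gradio 3.x name
    else:
        queue_kwargs["default_concurrency_limit"] = 1  # Gradio 4.x name

    interface.queue(**queue_kwargs).launch(server_name="0.0.0.0", server_port=7860)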
 