crackuser committed on
Commit 5f03eaa · verified · 1 Parent(s): 3e9e2ab

Update app.py

Files changed (1)
  1. app.py +149 -76
app.py CHANGED
@@ -6,10 +6,12 @@ import os
 import warnings
 from contextlib import contextmanager
 import gc
+import librosa
+import soundfile as sf
 
 warnings.filterwarnings("ignore")
 os.environ["COQUI_TOS_AGREED"] = "1"
-print("🚀 Starting OPTIMIZED Voice Cloning Studio...")
+print("🚀 Starting CORRECTED Voice Cloning Studio...")
 
 @contextmanager
 def patch_torch_load():
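Note: the body of patch_torch_load() sits outside this diff; only its decorator (above) and its finally clause (next hunk) are visible. For orientation, a minimal sketch of the usual pattern behind such a helper — an assumption, not code from this commit: newer PyTorch releases default torch.load(weights_only=True), which rejects full XTTS checkpoints, so the loader is temporarily wrapped and then restored.

import torch
from contextlib import contextmanager

@contextmanager
def patch_torch_load():
    # Assumed implementation: force full unpickling for XTTS checkpoints.
    original_load = torch.load
    def patched_load(*args, **kwargs):
        kwargs.setdefault("weights_only", False)
        return original_load(*args, **kwargs)
    torch.load = patched_load
    try:
        yield
    finally:
        torch.load = original_load  # matches the restore visible in the next hunk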
@@ -23,27 +25,24 @@ def patch_torch_load():
     finally:
         torch.load = original_load
 
-# OPTIMIZATION 1: Hardware Detection and Setup
+# Hardware setup
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-if DEVICE == "cuda":
-    torch.backends.cudnn.benchmark = True
-    torch.backends.cuda.matmul.allow_tf32 = True
-
 print(f"🔥 Device: {DEVICE}")
 
+# Global model variables
 TTS_MODEL = None
 WHISPER_MODEL = None
 MODEL_STATUS = "Not Loaded"
-SPEAKER_EMBEDDINGS_CACHE = {}
 
 def load_xtts_optimized():
+    """Load XTTS model with optimizations"""
     global TTS_MODEL, MODEL_STATUS
     if TTS_MODEL is not None:
         return True
     try:
         with patch_torch_load():
             from TTS.api import TTS
-            print("📦 Loading XTTS with optimizations...")
+            print("📦 Loading XTTS...")
 
             TTS_MODEL = TTS(
                 model_name="tts_models/multilingual/multi-dataset/xtts_v2",
@@ -51,136 +50,205 @@ def load_xtts_optimized():
                 gpu=(DEVICE == "cuda")
             )
 
-        MODEL_STATUS = "XTTS-v2 Optimized"
-        print("✅ XTTS loaded with optimizations!")
+        MODEL_STATUS = "XTTS-v2 Ready"
+        print("✅ XTTS loaded successfully!")
         return True
     except Exception as e:
         print(f"❌ XTTS loading failed: {e}")
-        MODEL_STATUS = f"Failed: {str(e)}"
+        MODEL_STATUS = f"XTTS Failed: {str(e)}"
         return False
 
 def load_whisper_optimized():
+    """Load Whisper model for transcription"""
     global WHISPER_MODEL
     if WHISPER_MODEL is not None:
         return True
     try:
         import whisper
         WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
-        print("✅ Whisper loaded (base model for speed)!")
+        print("✅ Whisper loaded!")
         return True
     except Exception as e:
         print(f"❌ Whisper failed: {e}")
         return False
 
-def optimize_audio_input(audio_path, max_duration=15):
-    """Limit audio length for faster processing"""
+def optimize_audio_input(audio_path, max_duration=30):
+    """Optimize audio file for processing"""
     try:
-        import librosa
-        import soundfile as sf
-
+        if not os.path.exists(audio_path):
+            print(f"⚠️ Audio file not found: {audio_path}")
+            return audio_path
+
+        # Load and optimize audio
         audio, sr = librosa.load(audio_path, sr=22050)
 
-        # Limit duration for speed
+        # Trim duration if too long
         max_samples = int(max_duration * sr)
         if len(audio) > max_samples:
             audio = audio[:max_samples]
-            print(f"🔄 Audio trimmed to {max_duration}s for speed")
+            print(f"🔄 Audio trimmed to {max_duration}s")
 
         # Save optimized audio
-        optimized_path = audio_path.replace('.wav', '_opt.wav')
+        optimized_path = audio_path.replace('.wav', '_opt.wav').replace('.mp3', '_opt.wav')
         sf.write(optimized_path, audio, sr)
+
+        print(f"✅ Audio optimized: {optimized_path}")
         return optimized_path
 
     except Exception as e:
         print(f"⚠️ Audio optimization failed: {e}")
         return audio_path
 
-def voice_to_voice_clone_optimized(reference_audio, input_audio, language="en"):
-    """OPTIMIZED voice cloning with performance improvements"""
+def safe_file_path(file_input, input_name="audio"):
+    """Safely extract file path from various input formats"""
     try:
-        print(f"🎭 OPTIMIZED Voice cloning: {language}")
-
-        if not reference_audio or not input_audio:
-            return None, "❌ Upload both audio files!"
+        if file_input is None:
+            return None
+
+        # If it's already a string path and exists
+        if isinstance(file_input, str):
+            if os.path.exists(file_input):
+                return file_input
+            else:
+                print(f"⚠️ File path doesn't exist: {file_input}")
+                return None
+
+        # If it's a file object with name attribute
+        if hasattr(file_input, 'name'):
+            file_path = file_input.name
+            if file_path and os.path.exists(file_path):
+                return file_path
+
+        # If it's a dict-like object (from API)
+        if hasattr(file_input, 'get'):
+            file_path = file_input.get('name') or file_input.get('path')
+            if file_path and os.path.exists(file_path):
+                return file_path
+
+        print(f"⚠️ Could not extract valid file path from {input_name}: {type(file_input)}")
+        return None
+
+    except Exception as e:
+        print(f"❌ Error processing {input_name}: {e}")
+        return None
+
+def voice_to_voice_clone_corrected(reference_audio, input_audio, language="en"):
+    """CORRECTED voice cloning function with proper error handling"""
+    try:
+        print(f"🎭 Voice cloning request: {language}")
+        print(f"📁 Input types - Ref: {type(reference_audio)}, Input: {type(input_audio)}")
+
+        # CRITICAL: Safely extract file paths
+        reference_path = safe_file_path(reference_audio, "reference")
+        input_path = safe_file_path(input_audio, "input")
 
+        if not reference_path:
+            return None, "❌ Could not process reference audio. Please upload a valid audio file."
+
+        if not input_path:
+            return None, "❌ Could not process input audio. Please upload a valid audio file."
+
+        print(f"📁 Processing files - Ref: {reference_path}, Input: {input_path}")
+
+        # Validate files exist and have content
+        if not os.path.exists(reference_path) or os.path.getsize(reference_path) < 1000:
+            return None, "❌ Reference audio file is invalid or too small."
+
+        if not os.path.exists(input_path) or os.path.getsize(input_path) < 1000:
+            return None, "❌ Input audio file is invalid or too small."
 
         # Load models
         if not load_xtts_optimized():
-            return None, f"❌ XTTS failed: {MODEL_STATUS}"
+            return None, f"❌ XTTS model loading failed: {MODEL_STATUS}"
+
         load_whisper_optimized()
 
-        # Optimize input audios for speed
-        ref_optimized = optimize_audio_input(reference_audio, max_duration=15)
-        input_optimized = optimize_audio_input(input_audio, max_duration=20)
+        # Optimize audio files
+        print("🔄 Optimizing audio files...")
+        ref_optimized = optimize_audio_input(reference_path, max_duration=20)
+        input_optimized = optimize_audio_input(input_path, max_duration=30)
 
-        # Fast transcription with limits
-        extracted_text = "Voice cloning demonstration."
+        # Transcribe input audio
+        extracted_text = "This is a voice cloning demonstration."
         if WHISPER_MODEL:
             try:
+                print("🎤 Transcribing audio...")
                 with torch.no_grad():
                     result = WHISPER_MODEL.transcribe(
                         input_optimized,
                         fp16=(DEVICE == "cuda"),
                         language=language if language != 'auto' else None
                     )
-                text = result.get("text", "").strip()[:300]  # Limit text length
-                if text and len(text) > 10:
-                    extracted_text = text
-                    print(f"✅ Extracted: '{extracted_text[:50]}...'")
+                text = result.get("text", "").strip()
+                if text and len(text) > 5:
+                    extracted_text = text[:500]  # Limit text length
+                    print(f"✅ Transcribed: '{extracted_text[:50]}...'")
             except Exception as e:
-                print(f"⚠️ Transcription error: {e}")
+                print(f"⚠️ Transcription warning: {e}")
+
+        # Generate cloned voice
+        print("🚀 Generating cloned voice...")
 
-        # Generate output
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
 
-        print("🚀 Generating optimized voice clone...")
-
-        with patch_torch_load(), torch.no_grad():
-            TTS_MODEL.tts_to_file(
-                text=extracted_text,
-                speaker_wav=ref_optimized,
-                language=language,
-                file_path=output_path,
-                temperature=0.7,
-                length_penalty=1.0,
-                repetition_penalty=5.0
-            )
+        try:
+            with patch_torch_load(), torch.no_grad():
+                TTS_MODEL.tts_to_file(
+                    text=extracted_text,
+                    speaker_wav=ref_optimized,
+                    language=language,
+                    file_path=output_path,
+                    temperature=0.7,
+                    length_penalty=1.0,
+                    repetition_penalty=5.0
+                )
+        except Exception as tts_error:
+            print(f"❌ TTS generation error: {tts_error}")
+            return None, f"❌ Voice generation failed: {str(tts_error)}"
 
-        # Memory cleanup
+        # Clean up memory
         if DEVICE == "cuda":
             torch.cuda.empty_cache()
         gc.collect()
 
-        # Verify output
+        # Validate output
         if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
-            success_msg = f"""✅ OPTIMIZED CLONING SUCCESS! ⚡
-📝 Text: '{extracted_text[:100]}...'
-🎭 Device: {DEVICE}
-🔧 Status: {MODEL_STATUS}
-📊 Size: {os.path.getsize(output_path)/1024:.1f} KB
-🚀 Optimizations: Limited audio, FP16, Memory cleanup"""
+            file_size_kb = os.path.getsize(output_path) / 1024
+
+            success_message = f"""✅ VOICE CLONING SUCCESS! 🎉
+
+📝 Transcribed Text: "{extracted_text[:100]}{'...' if len(extracted_text) > 100 else ''}"
+🎭 Processing Device: {DEVICE}
+⚡ Model Status: {MODEL_STATUS}
+📊 Output Size: {file_size_kb:.1f} KB
+🌍 Language: {language.upper()}
+🔧 Optimizations: Audio trimming, Memory cleanup"""
 
-            print("✅ Optimized voice cloning completed!")
-            return output_path, success_msg
+            print("✅ Voice cloning completed successfully!")
+            return output_path, success_message
         else:
-            return None, "❌ Output file empty!"
+            return None, "❌ Voice cloning failed - output file is empty or corrupted."
 
     except Exception as e:
-        error_msg = f"❌ Optimized cloning error: {str(e)}"
+        error_msg = f"❌ Voice cloning error: {str(e)}"
         print(error_msg)
+        import traceback
+        print("Full traceback:", traceback.format_exc())
         return None, error_msg
 
-# Create Gradio interface
+# CORRECTED: Gradio interface with proper configuration
 interface = gr.Interface(
-    fn=voice_to_voice_clone_optimized,
+    fn=voice_to_voice_clone_corrected,
     inputs=[
         gr.Audio(
-            label="🎤 Reference Audio (Voice to Clone - Max 15s recommended)",
+            label="🎤 Reference Audio (Voice to Clone)",
             type="filepath",
             sources=["upload"]
         ),
         gr.Audio(
-            label="🎵 Input Audio (Content - Max 20s for speed)",
+            label="🎵 Input Audio (Content to Transform)",
             type="filepath",
             sources=["upload"]
         ),
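Note: safe_file_path(), new in this commit, normalizes the three payload shapes a gr.Audio(type="filepath") component or an API caller commonly delivers. A quick illustrative check (hypothetical paths; each call returns the path only when the file actually exists on disk, otherwise None):

# Hypothetical inputs mirroring the branches of safe_file_path() above.
class FakeUpload:
    # Stand-in for a file-like object that carries a .name attribute.
    def __init__(self, name):
        self.name = name

print(safe_file_path("/tmp/ref.wav", "reference"))              # plain string path
print(safe_file_path(FakeUpload("/tmp/ref.wav"), "reference"))  # object exposing .name
print(safe_file_path({"name": "/tmp/ref.wav"}, "reference"))    # dict-style API payload
print(safe_file_path(None, "reference"))                        # missing input -> None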
@@ -191,29 +259,34 @@ interface = gr.Interface(
         )
     ],
     outputs=[
-        gr.Audio(label="🎉 Optimized Cloned Voice"),
-        gr.Textbox(label="📊 Performance Stats", lines=8)
+        gr.Audio(
+            label="🎉 Cloned Voice Result",
+            type="filepath"
+        ),
+        gr.Textbox(
+            label="📋 Processing Status",
+            lines=10,
+            max_lines=15
+        )
     ],
-    title="🚀 HIGH-SPEED Voice Cloning Studio",
-    description="⚡ Optimized XTTS-v2 with performance tuning. Use 10-20 second audio clips for fastest results (30-120 seconds processing time)!",
+    title="🎭 AI Voice Cloning Studio - CORRECTED",
+    description="Transform any voice using XTTS-v2 and Whisper AI. Upload clear audio files (10-30 seconds each) for best results.",
     theme=gr.themes.Soft(),
     allow_flagging="never",
     api_name="voice_to_voice_clone"
 )
 
 if __name__ == "__main__":
-    print("🌐 Launching OPTIMIZED Voice Cloning Studio...")
+    print("🌐 Launching CORRECTED Voice Cloning Studio...")
 
-    # FIXED: Correct queue configuration
     interface.queue(
-        max_size=5,  # Limit queue size to prevent overload
-        api_open=True,  # Allow API access
-        default_concurrency_limit=1  # Process one request at a time for stability
+        max_size=3,
+        api_open=True,
+        default_concurrency_limit=1
     ).launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
         show_api=True,
-        debug=False  # Disable debug for speed
-        # REMOVED: enable_queue=True (this was causing the error)
+        debug=True
     )
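Note: before relaunching the Space, the corrected handler can be smoke-tested directly in a REPL alongside app.py. A sketch, assuming two short local WAV samples (hypothetical paths) and that the XTTS and Whisper weights download on first call:

out_path, status = voice_to_voice_clone_corrected(
    "samples/speaker.wav",   # voice to clone (hypothetical sample)
    "samples/content.wav",   # content to transform (hypothetical sample)
    language="en",
)
print(status)
print("Output written to:", out_path)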
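Note: with api_open=True and show_api=True, the queued endpoint is also callable remotely. A minimal client sketch, assuming a recent gradio_client (older versions take plain file paths instead of handle_file) and the default local address:

from gradio_client import Client, handle_file

client = Client("http://localhost:7860/")
cloned_audio, report = client.predict(
    handle_file("reference.wav"),   # 🎤 Reference Audio
    handle_file("content.wav"),     # 🎵 Input Audio
    "en",                           # language value from the dropdown
    api_name="/voice_to_voice_clone",
)
print(report)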
 