crackuser committed on
Commit
ba703e9
·
verified ·
1 Parent(s): 5f03eaa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -51
app.py CHANGED
@@ -11,7 +11,7 @@ import soundfile as sf
11
 
12
  warnings.filterwarnings("ignore")
13
  os.environ["COQUI_TOS_AGREED"] = "1"
14
- print("πŸš€ Starting CORRECTED Voice Cloning Studio...")
15
 
16
  @contextmanager
17
  def patch_torch_load():
@@ -35,7 +35,6 @@ WHISPER_MODEL = None
35
  MODEL_STATUS = "Not Loaded"
36
 
37
  def load_xtts_optimized():
38
- """Load XTTS model with optimizations"""
39
  global TTS_MODEL, MODEL_STATUS
40
  if TTS_MODEL is not None:
41
  return True
@@ -43,13 +42,11 @@ def load_xtts_optimized():
43
  with patch_torch_load():
44
  from TTS.api import TTS
45
  print("πŸ“¦ Loading XTTS...")
46
-
47
  TTS_MODEL = TTS(
48
  model_name="tts_models/multilingual/multi-dataset/xtts_v2",
49
  progress_bar=False,
50
  gpu=(DEVICE == "cuda")
51
  )
52
-
53
  MODEL_STATUS = "XTTS-v2 Ready"
54
  print("βœ… XTTS loaded successfully!")
55
  return True
@@ -59,7 +56,6 @@ def load_xtts_optimized():
59
  return False
60
 
61
  def load_whisper_optimized():
62
- """Load Whisper model for transcription"""
63
  global WHISPER_MODEL
64
  if WHISPER_MODEL is not None:
65
  return True
@@ -72,26 +68,20 @@ def load_whisper_optimized():
72
  print(f"❌ Whisper failed: {e}")
73
  return False
74
 
75
- def optimize_audio_input(audio_path, max_duration=30):
76
- """Optimize audio file for processing"""
77
  try:
78
  if not os.path.exists(audio_path):
79
  print(f"⚠️ Audio file not found: {audio_path}")
80
  return audio_path
81
 
82
- # Load and optimize audio
83
  audio, sr = librosa.load(audio_path, sr=22050)
84
-
85
- # Trim duration if too long
86
  max_samples = int(max_duration * sr)
87
  if len(audio) > max_samples:
88
  audio = audio[:max_samples]
89
  print(f"πŸ”„ Audio trimmed to {max_duration}s")
90
 
91
- # Save optimized audio
92
  optimized_path = audio_path.replace('.wav', '_opt.wav').replace('.mp3', '_opt.wav')
93
  sf.write(optimized_path, audio, sr)
94
-
95
  print(f"βœ… Audio optimized: {optimized_path}")
96
  return optimized_path
97
 
@@ -100,12 +90,12 @@ def optimize_audio_input(audio_path, max_duration=30):
100
  return audio_path
101
 
102
  def safe_file_path(file_input, input_name="audio"):
103
- """Safely extract file path from various input formats"""
104
  try:
105
  if file_input is None:
106
  return None
107
 
108
- # If it's already a string path and exists
109
  if isinstance(file_input, str):
110
  if os.path.exists(file_input):
111
  return file_input
@@ -119,54 +109,54 @@ def safe_file_path(file_input, input_name="audio"):
119
  if file_path and os.path.exists(file_path):
120
  return file_path
121
 
122
- # If it's a dict-like object (from API)
123
  if hasattr(file_input, 'get'):
124
  file_path = file_input.get('name') or file_input.get('path')
125
  if file_path and os.path.exists(file_path):
126
  return file_path
127
 
128
- print(f"⚠️ Could not extract valid file path from {input_name}: {type(file_input)}")
129
  return None
130
 
131
  except Exception as e:
132
  print(f"❌ Error processing {input_name}: {e}")
133
  return None
134
 
135
- def voice_to_voice_clone_corrected(reference_audio, input_audio, language="en"):
136
- """CORRECTED voice cloning function with proper error handling"""
137
  try:
138
  print(f"🎭 Voice cloning request: {language}")
139
  print(f"πŸ“ Input types - Ref: {type(reference_audio)}, Input: {type(input_audio)}")
140
 
141
- # CRITICAL: Safely extract file paths
142
  reference_path = safe_file_path(reference_audio, "reference")
143
  input_path = safe_file_path(input_audio, "input")
144
 
145
  if not reference_path:
146
- return None, "❌ Could not process reference audio. Please upload a valid audio file."
147
 
148
  if not input_path:
149
- return None, "❌ Could not process input audio. Please upload a valid audio file."
150
 
151
  print(f"πŸ“ Processing files - Ref: {reference_path}, Input: {input_path}")
152
 
153
- # Validate files exist and have content
154
  if not os.path.exists(reference_path) or os.path.getsize(reference_path) < 1000:
155
- return None, f"❌ Reference audio file is invalid or too small."
156
 
157
  if not os.path.exists(input_path) or os.path.getsize(input_path) < 1000:
158
- return None, f"❌ Input audio file is invalid or too small."
159
 
160
  # Load models
161
  if not load_xtts_optimized():
162
- return None, f"❌ XTTS model loading failed: {MODEL_STATUS}"
163
 
164
  load_whisper_optimized()
165
 
166
  # Optimize audio files
167
  print("πŸ”„ Optimizing audio files...")
168
  ref_optimized = optimize_audio_input(reference_path, max_duration=20)
169
- input_optimized = optimize_audio_input(input_path, max_duration=30)
170
 
171
  # Transcribe input audio
172
  extracted_text = "This is a voice cloning demonstration."
@@ -181,7 +171,7 @@ def voice_to_voice_clone_corrected(reference_audio, input_audio, language="en"):
181
  )
182
  text = result.get("text", "").strip()
183
  if text and len(text) > 5:
184
- extracted_text = text[:500] # Limit text length
185
  print(f"βœ… Transcribed: '{extracted_text[:50]}...'")
186
  except Exception as e:
187
  print(f"⚠️ Transcription warning: {e}")
@@ -207,29 +197,29 @@ def voice_to_voice_clone_corrected(reference_audio, input_audio, language="en"):
207
  print(f"❌ TTS generation error: {tts_error}")
208
  return None, f"❌ Voice generation failed: {str(tts_error)}"
209
 
210
- # Clean up memory
211
  if DEVICE == "cuda":
212
  torch.cuda.empty_cache()
213
  gc.collect()
214
 
215
- # Validate output
216
  if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
217
  file_size_kb = os.path.getsize(output_path) / 1024
218
 
219
  success_message = f"""βœ… VOICE CLONING SUCCESS! πŸŽ‰
220
 
221
- πŸ“ Transcribed Text: "{extracted_text[:100]}{'...' if len(extracted_text) > 100 else ''}"
222
- 🎭 Processing Device: {DEVICE}
223
- ⚑ Model Status: {MODEL_STATUS}
224
- πŸ“Š Output Size: {file_size_kb:.1f} KB
225
- 🌍 Language: {language.upper()}
226
- πŸ”§ Optimizations: Audio trimming, Memory cleanup"""
227
 
228
  print("βœ… Voice cloning completed successfully!")
 
 
229
  return output_path, success_message
230
 
231
  else:
232
- return None, "❌ Voice cloning failed - output file is empty or corrupted."
233
 
234
  except Exception as e:
235
  error_msg = f"❌ Voice cloning error: {str(e)}"
@@ -238,19 +228,17 @@ def voice_to_voice_clone_corrected(reference_audio, input_audio, language="en"):
238
  print("Full traceback:", traceback.format_exc())
239
  return None, error_msg
240
 
241
- # CORRECTED: Gradio interface with proper configuration
242
  interface = gr.Interface(
243
- fn=voice_to_voice_clone_corrected,
244
  inputs=[
245
  gr.Audio(
246
  label="🎀 Reference Audio (Voice to Clone)",
247
- type="filepath",
248
- sources=["upload"]
249
  ),
250
  gr.Audio(
251
  label="🎡 Input Audio (Content to Transform)",
252
- type="filepath",
253
- sources=["upload"]
254
  ),
255
  gr.Dropdown(
256
  choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
@@ -261,26 +249,26 @@ interface = gr.Interface(
261
  outputs=[
262
  gr.Audio(
263
  label="πŸŽ‰ Cloned Voice Result",
264
- type="filepath"
265
  ),
266
  gr.Textbox(
267
  label="πŸ“‹ Processing Status",
268
- lines=10,
269
- max_lines=15
270
  )
271
  ],
272
- title="🎭 AI Voice Cloning Studio - CORRECTED",
273
- description="Transform any voice using XTTS-v2 and Whisper AI. Upload clear audio files (10-30 seconds each) for best results.",
274
  theme=gr.themes.Soft(),
275
  allow_flagging="never",
276
- api_name="voice_to_voice_clone"
277
  )
278
 
279
  if __name__ == "__main__":
280
- print("🌐 Launching CORRECTED Voice Cloning Studio...")
281
 
 
282
  interface.queue(
283
- max_size=3,
284
  api_open=True,
285
  default_concurrency_limit=1
286
  ).launch(
@@ -288,5 +276,5 @@ if __name__ == "__main__":
288
  server_port=7860,
289
  share=False,
290
  show_api=True,
291
- debug=True
292
  )
 
11
 
12
  warnings.filterwarnings("ignore")
13
  os.environ["COQUI_TOS_AGREED"] = "1"
14
+ print("πŸš€ Starting FINAL CORRECTED Voice Cloning Studio...")
15
 
16
  @contextmanager
17
  def patch_torch_load():
 
35
  MODEL_STATUS = "Not Loaded"
36
 
37
  def load_xtts_optimized():
 
38
  global TTS_MODEL, MODEL_STATUS
39
  if TTS_MODEL is not None:
40
  return True
 
42
  with patch_torch_load():
43
  from TTS.api import TTS
44
  print("πŸ“¦ Loading XTTS...")
 
45
  TTS_MODEL = TTS(
46
  model_name="tts_models/multilingual/multi-dataset/xtts_v2",
47
  progress_bar=False,
48
  gpu=(DEVICE == "cuda")
49
  )
 
50
  MODEL_STATUS = "XTTS-v2 Ready"
51
  print("βœ… XTTS loaded successfully!")
52
  return True
 
56
  return False
57
 
58
  def load_whisper_optimized():
 
59
  global WHISPER_MODEL
60
  if WHISPER_MODEL is not None:
61
  return True
 
68
  print(f"❌ Whisper failed: {e}")
69
  return False
70
 
71
+ def optimize_audio_input(audio_path, max_duration=25):
 
72
  try:
73
  if not os.path.exists(audio_path):
74
  print(f"⚠️ Audio file not found: {audio_path}")
75
  return audio_path
76
 
 
77
  audio, sr = librosa.load(audio_path, sr=22050)
 
 
78
  max_samples = int(max_duration * sr)
79
  if len(audio) > max_samples:
80
  audio = audio[:max_samples]
81
  print(f"πŸ”„ Audio trimmed to {max_duration}s")
82
 
 
83
  optimized_path = audio_path.replace('.wav', '_opt.wav').replace('.mp3', '_opt.wav')
84
  sf.write(optimized_path, audio, sr)
 
85
  print(f"βœ… Audio optimized: {optimized_path}")
86
  return optimized_path
87
 
 
90
  return audio_path
91
 
92
  def safe_file_path(file_input, input_name="audio"):
93
+ """Extract file path from various input formats"""
94
  try:
95
  if file_input is None:
96
  return None
97
 
98
+ # If it's already a string path
99
  if isinstance(file_input, str):
100
  if os.path.exists(file_input):
101
  return file_input
 
109
  if file_path and os.path.exists(file_path):
110
  return file_path
111
 
112
+ # If it's a dict-like object
113
  if hasattr(file_input, 'get'):
114
  file_path = file_input.get('name') or file_input.get('path')
115
  if file_path and os.path.exists(file_path):
116
  return file_path
117
 
118
+ print(f"⚠️ Could not extract file path from {input_name}: {type(file_input)}")
119
  return None
120
 
121
  except Exception as e:
122
  print(f"❌ Error processing {input_name}: {e}")
123
  return None
124
 
125
+ def voice_to_voice_clone_final(reference_audio, input_audio, language="en"):
126
+ """FINAL CORRECTED voice cloning function"""
127
  try:
128
  print(f"🎭 Voice cloning request: {language}")
129
  print(f"πŸ“ Input types - Ref: {type(reference_audio)}, Input: {type(input_audio)}")
130
 
131
+ # Extract file paths safely
132
  reference_path = safe_file_path(reference_audio, "reference")
133
  input_path = safe_file_path(input_audio, "input")
134
 
135
  if not reference_path:
136
+ return None, "❌ Could not process reference audio file."
137
 
138
  if not input_path:
139
+ return None, "❌ Could not process input audio file."
140
 
141
  print(f"πŸ“ Processing files - Ref: {reference_path}, Input: {input_path}")
142
 
143
+ # Validate files
144
  if not os.path.exists(reference_path) or os.path.getsize(reference_path) < 1000:
145
+ return None, "❌ Reference audio file is invalid."
146
 
147
  if not os.path.exists(input_path) or os.path.getsize(input_path) < 1000:
148
+ return None, "❌ Input audio file is invalid."
149
 
150
  # Load models
151
  if not load_xtts_optimized():
152
+ return None, f"❌ XTTS model failed: {MODEL_STATUS}"
153
 
154
  load_whisper_optimized()
155
 
156
  # Optimize audio files
157
  print("πŸ”„ Optimizing audio files...")
158
  ref_optimized = optimize_audio_input(reference_path, max_duration=20)
159
+ input_optimized = optimize_audio_input(input_path, max_duration=25)
160
 
161
  # Transcribe input audio
162
  extracted_text = "This is a voice cloning demonstration."
 
171
  )
172
  text = result.get("text", "").strip()
173
  if text and len(text) > 5:
174
+ extracted_text = text[:400]
175
  print(f"βœ… Transcribed: '{extracted_text[:50]}...'")
176
  except Exception as e:
177
  print(f"⚠️ Transcription warning: {e}")
 
197
  print(f"❌ TTS generation error: {tts_error}")
198
  return None, f"❌ Voice generation failed: {str(tts_error)}"
199
 
200
+ # Memory cleanup
201
  if DEVICE == "cuda":
202
  torch.cuda.empty_cache()
203
  gc.collect()
204
 
205
+ # Validate and return output
206
  if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
207
  file_size_kb = os.path.getsize(output_path) / 1024
208
 
209
  success_message = f"""βœ… VOICE CLONING SUCCESS! πŸŽ‰
210
 
211
+ πŸ“ Text: "{extracted_text[:100]}{'...' if len(extracted_text) > 100 else ''}"
212
+ 🎭 Device: {DEVICE} | Model: {MODEL_STATUS}
213
+ πŸ“Š Output: {file_size_kb:.1f} KB | Language: {language.upper()}
214
+ πŸ”§ Optimizations Applied Successfully"""
 
 
215
 
216
  print("βœ… Voice cloning completed successfully!")
217
+
218
+ # CRITICAL FIX: Return file path directly for Gradio compatibility
219
  return output_path, success_message
220
 
221
  else:
222
+ return None, "❌ Voice cloning failed - output file is empty."
223
 
224
  except Exception as e:
225
  error_msg = f"❌ Voice cloning error: {str(e)}"
 
228
  print("Full traceback:", traceback.format_exc())
229
  return None, error_msg
230
 
231
+ # CRITICAL: Use gr.Interface (not Blocks) for better API compatibility
232
  interface = gr.Interface(
233
+ fn=voice_to_voice_clone_final,
234
  inputs=[
235
  gr.Audio(
236
  label="🎀 Reference Audio (Voice to Clone)",
237
+ type="filepath" # CRITICAL: Must be filepath for API compatibility
 
238
  ),
239
  gr.Audio(
240
  label="🎡 Input Audio (Content to Transform)",
241
+ type="filepath" # CRITICAL: Must be filepath for API compatibility
 
242
  ),
243
  gr.Dropdown(
244
  choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
 
249
  outputs=[
250
  gr.Audio(
251
  label="πŸŽ‰ Cloned Voice Result",
252
+ type="filepath" # CRITICAL: Must be filepath for proper return
253
  ),
254
  gr.Textbox(
255
  label="πŸ“‹ Processing Status",
256
+ lines=8
 
257
  )
258
  ],
259
+ title="🎭 AI Voice Cloning Studio - FINAL",
260
+ description="Transform voices using XTTS-v2 and Whisper AI. Upload clear audio files (10-30 seconds each).",
261
  theme=gr.themes.Soft(),
262
  allow_flagging="never",
263
+ api_name="voice_to_voice_clone" # CRITICAL: API endpoint name
264
  )
265
 
266
  if __name__ == "__main__":
267
+ print("🌐 Launching FINAL CORRECTED Voice Cloning Studio...")
268
 
269
+ # CORRECTED: Proper queue configuration
270
  interface.queue(
271
+ max_size=2, # Reduced for stability
272
  api_open=True,
273
  default_concurrency_limit=1
274
  ).launch(
 
276
  server_port=7860,
277
  share=False,
278
  show_api=True,
279
+ debug=False # Disable debug for production
280
  )