Spaces:

crackuser
/

voiceclone-dev

Running

App Files Community

crackuser committed on Sep 12, 2025

Commit

ba703e9

verified ·

1 Parent(s): 5f03eaa

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -51

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ import soundfile as sf
 warnings.filterwarnings("ignore")
 os.environ["COQUI_TOS_AGREED"] = "1"
-print("🚀 Starting CORRECTED Voice Cloning Studio...")
 @contextmanager
 def patch_torch_load():
@@ -35,7 +35,6 @@ WHISPER_MODEL = None
 MODEL_STATUS = "Not Loaded"
 def load_xtts_optimized():
-    """Load XTTS model with optimizations"""
     global TTS_MODEL, MODEL_STATUS
     if TTS_MODEL is not None:
         return True
@@ -43,13 +42,11 @@ def load_xtts_optimized():
         with patch_torch_load():
             from TTS.api import TTS
             print("📦 Loading XTTS...")
             TTS_MODEL = TTS(
                 model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                 progress_bar=False,
                 gpu=(DEVICE == "cuda")
             )
             MODEL_STATUS = "XTTS-v2 Ready"
             print("✅ XTTS loaded successfully!")
             return True
@@ -59,7 +56,6 @@ def load_xtts_optimized():
         return False
 def load_whisper_optimized():
-    """Load Whisper model for transcription"""
     global WHISPER_MODEL
     if WHISPER_MODEL is not None:
         return True
@@ -72,26 +68,20 @@ def load_whisper_optimized():
         print(f"❌ Whisper failed: {e}")
         return False
-def optimize_audio_input(audio_path, max_duration=30):
-    """Optimize audio file for processing"""
     try:
         if not os.path.exists(audio_path):
             print(f"⚠️ Audio file not found: {audio_path}")
             return audio_path
-        # Load and optimize audio
         audio, sr = librosa.load(audio_path, sr=22050)
-        # Trim duration if too long
         max_samples = int(max_duration * sr)
         if len(audio) > max_samples:
             audio = audio[:max_samples]
             print(f"🔄 Audio trimmed to {max_duration}s")
-        # Save optimized audio
         optimized_path = audio_path.replace('.wav', '_opt.wav').replace('.mp3', '_opt.wav')
         sf.write(optimized_path, audio, sr)
         print(f"✅ Audio optimized: {optimized_path}")
         return optimized_path
@@ -100,12 +90,12 @@ def optimize_audio_input(audio_path, max_duration=30):
         return audio_path
 def safe_file_path(file_input, input_name="audio"):
-    """Safely extract file path from various input formats"""
     try:
         if file_input is None:
             return None
-        # If it's already a string path and exists
         if isinstance(file_input, str):
             if os.path.exists(file_input):
                 return file_input
@@ -119,54 +109,54 @@ def safe_file_path(file_input, input_name="audio"):
             if file_path and os.path.exists(file_path):
                 return file_path
-        # If it's a dict-like object (from API)
         if hasattr(file_input, 'get'):
             file_path = file_input.get('name') or file_input.get('path')
             if file_path and os.path.exists(file_path):
                 return file_path
-        print(f"⚠️ Could not extract valid file path from {input_name}: {type(file_input)}")
         return None
     except Exception as e:
         print(f"❌ Error processing {input_name}: {e}")
         return None
-def voice_to_voice_clone_corrected(reference_audio, input_audio, language="en"):
-    """CORRECTED voice cloning function with proper error handling"""
     try:
         print(f"🎭 Voice cloning request: {language}")
         print(f"📁 Input types - Ref: {type(reference_audio)}, Input: {type(input_audio)}")
-        # CRITICAL: Safely extract file paths
         reference_path = safe_file_path(reference_audio, "reference")
         input_path = safe_file_path(input_audio, "input")
         if not reference_path:
-            return None, "❌ Could not process reference audio. Please upload a valid audio file."
         if not input_path:
-            return None, "❌ Could not process input audio. Please upload a valid audio file."
         print(f"📁 Processing files - Ref: {reference_path}, Input: {input_path}")
-        # Validate files exist and have content
         if not os.path.exists(reference_path) or os.path.getsize(reference_path) < 1000:
-            return None, f"❌ Reference audio file is invalid or too small."
         if not os.path.exists(input_path) or os.path.getsize(input_path) < 1000:
-            return None, f"❌ Input audio file is invalid or too small."
         # Load models
         if not load_xtts_optimized():
-            return None, f"❌ XTTS model loading failed: {MODEL_STATUS}"
         load_whisper_optimized()
         # Optimize audio files
         print("🔄 Optimizing audio files...")
         ref_optimized = optimize_audio_input(reference_path, max_duration=20)
-        input_optimized = optimize_audio_input(input_path, max_duration=30)
         # Transcribe input audio
         extracted_text = "This is a voice cloning demonstration."
@@ -181,7 +171,7 @@ def voice_to_voice_clone_corrected(reference_audio, input_audio, language="en"):
                     )
                 text = result.get("text", "").strip()
                 if text and len(text) > 5:
-                    extracted_text = text[:500]  # Limit text length
                 print(f"✅ Transcribed: '{extracted_text[:50]}...'")
             except Exception as e:
                 print(f"⚠️ Transcription warning: {e}")
@@ -207,29 +197,29 @@ def voice_to_voice_clone_corrected(reference_audio, input_audio, language="en"):
             print(f"❌ TTS generation error: {tts_error}")
             return None, f"❌ Voice generation failed: {str(tts_error)}"
-        # Clean up memory
         if DEVICE == "cuda":
             torch.cuda.empty_cache()
         gc.collect()
-        # Validate output
         if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
             file_size_kb = os.path.getsize(output_path) / 1024
             success_message = f"""✅ VOICE CLONING SUCCESS! 🎉
-📝 Transcribed Text: "{extracted_text[:100]}{'...' if len(extracted_text) > 100 else ''}"
-🎭 Processing Device: {DEVICE}
-⚡ Model Status: {MODEL_STATUS}
-📊 Output Size: {file_size_kb:.1f} KB
-🌍 Language: {language.upper()}
-🔧 Optimizations: Audio trimming, Memory cleanup"""
             print("✅ Voice cloning completed successfully!")
             return output_path, success_message
         else:
-            return None, "❌ Voice cloning failed - output file is empty or corrupted."
     except Exception as e:
         error_msg = f"❌ Voice cloning error: {str(e)}"
@@ -238,19 +228,17 @@ def voice_to_voice_clone_corrected(reference_audio, input_audio, language="en"):
         print("Full traceback:", traceback.format_exc())
         return None, error_msg
-# CORRECTED: Gradio interface with proper configuration
 interface = gr.Interface(
-    fn=voice_to_voice_clone_corrected,
     inputs=[
         gr.Audio(
             label="🎤 Reference Audio (Voice to Clone)",
-            type="filepath",
-            sources=["upload"]
         ),
         gr.Audio(
             label="🎵 Input Audio (Content to Transform)",
-            type="filepath",
-            sources=["upload"]
         ),
         gr.Dropdown(
             choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
@@ -261,26 +249,26 @@ interface = gr.Interface(
     outputs=[
         gr.Audio(
             label="🎉 Cloned Voice Result",
-            type="filepath"
         ),
         gr.Textbox(
             label="📋 Processing Status",
-            lines=10,
-            max_lines=15
         )
     ],
-    title="🎭 AI Voice Cloning Studio - CORRECTED",
-    description="Transform any voice using XTTS-v2 and Whisper AI. Upload clear audio files (10-30 seconds each) for best results.",
     theme=gr.themes.Soft(),
     allow_flagging="never",
-    api_name="voice_to_voice_clone"
 )
 if __name__ == "__main__":
-    print("🌐 Launching CORRECTED Voice Cloning Studio...")
     interface.queue(
-        max_size=3,
         api_open=True,
         default_concurrency_limit=1
     ).launch(
@@ -288,5 +276,5 @@ if __name__ == "__main__":
         server_port=7860,
         share=False,
         show_api=True,
-        debug=True
     )

 warnings.filterwarnings("ignore")
 os.environ["COQUI_TOS_AGREED"] = "1"
+print("🚀 Starting FINAL CORRECTED Voice Cloning Studio...")
 @contextmanager
 def patch_torch_load():
 MODEL_STATUS = "Not Loaded"
 def load_xtts_optimized():
     global TTS_MODEL, MODEL_STATUS
     if TTS_MODEL is not None:
         return True
         with patch_torch_load():
             from TTS.api import TTS
             print("📦 Loading XTTS...")
             TTS_MODEL = TTS(
                 model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                 progress_bar=False,
                 gpu=(DEVICE == "cuda")
             )
             MODEL_STATUS = "XTTS-v2 Ready"
             print("✅ XTTS loaded successfully!")
             return True
         return False
 def load_whisper_optimized():
     global WHISPER_MODEL
     if WHISPER_MODEL is not None:
         return True
         print(f"❌ Whisper failed: {e}")
         return False
+def optimize_audio_input(audio_path, max_duration=25):
     try:
         if not os.path.exists(audio_path):
             print(f"⚠️ Audio file not found: {audio_path}")
             return audio_path
         audio, sr = librosa.load(audio_path, sr=22050)
         max_samples = int(max_duration * sr)
         if len(audio) > max_samples:
             audio = audio[:max_samples]
             print(f"🔄 Audio trimmed to {max_duration}s")
         optimized_path = audio_path.replace('.wav', '_opt.wav').replace('.mp3', '_opt.wav')
         sf.write(optimized_path, audio, sr)
         print(f"✅ Audio optimized: {optimized_path}")
         return optimized_path
         return audio_path
 def safe_file_path(file_input, input_name="audio"):
+    """Extract file path from various input formats"""
     try:
         if file_input is None:
             return None
+        # If it's already a string path
         if isinstance(file_input, str):
             if os.path.exists(file_input):
                 return file_input
             if file_path and os.path.exists(file_path):
                 return file_path
+        # If it's a dict-like object
         if hasattr(file_input, 'get'):
             file_path = file_input.get('name') or file_input.get('path')
             if file_path and os.path.exists(file_path):
                 return file_path
+        print(f"⚠️ Could not extract file path from {input_name}: {type(file_input)}")
         return None
     except Exception as e:
         print(f"❌ Error processing {input_name}: {e}")
         return None
+def voice_to_voice_clone_final(reference_audio, input_audio, language="en"):
+    """FINAL CORRECTED voice cloning function"""
     try:
         print(f"🎭 Voice cloning request: {language}")
         print(f"📁 Input types - Ref: {type(reference_audio)}, Input: {type(input_audio)}")
+        # Extract file paths safely
         reference_path = safe_file_path(reference_audio, "reference")
         input_path = safe_file_path(input_audio, "input")
         if not reference_path:
+            return None, "❌ Could not process reference audio file."
         if not input_path:
+            return None, "❌ Could not process input audio file."
         print(f"📁 Processing files - Ref: {reference_path}, Input: {input_path}")
+        # Validate files
         if not os.path.exists(reference_path) or os.path.getsize(reference_path) < 1000:
+            return None, "❌ Reference audio file is invalid."
         if not os.path.exists(input_path) or os.path.getsize(input_path) < 1000:
+            return None, "❌ Input audio file is invalid."
         # Load models
         if not load_xtts_optimized():
+            return None, f"❌ XTTS model failed: {MODEL_STATUS}"
         load_whisper_optimized()
         # Optimize audio files
         print("🔄 Optimizing audio files...")
         ref_optimized = optimize_audio_input(reference_path, max_duration=20)
+        input_optimized = optimize_audio_input(input_path, max_duration=25)
         # Transcribe input audio
         extracted_text = "This is a voice cloning demonstration."
                     )
                 text = result.get("text", "").strip()
                 if text and len(text) > 5:
+                    extracted_text = text[:400]
                 print(f"✅ Transcribed: '{extracted_text[:50]}...'")
             except Exception as e:
                 print(f"⚠️ Transcription warning: {e}")
             print(f"❌ TTS generation error: {tts_error}")
             return None, f"❌ Voice generation failed: {str(tts_error)}"
+        # Memory cleanup
         if DEVICE == "cuda":
             torch.cuda.empty_cache()
         gc.collect()
+        # Validate and return output
         if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
             file_size_kb = os.path.getsize(output_path) / 1024
             success_message = f"""✅ VOICE CLONING SUCCESS! 🎉
+📝 Text: "{extracted_text[:100]}{'...' if len(extracted_text) > 100 else ''}"
+🎭 Device: {DEVICE} | Model: {MODEL_STATUS}
+📊 Output: {file_size_kb:.1f} KB | Language: {language.upper()}
+🔧 Optimizations Applied Successfully"""
             print("✅ Voice cloning completed successfully!")
+            # CRITICAL FIX: Return file path directly for Gradio compatibility
             return output_path, success_message
         else:
+            return None, "❌ Voice cloning failed - output file is empty."
     except Exception as e:
         error_msg = f"❌ Voice cloning error: {str(e)}"
         print("Full traceback:", traceback.format_exc())
         return None, error_msg
+# CRITICAL: Use gr.Interface (not Blocks) for better API compatibility
 interface = gr.Interface(
+    fn=voice_to_voice_clone_final,
     inputs=[
         gr.Audio(
             label="🎤 Reference Audio (Voice to Clone)",
+            type="filepath"  # CRITICAL: Must be filepath for API compatibility
         ),
         gr.Audio(
             label="🎵 Input Audio (Content to Transform)",
+            type="filepath"  # CRITICAL: Must be filepath for API compatibility
         ),
         gr.Dropdown(
             choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
     outputs=[
         gr.Audio(
             label="🎉 Cloned Voice Result",
+            type="filepath"  # CRITICAL: Must be filepath for proper return
         ),
         gr.Textbox(
             label="📋 Processing Status",
+            lines=8
         )
     ],
+    title="🎭 AI Voice Cloning Studio - FINAL",
+    description="Transform voices using XTTS-v2 and Whisper AI. Upload clear audio files (10-30 seconds each).",
     theme=gr.themes.Soft(),
     allow_flagging="never",
+    api_name="voice_to_voice_clone"  # CRITICAL: API endpoint name
 )
 if __name__ == "__main__":
+    print("🌐 Launching FINAL CORRECTED Voice Cloning Studio...")
+    # CORRECTED: Proper queue configuration
     interface.queue(
+        max_size=2,  # Reduced for stability
         api_open=True,
         default_concurrency_limit=1
     ).launch(
         server_port=7860,
         share=False,
         show_api=True,
+        debug=False  # Disable debug for production
     )