Spaces:

crackuser
/

voiceclone-dev

Sleeping

App Files Community

crackuser committed on Sep 12, 2025

Commit

5f03eaa

verified ·

1 Parent(s): 3e9e2ab

Update app.py

Browse files

Files changed (1) hide show

app.py +149 -76

app.py CHANGED Viewed

@@ -6,10 +6,12 @@ import os
 import warnings
 from contextlib import contextmanager
 import gc
 warnings.filterwarnings("ignore")
 os.environ["COQUI_TOS_AGREED"] = "1"
-print("🚀 Starting OPTIMIZED Voice Cloning Studio...")
 @contextmanager
 def patch_torch_load():
@@ -23,27 +25,24 @@ def patch_torch_load():
     finally:
         torch.load = original_load
-# OPTIMIZATION 1: Hardware Detection and Setup
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-if DEVICE == "cuda":
-    torch.backends.cudnn.benchmark = True
-    torch.backends.cuda.matmul.allow_tf32 = True
 print(f"🔥 Device: {DEVICE}")
 TTS_MODEL = None
 WHISPER_MODEL = None
 MODEL_STATUS = "Not Loaded"
-SPEAKER_EMBEDDINGS_CACHE = {}
 def load_xtts_optimized():
     global TTS_MODEL, MODEL_STATUS
     if TTS_MODEL is not None:
         return True
     try:
         with patch_torch_load():
             from TTS.api import TTS
-            print("📦 Loading XTTS with optimizations...")
             TTS_MODEL = TTS(
                 model_name="tts_models/multilingual/multi-dataset/xtts_v2",
@@ -51,136 +50,205 @@ def load_xtts_optimized():
                 gpu=(DEVICE == "cuda")
             )
-            MODEL_STATUS = "XTTS-v2 Optimized"
-            print("✅ XTTS loaded with optimizations!")
             return True
     except Exception as e:
         print(f"❌ XTTS loading failed: {e}")
-        MODEL_STATUS = f"Failed: {str(e)}"
         return False
 def load_whisper_optimized():
     global WHISPER_MODEL
     if WHISPER_MODEL is not None:
         return True
     try:
         import whisper
         WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
-        print("✅ Whisper loaded (base model for speed)!")
         return True
     except Exception as e:
         print(f"❌ Whisper failed: {e}")
         return False
-def optimize_audio_input(audio_path, max_duration=15):
-    """Limit audio length for faster processing"""
     try:
-        import librosa
-        import soundfile as sf
         audio, sr = librosa.load(audio_path, sr=22050)
-        # Limit duration for speed
         max_samples = int(max_duration * sr)
         if len(audio) > max_samples:
             audio = audio[:max_samples]
-            print(f"🔄 Audio trimmed to {max_duration}s for speed")
         # Save optimized audio
-        optimized_path = audio_path.replace('.wav', '_opt.wav')
         sf.write(optimized_path, audio, sr)
         return optimized_path
     except Exception as e:
         print(f"⚠️ Audio optimization failed: {e}")
         return audio_path
-def voice_to_voice_clone_optimized(reference_audio, input_audio, language="en"):
-    """OPTIMIZED voice cloning with performance improvements"""
     try:
-        print(f"🎭 OPTIMIZED Voice cloning: {language}")
-        if not reference_audio or not input_audio:
-            return None, "❌ Upload both audio files!"
         # Load models
         if not load_xtts_optimized():
-            return None, f"❌ XTTS failed: {MODEL_STATUS}"
         load_whisper_optimized()
-        # Optimize input audios for speed
-        ref_optimized = optimize_audio_input(reference_audio, max_duration=15)
-        input_optimized = optimize_audio_input(input_audio, max_duration=20)
-        # Fast transcription with limits
-        extracted_text = "Voice cloning demonstration."
         if WHISPER_MODEL:
             try:
                 with torch.no_grad():
                     result = WHISPER_MODEL.transcribe(
                         input_optimized,
                         fp16=(DEVICE == "cuda"),
                         language=language if language != 'auto' else None
                     )
-                text = result.get("text", "").strip()[:300]  # Limit text length
-                if text and len(text) > 10:
-                    extracted_text = text
-                print(f"✅ Extracted: '{extracted_text[:50]}...'")
             except Exception as e:
-                print(f"⚠️ Transcription error: {e}")
-        # Generate output
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
-        print("🚀 Generating optimized voice clone...")
-        with patch_torch_load(), torch.no_grad():
-            TTS_MODEL.tts_to_file(
-                text=extracted_text,
-                speaker_wav=ref_optimized,
-                language=language,
-                file_path=output_path,
-                temperature=0.7,
-                length_penalty=1.0,
-                repetition_penalty=5.0
-            )
-        # Memory cleanup
         if DEVICE == "cuda":
             torch.cuda.empty_cache()
         gc.collect()
-        # Verify output
         if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
-            success_msg = f"""✅ OPTIMIZED CLONING SUCCESS! ⚡
-📝 Text: '{extracted_text[:100]}...'
-🎭 Device: {DEVICE}
-🔧 Status: {MODEL_STATUS}
-📊 Size: {os.path.getsize(output_path)/1024:.1f} KB
-🚀 Optimizations: Limited audio, FP16, Memory cleanup"""
-            print("✅ Optimized voice cloning completed!")
-            return output_path, success_msg
         else:
-            return None, "❌ Output file empty!"
     except Exception as e:
-        error_msg = f"❌ Optimized cloning error: {str(e)}"
         print(error_msg)
         return None, error_msg
-# Create Gradio interface
 interface = gr.Interface(
-    fn=voice_to_voice_clone_optimized,
     inputs=[
         gr.Audio(
-            label="🎤 Reference Audio (Voice to Clone - Max 15s recommended)",
             type="filepath",
             sources=["upload"]
         ),
         gr.Audio(
-            label="🎵 Input Audio (Content - Max 20s for speed)",
             type="filepath",
             sources=["upload"]
         ),
@@ -191,29 +259,34 @@ interface = gr.Interface(
         )
     ],
     outputs=[
-        gr.Audio(label="🎉 Optimized Cloned Voice"),
-        gr.Textbox(label="📊 Performance Stats", lines=8)
     ],
-    title="🚀 HIGH-SPEED Voice Cloning Studio",
-    description="⚡ Optimized XTTS-v2 with performance tuning. Use 10-20 second audio clips for fastest results (30-120 seconds processing time)!",
     theme=gr.themes.Soft(),
     allow_flagging="never",
     api_name="voice_to_voice_clone"
 )
 if __name__ == "__main__":
-    print("🌐 Launching OPTIMIZED Voice Cloning Studio...")
-    # FIXED: Correct queue configuration
     interface.queue(
-        max_size=5,  # Limit queue size to prevent overload
-        api_open=True,  # Allow API access
-        default_concurrency_limit=1  # Process one request at a time for stability
     ).launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
         show_api=True,
-        debug=False  # Disable debug for speed
-        # REMOVED: enable_queue=True (this was causing the error)
     )

 import warnings
 from contextlib import contextmanager
 import gc
+import librosa
+import soundfile as sf
 warnings.filterwarnings("ignore")
 os.environ["COQUI_TOS_AGREED"] = "1"
+print("🚀 Starting CORRECTED Voice Cloning Studio...")
 @contextmanager
 def patch_torch_load():
     finally:
         torch.load = original_load
+# Hardware setup
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🔥 Device: {DEVICE}")
+# Global model variables
 TTS_MODEL = None
 WHISPER_MODEL = None
 MODEL_STATUS = "Not Loaded"
 def load_xtts_optimized():
+    """Load XTTS model with optimizations"""
     global TTS_MODEL, MODEL_STATUS
     if TTS_MODEL is not None:
         return True
     try:
         with patch_torch_load():
             from TTS.api import TTS
+            print("📦 Loading XTTS...")
             TTS_MODEL = TTS(
                 model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                 gpu=(DEVICE == "cuda")
             )
+            MODEL_STATUS = "XTTS-v2 Ready"
+            print("✅ XTTS loaded successfully!")
             return True
     except Exception as e:
         print(f"❌ XTTS loading failed: {e}")
+        MODEL_STATUS = f"XTTS Failed: {str(e)}"
         return False
 def load_whisper_optimized():
+    """Load Whisper model for transcription"""
     global WHISPER_MODEL
     if WHISPER_MODEL is not None:
         return True
     try:
         import whisper
         WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
+        print("✅ Whisper loaded!")
         return True
     except Exception as e:
         print(f"❌ Whisper failed: {e}")
         return False
+def optimize_audio_input(audio_path, max_duration=30):
+    """Optimize audio file for processing"""
     try:
+        if not os.path.exists(audio_path):
+            print(f"⚠️ Audio file not found: {audio_path}")
+            return audio_path
+        # Load and optimize audio
         audio, sr = librosa.load(audio_path, sr=22050)
+        # Trim duration if too long
         max_samples = int(max_duration * sr)
         if len(audio) > max_samples:
             audio = audio[:max_samples]
+            print(f"🔄 Audio trimmed to {max_duration}s")
         # Save optimized audio
+        optimized_path = audio_path.replace('.wav', '_opt.wav').replace('.mp3', '_opt.wav')
         sf.write(optimized_path, audio, sr)
+        print(f"✅ Audio optimized: {optimized_path}")
         return optimized_path
     except Exception as e:
         print(f"⚠️ Audio optimization failed: {e}")
         return audio_path
+def safe_file_path(file_input, input_name="audio"):
+    """Safely extract file path from various input formats"""
     try:
+        if file_input is None:
+            return None
+        # If it's already a string path and exists
+        if isinstance(file_input, str):
+            if os.path.exists(file_input):
+                return file_input
+            else:
+                print(f"⚠️ File path doesn't exist: {file_input}")
+                return None
+        # If it's a file object with name attribute
+        if hasattr(file_input, 'name'):
+            file_path = file_input.name
+            if file_path and os.path.exists(file_path):
+                return file_path
+        # If it's a dict-like object (from API)
+        if hasattr(file_input, 'get'):
+            file_path = file_input.get('name') or file_input.get('path')
+            if file_path and os.path.exists(file_path):
+                return file_path
+        print(f"⚠️ Could not extract valid file path from {input_name}: {type(file_input)}")
+        return None
+    except Exception as e:
+        print(f"❌ Error processing {input_name}: {e}")
+        return None
+def voice_to_voice_clone_corrected(reference_audio, input_audio, language="en"):
+    """CORRECTED voice cloning function with proper error handling"""
+    try:
+        print(f"🎭 Voice cloning request: {language}")
+        print(f"📁 Input types - Ref: {type(reference_audio)}, Input: {type(input_audio)}")
+        # CRITICAL: Safely extract file paths
+        reference_path = safe_file_path(reference_audio, "reference")
+        input_path = safe_file_path(input_audio, "input")
+        if not reference_path:
+            return None, "❌ Could not process reference audio. Please upload a valid audio file."
+        if not input_path:
+            return None, "❌ Could not process input audio. Please upload a valid audio file."
+        print(f"📁 Processing files - Ref: {reference_path}, Input: {input_path}")
+        # Validate files exist and have content
+        if not os.path.exists(reference_path) or os.path.getsize(reference_path) < 1000:
+            return None, f"❌ Reference audio file is invalid or too small."
+        if not os.path.exists(input_path) or os.path.getsize(input_path) < 1000:
+            return None, f"❌ Input audio file is invalid or too small."
         # Load models
         if not load_xtts_optimized():
+            return None, f"❌ XTTS model loading failed: {MODEL_STATUS}"
         load_whisper_optimized()
+        # Optimize audio files
+        print("🔄 Optimizing audio files...")
+        ref_optimized = optimize_audio_input(reference_path, max_duration=20)
+        input_optimized = optimize_audio_input(input_path, max_duration=30)
+        # Transcribe input audio
+        extracted_text = "This is a voice cloning demonstration."
         if WHISPER_MODEL:
             try:
+                print("🎤 Transcribing audio...")
                 with torch.no_grad():
                     result = WHISPER_MODEL.transcribe(
                         input_optimized,
                         fp16=(DEVICE == "cuda"),
                         language=language if language != 'auto' else None
                     )
+                text = result.get("text", "").strip()
+                if text and len(text) > 5:
+                    extracted_text = text[:500]  # Limit text length
+                print(f"✅ Transcribed: '{extracted_text[:50]}...'")
             except Exception as e:
+                print(f"⚠️ Transcription warning: {e}")
+        # Generate cloned voice
+        print("🚀 Generating cloned voice...")
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
+        try:
+            with patch_torch_load(), torch.no_grad():
+                TTS_MODEL.tts_to_file(
+                    text=extracted_text,
+                    speaker_wav=ref_optimized,
+                    language=language,
+                    file_path=output_path,
+                    temperature=0.7,
+                    length_penalty=1.0,
+                    repetition_penalty=5.0
+                )
+        except Exception as tts_error:
+            print(f"❌ TTS generation error: {tts_error}")
+            return None, f"❌ Voice generation failed: {str(tts_error)}"
+        # Clean up memory
         if DEVICE == "cuda":
             torch.cuda.empty_cache()
         gc.collect()
+        # Validate output
         if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
+            file_size_kb = os.path.getsize(output_path) / 1024
+            success_message = f"""✅ VOICE CLONING SUCCESS! 🎉
+📝 Transcribed Text: "{extracted_text[:100]}{'...' if len(extracted_text) > 100 else ''}"
+🎭 Processing Device: {DEVICE}
+⚡ Model Status: {MODEL_STATUS}
+📊 Output Size: {file_size_kb:.1f} KB
+🌍 Language: {language.upper()}
+🔧 Optimizations: Audio trimming, Memory cleanup"""
+            print("✅ Voice cloning completed successfully!")
+            return output_path, success_message
         else:
+            return None, "❌ Voice cloning failed - output file is empty or corrupted."
     except Exception as e:
+        error_msg = f"❌ Voice cloning error: {str(e)}"
         print(error_msg)
+        import traceback
+        print("Full traceback:", traceback.format_exc())
         return None, error_msg
+# CORRECTED: Gradio interface with proper configuration
 interface = gr.Interface(
+    fn=voice_to_voice_clone_corrected,
     inputs=[
         gr.Audio(
+            label="🎤 Reference Audio (Voice to Clone)",
             type="filepath",
             sources=["upload"]
         ),
         gr.Audio(
+            label="🎵 Input Audio (Content to Transform)",
             type="filepath",
             sources=["upload"]
         ),
         )
     ],
     outputs=[
+        gr.Audio(
+            label="🎉 Cloned Voice Result",
+            type="filepath"
+        ),
+        gr.Textbox(
+            label="📋 Processing Status",
+            lines=10,
+            max_lines=15
+        )
     ],
+    title="🎭 AI Voice Cloning Studio - CORRECTED",
+    description="Transform any voice using XTTS-v2 and Whisper AI. Upload clear audio files (10-30 seconds each) for best results.",
     theme=gr.themes.Soft(),
     allow_flagging="never",
     api_name="voice_to_voice_clone"
 )
 if __name__ == "__main__":
+    print("🌐 Launching CORRECTED Voice Cloning Studio...")
     interface.queue(
+        max_size=3,
+        api_open=True,
+        default_concurrency_limit=1
     ).launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
         show_api=True,
+        debug=True
     )