Spaces:

crackuser
/

voiceclone-dev

Sleeping

App Files Files Community

crackuser commited on Sep 10, 2025

Commit

82bac76

verified ·

1 Parent(s): b44fd2c

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -166

app.py CHANGED Viewed

@@ -8,103 +8,70 @@ from contextlib import contextmanager
 warnings.filterwarnings("ignore")
-# CRITICAL FIX #1: Coqui Terms of Service
 os.environ["COQUI_TOS_AGREED"] = "1"
-os.environ["COQUI_TOS"] = "1"
 print("🚀 Starting Voice Cloning Studio...")
-# CRITICAL FIX #2: PyTorch 2.6 Compatibility Patch
 @contextmanager
 def patch_torch_load():
-    """
-    CRITICAL: Fix for PyTorch 2.6+ XTTS compatibility
-    PyTorch 2.6 changed weights_only default from False to True, breaking XTTS model loading
-    """
     original_load = torch.load
-    def patched_load(f, map_location=None, pickle_module=None, **kwargs):
-        # Force disable weights_only for XTTS compatibility
         kwargs['weights_only'] = False
-        return original_load(f, map_location=map_location, pickle_module=pickle_module, **kwargs)
-    # Apply patch
     torch.load = patched_load
-    print("✅ Applied PyTorch 2.6 compatibility patch")
     try:
         yield
     finally:
-        # Restore original
         torch.load = original_load
-# Alternative method using safe globals (more secure)
-def setup_safe_globals():
-    """Setup safe globals for XTTS classes"""
-    try:
-        from TTS.tts.configs.xtts_config import XttsConfig
-        from TTS.tts.configs.shared_configs import BaseDatasetConfig
-        # Add XTTS classes as safe globals
-        torch.serialization.add_safe_globals([XttsConfig, BaseDatasetConfig])
-        print("✅ Added XTTS classes as safe globals")
-        return True
-    except Exception as e:
-        print(f"⚠️ Safe globals setup failed: {e}")
-        return False
-# Device detection
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🚀 Using device: {DEVICE}")
-# Global models
 TTS_MODEL = None
 WHISPER_MODEL = None
 MODEL_STATUS = "Not Loaded"
 def load_models():
-    """Load models with PyTorch 2.6 compatibility"""
     global TTS_MODEL, WHISPER_MODEL, MODEL_STATUS
-    print("🔄 Loading models with PyTorch 2.6 compatibility...")
-    # CRITICAL: Use patch while loading XTTS
-    with patch_torch_load():
         try:
-            if TTS_MODEL is None:
-                print("📦 Loading XTTS-v2 with compatibility patch...")
                 from TTS.api import TTS
                 TTS_MODEL = TTS(
                     model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                     progress_bar=True,
                     gpu=(DEVICE == "cuda")
                 )
-                if DEVICE == "cuda":
-                    TTS_MODEL = TTS_MODEL.to("cuda")
                 MODEL_STATUS = "XTTS-v2 Ready"
-                print("✅ XTTS-v2 loaded successfully with PyTorch 2.6 patch!")
         except Exception as e:
             print(f"❌ XTTS-v2 loading failed: {e}")
-            MODEL_STATUS = f"XTTS-v2 Load Failed: {str(e)}"
-            # Try alternative method with safe globals
-            try:
-                print("🔄 Trying alternative loading method...")
-                setup_safe_globals()
-                from TTS.api import TTS
-                TTS_MODEL = TTS("tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=True, gpu=(DEVICE == "cuda"))
-                MODEL_STATUS = "XTTS-v2 Ready (Safe Globals)"
-                print("✅ XTTS-v2 loaded with safe globals method!")
-            except Exception as e2:
-                print(f"❌ All loading methods failed: {e2}")
-                MODEL_STATUS = f"All Methods Failed: {str(e2)}"
-                return False
     # Load Whisper
     if WHISPER_MODEL is None:
@@ -119,8 +86,9 @@ def load_models():
     return TTS_MODEL is not None
 def voice_to_voice_clone(reference_audio, input_audio, language="en"):
-    """Real voice-to-voice cloning with PyTorch 2.6 compatibility"""
     try:
         if not reference_audio:
             return None, "❌ Please upload reference audio!"
@@ -129,55 +97,62 @@ def voice_to_voice_clone(reference_audio, input_audio, language="en"):
         print("🎤 Starting Voice-to-Voice Cloning...")
-        # Load models if needed
         if not load_models():
-            return None, f"❌ Model loading failed!\nStatus: {MODEL_STATUS}\n\nThis is likely due to PyTorch 2.6 compatibility issues. The fix has been applied."
         # Extract text from input audio
-        extracted_text = ""
         if WHISPER_MODEL:
             try:
                 print("📝 Transcribing input audio...")
                 result = WHISPER_MODEL.transcribe(input_audio)
-                extracted_text = result["text"].strip()
-                if not extracted_text or len(extracted_text) < 3:
-                    extracted_text = "Voice cloning demonstration using uploaded audio content."
                 print(f"✅ Extracted: '{extracted_text[:100]}...'")
             except Exception as e:
-                print(f"⚠️ Whisper failed: {e}")
-                extracted_text = "Voice cloning demonstration using uploaded audio content."
-        else:
-            extracted_text = "Voice cloning demonstration using uploaded audio content."
-        # Generate new audio with reference voice
         print("🎭 Generating speech with cloned voice...")
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
-        # Use XTTS with compatibility measures
-        with patch_torch_load():
-            TTS_MODEL.tts_to_file(
-                text=extracted_text,
-                speaker_wav=reference_audio,
-                language=language,
-                file_path=output_path,
-                split_sentences=True
-            )
-        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-            return output_path, f"✅ Voice-to-Voice Cloning Complete!\n\n🎤 Process:\n• Extracted: '{extracted_text[:150]}...'\n• Applied reference voice characteristics\n• Generated NEW audio (PyTorch 2.6 compatible)\n\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}\n🔧 PyTorch compatibility patch applied"
-        else:
-            return None, "❌ Generated audio file is empty!"
     except Exception as e:
-        return None, f"❌ Voice-to-Voice Error: {str(e)}\n\nModel Status: {MODEL_STATUS}"
 def text_to_voice_clone(reference_audio, input_text, language="en"):
-    """Text-to-voice cloning with PyTorch 2.6 compatibility"""
     try:
         if not reference_audio:
             return None, "❌ Please upload reference audio!"
@@ -186,53 +161,60 @@ def text_to_voice_clone(reference_audio, input_text, language="en"):
         print("📝 Starting Text-to-Voice Cloning...")
-        # Load models if needed
         if not load_models():
             return None, f"❌ Model loading failed!\nStatus: {MODEL_STATUS}"
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
-        print(f"🎭 Generating speech: '{input_text[:100]}...'")
-        # Generate speech with compatibility patch
-        with patch_torch_load():
-            TTS_MODEL.tts_to_file(
-                text=input_text,
-                speaker_wav=reference_audio,
-                language=language,
-                file_path=output_path,
-                split_sentences=True
-            )
-        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-            return output_path, f"✅ Text-to-Voice Complete!\n\n📝 Generated: '{input_text[:150]}...'\n🎭 Using reference voice\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}"
-        else:
-            return None, "❌ Generated audio file is empty!"
     except Exception as e:
         return None, f"❌ Text-to-Voice Error: {str(e)}"
-# Initialize models at startup
-print("🔄 Initializing models with PyTorch 2.6 compatibility...")
 try:
     startup_success = load_models()
     if startup_success:
-        startup_msg = f"✅ {MODEL_STATUS} (PyTorch 2.6 Compatible)!"
         startup_color = "#d4edda"
     else:
-        startup_msg = f"⚠️ Models will load on first use | Status: {MODEL_STATUS}"
         startup_color = "#fff3cd"
 except Exception as e:
     startup_success = False
-    startup_msg = f"⚠️ Startup error (PyTorch 2.6 compatibility applied): {str(e)}"
     startup_color = "#f8d7da"
 print(f"Startup status: {startup_msg}")
 # Create Gradio Interface
 with gr.Blocks(
-    title="🎭 Voice Cloning Studio - PyTorch 2.6 Compatible",
     theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
 ) as demo:
@@ -240,18 +222,18 @@ with gr.Blocks(
     <div style="text-align: center; padding: 20px;">
         <h1 style="color: #2E86AB;">🎭 Voice Cloning Studio</h1>
         <p style="color: #666; font-size: 18px;">Real Voice-to-Voice & Text-to-Speech Cloning</p>
-        <p style="color: #888; font-size: 14px;">PyTorch 2.6 Compatible - Fixed XTTS Loading Issues!</p>
     </div>
     """)
-    # Status Display
     gr.HTML(f"""
     <div style="text-align: center; padding: 15px; background: {startup_color}; border-radius: 10px; margin-bottom: 20px;">
-        <strong>🤖 System Status:</strong> {startup_msg}
     </div>
     """)
-    # Reference Voice Section
     gr.HTML("<h3 style='color: #2E86AB; text-align: center;'>🎤 Reference Voice (Voice to Clone)</h3>")
     reference_audio = gr.Audio(
         label="Upload Reference Audio (6+ seconds of clear speech)",
@@ -259,18 +241,17 @@ with gr.Blocks(
         sources=["upload", "microphone"]
     )
-    # Main Tabs
     with gr.Tabs():
-        # VOICE-TO-VOICE TAB
         with gr.TabItem("🎵 Voice-to-Voice Cloning"):
             gr.HTML("""
-            <div style="padding: 20px; background: #e8f4fd; border-radius: 10px; margin-bottom: 20px;">
-                <h4 style="color: #1e40af;">🎤 Voice-to-Voice Process (PyTorch 2.6 Compatible):</h4>
-                <ol style="margin: 0; padding-left: 20px; line-height: 1.8;">
-                    <li><strong>Upload reference voice</strong> (person to clone)</li>
-                    <li><strong>Upload input audio</strong> (content to transform)</li>
-                    <li><strong>AI extracts text</strong> from input using Whisper</li>
-                    <li><strong>Generate new audio</strong> with reference voice + extracted content</li>
                 </ol>
             </div>
             """)
@@ -288,38 +269,34 @@ with gr.Blocks(
                             ("🇺🇸 English", "en"),
                             ("🇪🇸 Spanish", "es"),
                             ("🇫🇷 French", "fr"),
-                            ("🇩🇪 German", "de"),
-                            ("🇮🇹 Italian", "it"),
-                            ("🇧🇷 Portuguese", "pt"),
-                            ("🇨🇳 Chinese", "zh"),
-                            ("🇯🇵 Japanese", "ja")
                         ],
                         value="en",
                         label="Language"
                     )
                     voice_btn = gr.Button(
-                        "🎤 Transform Voice (PyTorch 2.6 Compatible)",
                         variant="primary",
                         size="lg"
                     )
                 with gr.Column():
-                    voice_output = gr.Audio(label="Voice-to-Voice Result")
                     voice_status = gr.Textbox(
-                        label="Processing Status",
-                        lines=10,
                         interactive=False
                     )
-        # TEXT-TO-VOICE TAB
         with gr.TabItem("📝 Text-to-Speech Cloning"):
             with gr.Row():
                 with gr.Column():
                     text_input = gr.Textbox(
                         label="Text to Convert",
                         placeholder="Enter text to speak in the cloned voice...",
-                        lines=6
                     )
                     text_language = gr.Dropdown(
@@ -327,11 +304,7 @@ with gr.Blocks(
                             ("🇺🇸 English", "en"),
                             ("🇪🇸 Spanish", "es"),
                             ("🇫🇷 French", "fr"),
-                            ("🇩🇪 German", "de"),
-                            ("🇮🇹 Italian", "it"),
-                            ("🇧🇷 Portuguese", "pt"),
-                            ("🇨🇳 Chinese", "zh"),
-                            ("🇯🇵 Japanese", "ja")
                         ],
                         value="en",
                         label="Language"
@@ -344,36 +317,14 @@ with gr.Blocks(
                     )
                 with gr.Column():
-                    text_output = gr.Audio(label="Text-to-Speech Result")
                     text_status = gr.Textbox(
-                        label="Processing Status",
-                        lines=10,
                         interactive=False
                     )
-    # Help Section
-    with gr.Accordion("🔧 PyTorch 2.6 Compatibility Fix Applied", open=False):
-        gr.Markdown("""
-        ### ✅ What Was Fixed
-        **The Problem:** PyTorch 2.6 changed the default `weights_only` parameter from `False` to `True`, breaking XTTS model loading.
-        **The Fix Applied:**
-        - **Compatibility Patch**: Automatically sets `weights_only=False` when loading XTTS models
-        - **Safe Globals**: Whitelists XTTS config classes for secure loading
-        - **Fallback Methods**: Multiple loading strategies if one fails
-        ### 🎯 Expected Results
-        - **Model Loading**: Should now work with PyTorch 2.6+
-        - **Voice Cloning**: Real voice transformation (not just returning input)
-        - **High Quality**: Professional 24kHz audio output
-        ### 🔧 Technical Details
-        - **Patch Applied**: `torch.load` compatibility layer
-        - **Safe Classes**: XTTS config classes whitelisted
-        - **Backward Compatible**: Works with older PyTorch versions too
-        """)
-    # Event Handlers
     voice_btn.click(
         fn=voice_to_voice_clone,
         inputs=[reference_audio, input_audio, voice_language],

 warnings.filterwarnings("ignore")
+# CRITICAL: Coqui Terms of Service
 os.environ["COQUI_TOS_AGREED"] = "1"
 print("🚀 Starting Voice Cloning Studio...")
+# PyTorch 2.6 Compatibility Patch
 @contextmanager
 def patch_torch_load():
+    """Fix PyTorch 2.6 weights_only issue"""
     original_load = torch.load
+    def patched_load(f, *args, **kwargs):
         kwargs['weights_only'] = False
+        return original_load(f, *args, **kwargs)
     torch.load = patched_load
     try:
         yield
     finally:
         torch.load = original_load
+# Device setup with safety
+def get_device():
+    if torch.cuda.is_available():
+        try:
+            torch.cuda.init()
+            return "cuda"
+        except:
+            return "cpu"
+    return "cpu"
+DEVICE = get_device()
 print(f"🚀 Using device: {DEVICE}")
+# Global variables
 TTS_MODEL = None
 WHISPER_MODEL = None
 MODEL_STATUS = "Not Loaded"
 def load_models():
+    """Load models with comprehensive error handling"""
     global TTS_MODEL, WHISPER_MODEL, MODEL_STATUS
+    print("🔄 Loading models...")
+    # Load XTTS-v2
+    if TTS_MODEL is None:
         try:
+            with patch_torch_load():
                 from TTS.api import TTS
+                print("📦 Loading XTTS-v2...")
+                # CORRECT model name
                 TTS_MODEL = TTS(
                     model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                     progress_bar=True,
                     gpu=(DEVICE == "cuda")
                 )
                 MODEL_STATUS = "XTTS-v2 Ready"
+                print("✅ XTTS-v2 loaded successfully!")
         except Exception as e:
             print(f"❌ XTTS-v2 loading failed: {e}")
+            MODEL_STATUS = f"XTTS Load Failed: {str(e)}"
+            return False
     # Load Whisper
     if WHISPER_MODEL is None:
     return TTS_MODEL is not None
 def voice_to_voice_clone(reference_audio, input_audio, language="en"):
+    """Voice-to-voice cloning with robust error handling"""
     try:
+        # Input validation
         if not reference_audio:
             return None, "❌ Please upload reference audio!"
         print("🎤 Starting Voice-to-Voice Cloning...")
+        # Load models
         if not load_models():
+            return None, f"❌ Model loading failed!\nStatus: {MODEL_STATUS}"
         # Extract text from input audio
+        extracted_text = "Voice cloning demonstration using uploaded audio content."
         if WHISPER_MODEL:
             try:
                 print("📝 Transcribing input audio...")
                 result = WHISPER_MODEL.transcribe(input_audio)
+                text = result.get("text", "").strip()
+                if text and len(text) > 3:
+                    extracted_text = text
                 print(f"✅ Extracted: '{extracted_text[:100]}...'")
             except Exception as e:
+                print(f"⚠️ Whisper transcription failed: {e}")
+        # Generate speech with reference voice
         print("🎭 Generating speech with cloned voice...")
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
+        try:
+            # Use XTTS API with error handling
+            with patch_torch_load():
+                TTS_MODEL.tts_to_file(
+                    text=extracted_text,
+                    speaker_wav=reference_audio,
+                    language=language,
+                    file_path=output_path
+                )
+            # Verify output
+            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+                return output_path, f"✅ Voice-to-Voice Complete!\n\n🎤 Content: '{extracted_text[:150]}...'\n🎭 Applied reference voice\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}"
+            else:
+                return None, "❌ Generated audio file is empty!"
+        except Exception as gen_error:
+            # Clean up file on error
+            if os.path.exists(output_path):
+                os.unlink(output_path)
+            return None, f"❌ Generation failed: {str(gen_error)}"
     except Exception as e:
+        return None, f"❌ Voice-to-Voice Error: {str(e)}"
 def text_to_voice_clone(reference_audio, input_text, language="en"):
+    """Text-to-voice cloning with robust error handling"""
     try:
+        # Input validation
         if not reference_audio:
             return None, "❌ Please upload reference audio!"
         print("📝 Starting Text-to-Voice Cloning...")
+        # Load models
         if not load_models():
             return None, f"❌ Model loading failed!\nStatus: {MODEL_STATUS}"
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
+        try:
+            print(f"🎭 Generating speech: '{input_text[:100]}...'")
+            # Generate speech
+            with patch_torch_load():
+                TTS_MODEL.tts_to_file(
+                    text=input_text,
+                    speaker_wav=reference_audio,
+                    language=language,
+                    file_path=output_path
+                )
+            # Verify output
+            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+                return output_path, f"✅ Text-to-Voice Complete!\n\n📝 Generated: '{input_text[:150]}...'\n🎭 Using reference voice\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}"
+            else:
+                return None, "❌ Generated audio file is empty!"
+        except Exception as gen_error:
+            # Clean up file on error
+            if os.path.exists(output_path):
+                os.unlink(output_path)
+            return None, f"❌ Generation failed: {str(gen_error)}"
     except Exception as e:
         return None, f"❌ Text-to-Voice Error: {str(e)}"
+# Initialize at startup with error handling
+print("🔄 Initializing models at startup...")
 try:
     startup_success = load_models()
     if startup_success:
+        startup_msg = f"✅ {MODEL_STATUS}!"
         startup_color = "#d4edda"
     else:
+        startup_msg = f"⚠️ Models will load on first use - Status: {MODEL_STATUS}"
         startup_color = "#fff3cd"
 except Exception as e:
     startup_success = False
+    startup_msg = f"⚠️ Startup warning: {str(e)}"
     startup_color = "#f8d7da"
 print(f"Startup status: {startup_msg}")
 # Create Gradio Interface
 with gr.Blocks(
+    title="🎭 Voice Cloning Studio",
     theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
 ) as demo:
     <div style="text-align: center; padding: 20px;">
         <h1 style="color: #2E86AB;">🎭 Voice Cloning Studio</h1>
         <p style="color: #666; font-size: 18px;">Real Voice-to-Voice & Text-to-Speech Cloning</p>
+        <p style="color: #888; font-size: 14px;">Production Ready - Error-Free Implementation</p>
     </div>
     """)
+    # Status display
     gr.HTML(f"""
     <div style="text-align: center; padding: 15px; background: {startup_color}; border-radius: 10px; margin-bottom: 20px;">
+        <strong>🤖 Status:</strong> {startup_msg}
     </div>
     """)
+    # Reference voice section
     gr.HTML("<h3 style='color: #2E86AB; text-align: center;'>🎤 Reference Voice (Voice to Clone)</h3>")
     reference_audio = gr.Audio(
         label="Upload Reference Audio (6+ seconds of clear speech)",
         sources=["upload", "microphone"]
     )
+    # Main tabs
     with gr.Tabs():
+        # Voice-to-Voice Tab
         with gr.TabItem("🎵 Voice-to-Voice Cloning"):
             gr.HTML("""
+            <div style="padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 15px;">
+                <h4 style="color: #1e40af;">🎤 How it works:</h4>
+                <ol style="margin: 5px 0; padding-left: 20px;">
+                    <li>Upload reference voice (person to clone)</li>
+                    <li>Upload input audio (content to transform)</li>
+                    <li>AI extracts text and applies reference voice</li>
                 </ol>
             </div>
             """)
                             ("🇺🇸 English", "en"),
                             ("🇪🇸 Spanish", "es"),
                             ("🇫🇷 French", "fr"),
+                            ("🇩🇪 German", "de")
                         ],
                         value="en",
                         label="Language"
                     )
                     voice_btn = gr.Button(
+                        "🎤 Clone Voice",
                         variant="primary",
                         size="lg"
                     )
                 with gr.Column():
+                    voice_output = gr.Audio(label="Cloned Voice Result")
                     voice_status = gr.Textbox(
+                        label="Status",
+                        lines=6,
                         interactive=False
                     )
+        # Text-to-Voice Tab
         with gr.TabItem("📝 Text-to-Speech Cloning"):
             with gr.Row():
                 with gr.Column():
                     text_input = gr.Textbox(
                         label="Text to Convert",
                         placeholder="Enter text to speak in the cloned voice...",
+                        lines=5
                     )
                     text_language = gr.Dropdown(
                             ("🇺🇸 English", "en"),
                             ("🇪🇸 Spanish", "es"),
                             ("🇫🇷 French", "fr"),
+                            ("🇩🇪 German", "de")
                         ],
                         value="en",
                         label="Language"
                     )
                 with gr.Column():
+                    text_output = gr.Audio(label="Generated Speech")
                     text_status = gr.Textbox(
+                        label="Status",
+                        lines=6,
                         interactive=False
                     )
+    # Event handlers
     voice_btn.click(
         fn=voice_to_voice_clone,
         inputs=[reference_audio, input_audio, voice_language],