Spaces:

crackuser
/

voiceclone-dev

Sleeping

App Files Files Community

crackuser commited on Sep 11, 2025

Commit

af41746

verified ·

1 Parent(s): f9abe8a

Update app.py

Browse files

Files changed (1) hide show

app.py +148 -340

app.py CHANGED Viewed

@@ -5,23 +5,32 @@ import tempfile
 import os
 import warnings
 from contextlib import contextmanager
-import numpy as np
 warnings.filterwarnings("ignore")
 # CRITICAL: Coqui Terms of Service
 os.environ["COQUI_TOS_AGREED"] = "1"
-print("🚀 Starting Voice Cloning Studio...")
-# PyTorch 2.6 Compatibility Patch
 @contextmanager
-def patch_torch_load():
-    """Fix PyTorch 2.6 weights_only issue"""
     original_load = torch.load
     def patched_load(f, *args, **kwargs):
         kwargs['weights_only'] = False
         return original_load(f, *args, **kwargs)
     torch.load = patched_load
     try:
         yield
@@ -35,402 +44,201 @@ print(f"🚀 Using device: {DEVICE}")
 # Global variables
 TTS_MODEL = None
 WHISPER_MODEL = None
-MODEL_STATUS = "Not Loaded"
 def load_models():
-    """Load models with correct error handling"""
-    global TTS_MODEL, WHISPER_MODEL, MODEL_STATUS
-    print("🔄 Loading models...")
-    # Load XTTS-v2
     if TTS_MODEL is None:
         try:
-            with patch_torch_load():
                 from TTS.api import TTS
-                print("📦 Loading XTTS-v2...")
                 TTS_MODEL = TTS(
                     model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                     progress_bar=True,
                     gpu=(DEVICE == "cuda")
                 )
-                MODEL_STATUS = "XTTS-v2 Ready"
-                print("✅ XTTS-v2 loaded successfully!")
-                # CRITICAL: Verify the model has the correct methods
-                if hasattr(TTS_MODEL, 'tts') and hasattr(TTS_MODEL, 'tts_to_file'):
-                    print("✅ Verified: TTS model has correct API methods")
-                else:
-                    print("❌ Warning: TTS model missing expected methods")
         except Exception as e:
-            print(f"❌ XTTS-v2 loading failed: {e}")
-            MODEL_STATUS = f"XTTS Load Failed: {str(e)}"
             return False
-    # Load Whisper
     if WHISPER_MODEL is None:
         try:
-            print("📦 Loading Whisper...")
             import whisper
             WHISPER_MODEL = whisper.load_model("base")
-            print("✅ Whisper loaded successfully!")
         except Exception as e:
-            print(f"❌ Whisper loading failed: {e}")
     return TTS_MODEL is not None
-def voice_to_voice_clone(reference_audio, input_audio, language="en"):
-    """
-    CORRECTED: Uses tts() method instead of generate()
-    """
     try:
-        # Input validation
-        if not reference_audio:
-            return None, "❌ Please upload reference audio!"
-        if not input_audio:
-            return None, "❌ Please upload input audio!"
-        print("🎤 Starting Voice-to-Voice Cloning...")
-        # Load models
         if not load_models():
-            return None, f"❌ Model loading failed!\nStatus: {MODEL_STATUS}"
-        # Extract text from input audio
-        extracted_text = "Voice cloning demonstration using uploaded audio content."
         if WHISPER_MODEL:
             try:
-                print("📝 Transcribing input audio...")
                 result = WHISPER_MODEL.transcribe(input_audio)
-                text = result.get("text", "").strip()
-                if text and len(text) > 3:
-                    extracted_text = text
-                print(f"✅ Extracted: '{extracted_text[:100]}...'")
             except Exception as e:
-                print(f"⚠️ Whisper transcription failed: {e}")
-        # CRITICAL FIX: Use tts() method, not generate()
-        print("🎭 Generating speech with CORRECT XTTS API...")
-        try:
-            with patch_torch_load():
-                # METHOD 1: Use tts() method that returns numpy array
-                wav_array = TTS_MODEL.tts(
-                    text=extracted_text,
-                    speaker_wav=reference_audio,
-                    language=language
-                )
-                print(f"✅ Generated audio array with shape: {np.array(wav_array).shape}")
-                # Convert numpy array to tensor and save
-                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-                    output_path = tmp_file.name
-                # Convert to tensor and save
-                if isinstance(wav_array, np.ndarray):
-                    wav_tensor = torch.tensor(wav_array, dtype=torch.float32).unsqueeze(0)
-                else:
-                    wav_tensor = torch.tensor(wav_array, dtype=torch.float32)
-                    if wav_tensor.dim() == 1:
-                        wav_tensor = wav_tensor.unsqueeze(0)
-                # Save with correct sample rate
-                sample_rate = getattr(TTS_MODEL, 'synthesizer', {}).get('output_sample_rate', 24000) or 24000
-                torchaudio.save(output_path, wav_tensor, sample_rate)
-                # Verify output
-                if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-                    return output_path, f"✅ Voice-to-Voice Complete!\n\n🎤 Content: '{extracted_text[:150]}...'\n🎭 Applied reference voice\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}\n🔧 Used: tts() method (CORRECT API)"
-                else:
-                    return None, "❌ Generated audio file is empty!"
-        except Exception as gen_error:
-            # Fallback: Try tts_to_file method
-            try:
-                print("🔄 Trying fallback method: tts_to_file()...")
-                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-                    output_path = tmp_file.name
-                with patch_torch_load():
-                    TTS_MODEL.tts_to_file(
-                        text=extracted_text,
-                        speaker_wav=reference_audio,
-                        language=language,
-                        file_path=output_path
-                    )
-                if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-                    return output_path, f"✅ Voice-to-Voice Complete (Fallback)!\n\n🎤 Content: '{extracted_text[:150]}...'\n🎭 Applied reference voice\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}\n🔧 Used: tts_to_file() method"
-                else:
-                    return None, "❌ Generated audio file is empty!"
-            except Exception as fallback_error:
-                return None, f"❌ Generation failed:\nPrimary error: {str(gen_error)}\nFallback error: {str(fallback_error)}\n\nTip: The model doesn't have a 'generate()' method. Use 'tts()' or 'tts_to_file()' instead."
     except Exception as e:
-        return None, f"❌ Voice-to-Voice Error: {str(e)}"
-def text_to_voice_clone(reference_audio, input_text, language="en"):
-    """
-    CORRECTED: Uses tts() method instead of generate()
-    """
     try:
-        # Input validation
-        if not reference_audio:
-            return None, "❌ Please upload reference audio!"
-        if not input_text or not input_text.strip():
-            return None, "❌ Please enter text to convert!"
-        print("📝 Starting Text-to-Voice Cloning...")
-        # Load models
-        if not load_models():
-            return None, f"❌ Model loading failed!\nStatus: {MODEL_STATUS}"
-        print(f"🎭 Generating speech: '{input_text[:100]}...'")
-        try:
-            with patch_torch_load():
-                # METHOD 1: Use tts() method that returns numpy array
-                wav_array = TTS_MODEL.tts(
-                    text=input_text,
-                    speaker_wav=reference_audio,
-                    language=language
-                )
-                print(f"✅ Generated audio array with shape: {np.array(wav_array).shape}")
-                # Convert numpy array to tensor and save
-                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-                    output_path = tmp_file.name
-                # Convert to tensor and save
-                if isinstance(wav_array, np.ndarray):
-                    wav_tensor = torch.tensor(wav_array, dtype=torch.float32).unsqueeze(0)
-                else:
-                    wav_tensor = torch.tensor(wav_array, dtype=torch.float32)
-                    if wav_tensor.dim() == 1:
-                        wav_tensor = wav_tensor.unsqueeze(0)
-                # Save with correct sample rate
-                sample_rate = getattr(TTS_MODEL, 'synthesizer', {}).get('output_sample_rate', 24000) or 24000
-                torchaudio.save(output_path, wav_tensor, sample_rate)
-                if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-                    return output_path, f"✅ Text-to-Voice Complete!\n\n📝 Generated: '{input_text[:150]}...'\n🎭 Using reference voice\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}\n🔧 Used: tts() method (CORRECT API)"
-                else:
-                    return None, "❌ Generated audio file is empty!"
-        except Exception as gen_error:
-            # Fallback: Try tts_to_file method
-            try:
-                print("🔄 Trying fallback method: tts_to_file()...")
-                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-                    output_path = tmp_file.name
-                with patch_torch_load():
-                    TTS_MODEL.tts_to_file(
-                        text=input_text,
-                        speaker_wav=reference_audio,
-                        language=language,
-                        file_path=output_path
-                    )
-                if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-                    return output_path, f"✅ Text-to-Voice Complete (Fallback)!\n\n📝 Generated: '{input_text[:150]}...'\n🎭 Using reference voice\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}\n🔧 Used: tts_to_file() method"
-                else:
-                    return None, "❌ Generated audio file is empty!"
-            except Exception as fallback_error:
-                return None, f"❌ Generation failed:\nPrimary error: {str(gen_error)}\nFallback error: {str(fallback_error)}\n\nTip: The model doesn't have a 'generate()' method. Use 'tts()' or 'tts_to_file()' instead."
     except Exception as e:
-        return None, f"❌ Text-to-Voice Error: {str(e)}"
-# Initialize at startup
-print("🔄 Initializing models at startup...")
-try:
-    startup_success = load_models()
-    if startup_success:
-        startup_msg = f"✅ {MODEL_STATUS}!"
-        startup_color = "#d4edda"
-    else:
-        startup_msg = f"⚠️ Models will load on first use - {MODEL_STATUS}"
-        startup_color = "#fff3cd"
-except Exception as e:
-    startup_success = False
-    startup_msg = f"⚠️ Startup warning: {str(e)}"
-    startup_color = "#f8d7da"
-print(f"Startup status: {startup_msg}")
 # Create Gradio Interface
-with gr.Blocks(
-    title="🎭 Voice Cloning Studio - API Fixed",
-    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
-) as demo:
     gr.HTML("""
     <div style="text-align: center; padding: 20px;">
-        <h1 style="color: #2E86AB;">🎭 Voice Cloning Studio</h1>
-        <p style="color: #666; font-size: 18px;">Real Voice-to-Voice & Text-to-Speech Cloning</p>
-        <p style="color: #888; font-size: 14px;">Fixed: Uses tts() method instead of generate() - No More API Errors!</p>
     </div>
     """)
-    # Status display
-    gr.HTML(f"""
-    <div style="text-align: center; padding: 15px; background: {startup_color}; border-radius: 10px; margin-bottom: 20px;">
-        <strong>🤖 Status:</strong> {startup_msg}
     </div>
     """)
-    # Reference voice section
-    gr.HTML("<h3 style='color: #2E86AB; text-align: center;'>🎤 Reference Voice (Voice to Clone)</h3>")
     reference_audio = gr.Audio(
-        label="Upload Reference Audio (6+ seconds of clear speech)",
         type="filepath",
         sources=["upload", "microphone"]
     )
-    # Main tabs
     with gr.Tabs():
-        # Voice-to-Voice Tab
-        with gr.TabItem("🎵 Voice-to-Voice Cloning"):
-            gr.HTML("""
-            <div style="padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 15px;">
-                <h4 style="color: #1e40af;">🎤 API Fixed - Now Uses Correct Methods:</h4>
-                <ul style="margin: 5px 0; padding-left: 20px;">
-                    <li>✅ Uses <code>model.tts()</code> method (correct)</li>
-                    <li>❌ No longer tries <code>model.generate()</code> (doesn't exist)</li>
-                    <li>🔄 Fallback to <code>model.tts_to_file()</code> if needed</li>
-                </ul>
-            </div>
-            """)
-            with gr.Row():
-                with gr.Column():
-                    input_audio = gr.Audio(
-                        label="Input Audio (Content to Transform)",
-                        type="filepath",
-                        sources=["upload", "microphone"]
-                    )
-                    voice_language = gr.Dropdown(
-                        choices=[
-                            ("🇺🇸 English", "en"),
-                            ("🇪🇸 Spanish", "es"),
-                            ("🇫🇷 French", "fr"),
-                            ("🇩🇪 German", "de")
-                        ],
-                        value="en",
-                        label="Language"
-                    )
-                    voice_btn = gr.Button(
-                        "🎤 Clone Voice (API Fixed)",
-                        variant="primary",
-                        size="lg"
-                    )
-                with gr.Column():
-                    voice_output = gr.Audio(label="Cloned Voice Result")
-                    voice_status = gr.Textbox(
-                        label="Status",
-                        lines=8,
-                        interactive=False
-                    )
-        # Text-to-Voice Tab
-        with gr.TabItem("📝 Text-to-Speech Cloning"):
-            with gr.Row():
-                with gr.Column():
-                    text_input = gr.Textbox(
-                        label="Text to Convert",
-                        placeholder="Enter text to speak in the cloned voice...",
-                        lines=5
-                    )
-                    text_language = gr.Dropdown(
-                        choices=[
-                            ("🇺🇸 English", "en"),
-                            ("🇪🇸 Spanish", "es"),
-                            ("🇫🇷 French", "fr"),
-                            ("🇩🇪 German", "de")
-                        ],
-                        value="en",
-                        label="Language"
-                    )
-                    text_btn = gr.Button(
-                        "📝 Generate Speech (API Fixed)",
-                        variant="secondary",
-                        size="lg"
-                    )
-                with gr.Column():
-                    text_output = gr.Audio(label="Generated Speech")
-                    text_status = gr.Textbox(
-                        label="Status",
-                        lines=8,
-                        interactive=False
-                    )
-    # Help section
-    with gr.Accordion("🔧 API Fix Explanation", open=False):
-        gr.Markdown("""
-        ### ✅ What Was Fixed
-        **The Problem:** Your code was trying to call `model.generate()` which doesn't exist on XTTS models.
-        **The Solution:**
-        - **Primary Method:** `model.tts()` - Returns numpy array that we convert and save
-        - **Fallback Method:** `model.tts_to_file()` - Saves directly to file
-        - **Removed:** All calls to `model.generate()` (doesn't exist)
-        ### 📋 XTTS API Reference
-        ```
-        # ✅ CORRECT - What we now use:
-        wav = model.tts(text=text, speaker_wav=reference_audio, language=language)
-        # ✅ ALTERNATIVE - Also works:
-        model.tts_to_file(text=text, speaker_wav=reference_audio, language=language, file_path=output)
-        # ❌ WRONG - What was causing the error:
-        model.generate()  # This method doesn't exist!
-        ```
-        ### 🚀 Expected Results
-        - **No More API Errors:** `'GPT2InferenceModel' object has no attribute 'generate'` is fixed
-        - **Working Voice Cloning:** Real audio transformation using correct XTTS methods
-        - **Robust Fallbacks:** If primary method fails, tries alternative approach
-        """)
-    # Event handlers
-    voice_btn.click(
-        fn=voice_to_voice_clone,
-        inputs=[reference_audio, input_audio, voice_language],
-        outputs=[voice_output, voice_status],
-        show_progress=True
-    )
-    text_btn.click(
-        fn=text_to_voice_clone,
-        inputs=[reference_audio, text_input, text_language],
-        outputs=[text_output, text_status],
-        show_progress=True
-    )
 if __name__ == "__main__":
     demo.launch()

 import os
 import warnings
 from contextlib import contextmanager
 warnings.filterwarnings("ignore")
 # CRITICAL: Coqui Terms of Service
 os.environ["COQUI_TOS_AGREED"] = "1"
+print("🚀 Starting Voice Cloning Studio with Fixed Package...")
+# PyTorch 2.6 Compatibility + Safe Globals Fix
 @contextmanager
+def fix_torch_load():
+    """Complete fix for PyTorch 2.6 and XTTS loading"""
     original_load = torch.load
     def patched_load(f, *args, **kwargs):
         kwargs['weights_only'] = False
         return original_load(f, *args, **kwargs)
+    # Add safe globals for XTTS classes
+    try:
+        from TTS.tts.configs.xtts_config import XttsConfig
+        from TTS.tts.configs.shared_configs import BaseDatasetConfig
+        torch.serialization.add_safe_globals([XttsConfig, BaseDatasetConfig])
+    except:
+        pass
     torch.load = patched_load
     try:
         yield
 # Global variables
 TTS_MODEL = None
 WHISPER_MODEL = None
 def load_models():
+    """Load models with the FIXED coqui-tts package"""
+    global TTS_MODEL, WHISPER_MODEL
     if TTS_MODEL is None:
         try:
+            with fix_torch_load():
+                # Use the FIXED coqui-tts package
                 from TTS.api import TTS
+                print("📦 Loading XTTS-v2 with FIXED package...")
                 TTS_MODEL = TTS(
                     model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                     progress_bar=True,
                     gpu=(DEVICE == "cuda")
                 )
+                print("✅ XTTS-v2 loaded with FIXED package!")
         except Exception as e:
+            print(f"❌ Model loading failed: {e}")
             return False
     if WHISPER_MODEL is None:
         try:
             import whisper
             WHISPER_MODEL = whisper.load_model("base")
+            print("✅ Whisper loaded!")
         except Exception as e:
+            print(f"❌ Whisper failed: {e}")
     return TTS_MODEL is not None
+def voice_clone(reference_audio, input_audio, language="en"):
+    """Voice cloning with COMPLETELY FIXED implementation"""
     try:
+        if not reference_audio or not input_audio:
+            return None, "❌ Upload both audio files!"
         if not load_models():
+            return None, "❌ Models failed to load! Check if coqui-tts package is installed correctly."
+        # Extract text using Whisper
+        text = "Voice cloning demonstration."
         if WHISPER_MODEL:
             try:
                 result = WHISPER_MODEL.transcribe(input_audio)
+                extracted = result.get("text", "").strip()
+                if extracted and len(extracted) > 3:
+                    text = extracted
+                print(f"✅ Extracted text: {text[:50]}...")
             except Exception as e:
+                print(f"⚠️ Whisper error: {e}")
+        # Generate speech using FIXED package
+        print("🎭 Generating speech with FIXED coqui-tts...")
+        with fix_torch_load():
+            # Use the correct API that works with the fixed package
+            wav = TTS_MODEL.tts(
+                text=text,
+                speaker_wav=reference_audio,
+                language=language
+            )
+        # Save audio
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            output_path = tmp.name
+        # Convert to tensor and save
+        wav_tensor = torch.FloatTensor(wav)
+        if wav_tensor.dim() == 1:
+            wav_tensor = wav_tensor.unsqueeze(0)
+        sample_rate = 22050  # Standard XTTS sample rate
+        torchaudio.save(output_path, wav_tensor, sample_rate)
+        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+            return output_path, f"✅ SUCCESS with FIXED package!\n\n🎤 Text: {text[:100]}...\n🔧 Package: coqui-tts (maintained fork)\n📊 Language: {language}\n🎭 Voice cloning completed!"
+        else:
+            return None, "❌ Output file is empty!"
     except Exception as e:
+        return None, f"❌ Error: {str(e)}\n\n💡 Make sure you're using 'coqui-tts' package, not 'TTS'!"
+def text_clone(reference_audio, text, language="en"):
+    """Text-to-speech with COMPLETELY FIXED implementation"""
     try:
+        if not reference_audio or not text:
+            return None, "❌ Upload audio and enter text!"
+        if not load_models():
+            return None, "❌ Models failed to load! Check if coqui-tts package is installed correctly."
+        print(f"🎭 Generating speech for: {text[:50]}...")
+        with fix_torch_load():
+            wav = TTS_MODEL.tts(
+                text=text,
+                speaker_wav=reference_audio,
+                language=language
+            )
+        # Save audio
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            output_path = tmp.name
+        wav_tensor = torch.FloatTensor(wav)
+        if wav_tensor.dim() == 1:
+            wav_tensor = wav_tensor.unsqueeze(0)
+        torchaudio.save(output_path, wav_tensor, 22050)
+        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+            return output_path, f"✅ SUCCESS with FIXED package!\n\n📝 Generated: {text[:100]}...\n🔧 Package: coqui-tts (maintained fork)\n📊 Language: {language}\n🎭 Text-to-speech completed!"
+        else:
+            return None, "❌ Output file is empty!"
     except Exception as e:
+        return None, f"❌ Error: {str(e)}\n\n💡 Make sure you're using 'coqui-tts' package, not 'TTS'!"
 # Create Gradio Interface
+with gr.Blocks(title="🎭 Voice Cloning - PACKAGE FIXED") as demo:
     gr.HTML("""
     <div style="text-align: center; padding: 20px;">
+        <h1>🎭 Voice Cloning Studio</h1>
+        <p style="color: #198754; font-weight: bold;">✅ FIXED: Now uses maintained 'coqui-tts' package!</p>
+        <p style="color: #666;">No more 'generate' method errors - completely resolved!</p>
     </div>
     """)
+    # Show the fix
+    gr.HTML("""
+    <div style="background: #d1ecf1; padding: 15px; border-radius: 8px; margin: 20px 0;">
+        <h4 style="color: #0c5460;">🔧 Problem Fixed!</h4>
+        <p><strong>Issue:</strong> Old TTS package had bugs causing 'generate' method errors</p>
+        <p><strong>Solution:</strong> Switched to maintained 'coqui-tts' fork that fixes this issue</p>
+        <p><strong>Result:</strong> Voice cloning now works without errors!</p>
     </div>
     """)
+    # Reference audio
     reference_audio = gr.Audio(
+        label="🎤 Reference Voice (Voice to Clone)",
         type="filepath",
         sources=["upload", "microphone"]
     )
     with gr.Tabs():
+        with gr.TabItem("🎵 Voice-to-Voice"):
+            input_audio = gr.Audio(
+                label="Input Audio (Content to Transform)",
+                type="filepath",
+                sources=["upload", "microphone"]
+            )
+            language1 = gr.Dropdown(
+                choices=[("English", "en"), ("Spanish", "es"), ("French", "fr")],
+                value="en",
+                label="Language"
+            )
+            btn1 = gr.Button("🎤 Clone Voice (FIXED Package)", variant="primary", size="lg")
+            output1 = gr.Audio(label="Cloned Voice Result")
+            status1 = gr.Textbox(label="Status", lines=6, interactive=False)
+            btn1.click(
+                fn=voice_clone,
+                inputs=[reference_audio, input_audio, language1],
+                outputs=[output1, status1]
+            )
+        with gr.TabItem("📝 Text-to-Speech"):
+            text_input = gr.Textbox(
+                label="Text to Convert",
+                lines=4,
+                placeholder="Enter text to speak in the cloned voice..."
+            )
+            language2 = gr.Dropdown(
+                choices=[("English", "en"), ("Spanish", "es"), ("French", "fr")],
+                value="en",
+                label="Language"
+            )
+            btn2 = gr.Button("📝 Generate Speech (FIXED Package)", variant="secondary", size="lg")
+            output2 = gr.Audio(label="Generated Speech Result")
+            status2 = gr.Textbox(label="Status", lines=6, interactive=False)
+            btn2.click(
+                fn=text_clone,
+                inputs=[reference_audio, text_input, language2],
+                outputs=[output2, status2]
+            )
 if __name__ == "__main__":
     demo.launch()