Spaces:

crackuser
/

voiceclone-dev

Sleeping

App Files Files Community

crackuser commited on Sep 10, 2025

Commit

825c475

verified ·

1 Parent(s): 82bac76

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -66

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import tempfile
 import os
 import warnings
 from contextlib import contextmanager
 warnings.filterwarnings("ignore")
@@ -27,17 +28,8 @@ def patch_torch_load():
     finally:
         torch.load = original_load
-# Device setup with safety
-def get_device():
-    if torch.cuda.is_available():
-        try:
-            torch.cuda.init()
-            return "cuda"
-        except:
-            return "cpu"
-    return "cpu"
-DEVICE = get_device()
 print(f"🚀 Using device: {DEVICE}")
 # Global variables
@@ -46,7 +38,7 @@ WHISPER_MODEL = None
 MODEL_STATUS = "Not Loaded"
 def load_models():
-    """Load models with comprehensive error handling"""
     global TTS_MODEL, WHISPER_MODEL, MODEL_STATUS
     print("🔄 Loading models...")
@@ -58,7 +50,6 @@ def load_models():
                 from TTS.api import TTS
                 print("📦 Loading XTTS-v2...")
-                # CORRECT model name
                 TTS_MODEL = TTS(
                     model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                     progress_bar=True,
@@ -68,6 +59,12 @@ def load_models():
                 MODEL_STATUS = "XTTS-v2 Ready"
                 print("✅ XTTS-v2 loaded successfully!")
         except Exception as e:
             print(f"❌ XTTS-v2 loading failed: {e}")
             MODEL_STATUS = f"XTTS Load Failed: {str(e)}"
@@ -86,7 +83,9 @@ def load_models():
     return TTS_MODEL is not None
 def voice_to_voice_clone(reference_audio, input_audio, language="en"):
-    """Voice-to-voice cloning with robust error handling"""
     try:
         # Input validation
         if not reference_audio:
@@ -118,39 +117,73 @@ def voice_to_voice_clone(reference_audio, input_audio, language="en"):
             except Exception as e:
                 print(f"⚠️ Whisper transcription failed: {e}")
-        # Generate speech with reference voice
-        print("🎭 Generating speech with cloned voice...")
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-            output_path = tmp_file.name
         try:
-            # Use XTTS API with error handling
             with patch_torch_load():
-                TTS_MODEL.tts_to_file(
                     text=extracted_text,
                     speaker_wav=reference_audio,
-                    language=language,
-                    file_path=output_path
                 )
-            # Verify output
-            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-                return output_path, f"✅ Voice-to-Voice Complete!\n\n🎤 Content: '{extracted_text[:150]}...'\n🎭 Applied reference voice\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}"
-            else:
-                return None, "❌ Generated audio file is empty!"
         except Exception as gen_error:
-            # Clean up file on error
-            if os.path.exists(output_path):
-                os.unlink(output_path)
-            return None, f"❌ Generation failed: {str(gen_error)}"
     except Exception as e:
         return None, f"❌ Voice-to-Voice Error: {str(e)}"
 def text_to_voice_clone(reference_audio, input_text, language="en"):
-    """Text-to-voice cloning with robust error handling"""
     try:
         # Input validation
         if not reference_audio:
@@ -165,37 +198,68 @@ def text_to_voice_clone(reference_audio, input_text, language="en"):
         if not load_models():
             return None, f"❌ Model loading failed!\nStatus: {MODEL_STATUS}"
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-            output_path = tmp_file.name
         try:
-            print(f"🎭 Generating speech: '{input_text[:100]}...'")
-            # Generate speech
             with patch_torch_load():
-                TTS_MODEL.tts_to_file(
                     text=input_text,
                     speaker_wav=reference_audio,
-                    language=language,
-                    file_path=output_path
                 )
-            # Verify output
-            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-                return output_path, f"✅ Text-to-Voice Complete!\n\n📝 Generated: '{input_text[:150]}...'\n🎭 Using reference voice\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}"
-            else:
-                return None, "❌ Generated audio file is empty!"
         except Exception as gen_error:
-            # Clean up file on error
-            if os.path.exists(output_path):
-                os.unlink(output_path)
-            return None, f"❌ Generation failed: {str(gen_error)}"
     except Exception as e:
         return None, f"❌ Text-to-Voice Error: {str(e)}"
-# Initialize at startup with error handling
 print("🔄 Initializing models at startup...")
 try:
     startup_success = load_models()
@@ -203,7 +267,7 @@ try:
         startup_msg = f"✅ {MODEL_STATUS}!"
         startup_color = "#d4edda"
     else:
-        startup_msg = f"⚠️ Models will load on first use - Status: {MODEL_STATUS}"
         startup_color = "#fff3cd"
 except Exception as e:
     startup_success = False
@@ -214,7 +278,7 @@ print(f"Startup status: {startup_msg}")
 # Create Gradio Interface
 with gr.Blocks(
-    title="🎭 Voice Cloning Studio",
     theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
 ) as demo:
@@ -222,7 +286,7 @@ with gr.Blocks(
     <div style="text-align: center; padding: 20px;">
         <h1 style="color: #2E86AB;">🎭 Voice Cloning Studio</h1>
         <p style="color: #666; font-size: 18px;">Real Voice-to-Voice & Text-to-Speech Cloning</p>
-        <p style="color: #888; font-size: 14px;">Production Ready - Error-Free Implementation</p>
     </div>
     """)
@@ -247,12 +311,12 @@ with gr.Blocks(
         with gr.TabItem("🎵 Voice-to-Voice Cloning"):
             gr.HTML("""
             <div style="padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 15px;">
-                <h4 style="color: #1e40af;">🎤 How it works:</h4>
-                <ol style="margin: 5px 0; padding-left: 20px;">
-                    <li>Upload reference voice (person to clone)</li>
-                    <li>Upload input audio (content to transform)</li>
-                    <li>AI extracts text and applies reference voice</li>
-                </ol>
             </div>
             """)
@@ -276,7 +340,7 @@ with gr.Blocks(
                     )
                     voice_btn = gr.Button(
-                        "🎤 Clone Voice",
                         variant="primary",
                         size="lg"
                     )
@@ -285,7 +349,7 @@ with gr.Blocks(
                     voice_output = gr.Audio(label="Cloned Voice Result")
                     voice_status = gr.Textbox(
                         label="Status",
-                        lines=6,
                         interactive=False
                     )
@@ -311,7 +375,7 @@ with gr.Blocks(
                     )
                     text_btn = gr.Button(
-                        "📝 Generate Speech",
                         variant="secondary",
                         size="lg"
                     )
@@ -320,10 +384,39 @@ with gr.Blocks(
                     text_output = gr.Audio(label="Generated Speech")
                     text_status = gr.Textbox(
                         label="Status",
-                        lines=6,
                         interactive=False
                     )
     # Event handlers
     voice_btn.click(
         fn=voice_to_voice_clone,

 import os
 import warnings
 from contextlib import contextmanager
+import numpy as np
 warnings.filterwarnings("ignore")
     finally:
         torch.load = original_load
+# Device setup
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🚀 Using device: {DEVICE}")
 # Global variables
 MODEL_STATUS = "Not Loaded"
 def load_models():
+    """Load models with correct error handling"""
     global TTS_MODEL, WHISPER_MODEL, MODEL_STATUS
     print("🔄 Loading models...")
                 from TTS.api import TTS
                 print("📦 Loading XTTS-v2...")
                 TTS_MODEL = TTS(
                     model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                     progress_bar=True,
                 MODEL_STATUS = "XTTS-v2 Ready"
                 print("✅ XTTS-v2 loaded successfully!")
+                # CRITICAL: Verify the model has the correct methods
+                if hasattr(TTS_MODEL, 'tts') and hasattr(TTS_MODEL, 'tts_to_file'):
+                    print("✅ Verified: TTS model has correct API methods")
+                else:
+                    print("❌ Warning: TTS model missing expected methods")
         except Exception as e:
             print(f"❌ XTTS-v2 loading failed: {e}")
             MODEL_STATUS = f"XTTS Load Failed: {str(e)}"
     return TTS_MODEL is not None
 def voice_to_voice_clone(reference_audio, input_audio, language="en"):
+    """
+    CORRECTED: Uses tts() method instead of generate()
+    """
     try:
         # Input validation
         if not reference_audio:
             except Exception as e:
                 print(f"⚠️ Whisper transcription failed: {e}")
+        # CRITICAL FIX: Use tts() method, not generate()
+        print("🎭 Generating speech with CORRECT XTTS API...")
         try:
             with patch_torch_load():
+                # METHOD 1: Use tts() method that returns numpy array
+                wav_array = TTS_MODEL.tts(
                     text=extracted_text,
                     speaker_wav=reference_audio,
+                    language=language
                 )
+                print(f"✅ Generated audio array with shape: {np.array(wav_array).shape}")
+                # Convert numpy array to tensor and save
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                    output_path = tmp_file.name
+                # Convert to tensor and save
+                if isinstance(wav_array, np.ndarray):
+                    wav_tensor = torch.tensor(wav_array, dtype=torch.float32).unsqueeze(0)
+                else:
+                    wav_tensor = torch.tensor(wav_array, dtype=torch.float32)
+                    if wav_tensor.dim() == 1:
+                        wav_tensor = wav_tensor.unsqueeze(0)
+                # Save with correct sample rate
+                sample_rate = getattr(TTS_MODEL, 'synthesizer', {}).get('output_sample_rate', 24000) or 24000
+                torchaudio.save(output_path, wav_tensor, sample_rate)
+                # Verify output
+                if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+                    return output_path, f"✅ Voice-to-Voice Complete!\n\n🎤 Content: '{extracted_text[:150]}...'\n🎭 Applied reference voice\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}\n🔧 Used: tts() method (CORRECT API)"
+                else:
+                    return None, "❌ Generated audio file is empty!"
         except Exception as gen_error:
+            # Fallback: Try tts_to_file method
+            try:
+                print("🔄 Trying fallback method: tts_to_file()...")
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                    output_path = tmp_file.name
+                with patch_torch_load():
+                    TTS_MODEL.tts_to_file(
+                        text=extracted_text,
+                        speaker_wav=reference_audio,
+                        language=language,
+                        file_path=output_path
+                    )
+                if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+                    return output_path, f"✅ Voice-to-Voice Complete (Fallback)!\n\n🎤 Content: '{extracted_text[:150]}...'\n🎭 Applied reference voice\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}\n🔧 Used: tts_to_file() method"
+                else:
+                    return None, "❌ Generated audio file is empty!"
+            except Exception as fallback_error:
+                return None, f"❌ Generation failed:\nPrimary error: {str(gen_error)}\nFallback error: {str(fallback_error)}\n\nTip: The model doesn't have a 'generate()' method. Use 'tts()' or 'tts_to_file()' instead."
     except Exception as e:
         return None, f"❌ Voice-to-Voice Error: {str(e)}"
 def text_to_voice_clone(reference_audio, input_text, language="en"):
+    """
+    CORRECTED: Uses tts() method instead of generate()
+    """
     try:
         # Input validation
         if not reference_audio:
         if not load_models():
             return None, f"❌ Model loading failed!\nStatus: {MODEL_STATUS}"
+        print(f"🎭 Generating speech: '{input_text[:100]}...'")
         try:
             with patch_torch_load():
+                # METHOD 1: Use tts() method that returns numpy array
+                wav_array = TTS_MODEL.tts(
                     text=input_text,
                     speaker_wav=reference_audio,
+                    language=language
                 )
+                print(f"✅ Generated audio array with shape: {np.array(wav_array).shape}")
+                # Convert numpy array to tensor and save
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                    output_path = tmp_file.name
+                # Convert to tensor and save
+                if isinstance(wav_array, np.ndarray):
+                    wav_tensor = torch.tensor(wav_array, dtype=torch.float32).unsqueeze(0)
+                else:
+                    wav_tensor = torch.tensor(wav_array, dtype=torch.float32)
+                    if wav_tensor.dim() == 1:
+                        wav_tensor = wav_tensor.unsqueeze(0)
+                # Save with correct sample rate
+                sample_rate = getattr(TTS_MODEL, 'synthesizer', {}).get('output_sample_rate', 24000) or 24000
+                torchaudio.save(output_path, wav_tensor, sample_rate)
+                if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+                    return output_path, f"✅ Text-to-Voice Complete!\n\n📝 Generated: '{input_text[:150]}...'\n🎭 Using reference voice\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}\n🔧 Used: tts() method (CORRECT API)"
+                else:
+                    return None, "❌ Generated audio file is empty!"
         except Exception as gen_error:
+            # Fallback: Try tts_to_file method
+            try:
+                print("🔄 Trying fallback method: tts_to_file()...")
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                    output_path = tmp_file.name
+                with patch_torch_load():
+                    TTS_MODEL.tts_to_file(
+                        text=input_text,
+                        speaker_wav=reference_audio,
+                        language=language,
+                        file_path=output_path
+                    )
+                if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+                    return output_path, f"✅ Text-to-Voice Complete (Fallback)!\n\n📝 Generated: '{input_text[:150]}...'\n🎭 Using reference voice\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}\n🔧 Used: tts_to_file() method"
+                else:
+                    return None, "❌ Generated audio file is empty!"
+            except Exception as fallback_error:
+                return None, f"❌ Generation failed:\nPrimary error: {str(gen_error)}\nFallback error: {str(fallback_error)}\n\nTip: The model doesn't have a 'generate()' method. Use 'tts()' or 'tts_to_file()' instead."
     except Exception as e:
         return None, f"❌ Text-to-Voice Error: {str(e)}"
+# Initialize at startup
 print("🔄 Initializing models at startup...")
 try:
     startup_success = load_models()
         startup_msg = f"✅ {MODEL_STATUS}!"
         startup_color = "#d4edda"
     else:
+        startup_msg = f"⚠️ Models will load on first use - {MODEL_STATUS}"
         startup_color = "#fff3cd"
 except Exception as e:
     startup_success = False
 # Create Gradio Interface
 with gr.Blocks(
+    title="🎭 Voice Cloning Studio - API Fixed",
     theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
 ) as demo:
     <div style="text-align: center; padding: 20px;">
         <h1 style="color: #2E86AB;">🎭 Voice Cloning Studio</h1>
         <p style="color: #666; font-size: 18px;">Real Voice-to-Voice & Text-to-Speech Cloning</p>
+        <p style="color: #888; font-size: 14px;">Fixed: Uses tts() method instead of generate() - No More API Errors!</p>
     </div>
     """)
         with gr.TabItem("🎵 Voice-to-Voice Cloning"):
             gr.HTML("""
             <div style="padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 15px;">
+                <h4 style="color: #1e40af;">🎤 API Fixed - Now Uses Correct Methods:</h4>
+                <ul style="margin: 5px 0; padding-left: 20px;">
+                    <li>✅ Uses <code>model.tts()</code> method (correct)</li>
+                    <li>❌ No longer tries <code>model.generate()</code> (doesn't exist)</li>
+                    <li>🔄 Fallback to <code>model.tts_to_file()</code> if needed</li>
+                </ul>
             </div>
             """)
                     )
                     voice_btn = gr.Button(
+                        "🎤 Clone Voice (API Fixed)",
                         variant="primary",
                         size="lg"
                     )
                     voice_output = gr.Audio(label="Cloned Voice Result")
                     voice_status = gr.Textbox(
                         label="Status",
+                        lines=8,
                         interactive=False
                     )
                     )
                     text_btn = gr.Button(
+                        "📝 Generate Speech (API Fixed)",
                         variant="secondary",
                         size="lg"
                     )
                     text_output = gr.Audio(label="Generated Speech")
                     text_status = gr.Textbox(
                         label="Status",
+                        lines=8,
                         interactive=False
                     )
+    # Help section
+    with gr.Accordion("🔧 API Fix Explanation", open=False):
+        gr.Markdown("""
+        ### ✅ What Was Fixed
+        **The Problem:** Your code was trying to call `model.generate()` which doesn't exist on XTTS models.
+        **The Solution:**
+        - **Primary Method:** `model.tts()` - Returns numpy array that we convert and save
+        - **Fallback Method:** `model.tts_to_file()` - Saves directly to file
+        - **Removed:** All calls to `model.generate()` (doesn't exist)
+        ### 📋 XTTS API Reference
+        ```
+        # ✅ CORRECT - What we now use:
+        wav = model.tts(text=text, speaker_wav=reference_audio, language=language)
+        # ✅ ALTERNATIVE - Also works:
+        model.tts_to_file(text=text, speaker_wav=reference_audio, language=language, file_path=output)
+        # ❌ WRONG - What was causing the error:
+        model.generate()  # This method doesn't exist!
+        ```
+        ### 🚀 Expected Results
+        - **No More API Errors:** `'GPT2InferenceModel' object has no attribute 'generate'` is fixed
+        - **Working Voice Cloning:** Real audio transformation using correct XTTS methods
+        - **Robust Fallbacks:** If primary method fails, tries alternative approach
+        """)
     # Event handlers
     voice_btn.click(
         fn=voice_to_voice_clone,