Spaces: crackuser/voiceclone-dev

Running

App Files Community

crackuser committed on Sep 10, 2025

Commit

9a34a5d

verified ·

1 Parent(s): 2eaf615

Update app.py

Browse files

Files changed (1) hide show

app.py +196 -302

app.py CHANGED Viewed

@@ -1,91 +1,85 @@
 import gradio as gr
 import torch
-import torchaudio as ta
 import tempfile
 import os
-from chatterbox.tts import ChatterboxTTS
-from chatterbox.mtl_tts import ChatterboxMultilingualTTS
-# Initialize Chatterbox models (the ones we actually discussed!)
-print("🔄 Loading Chatterbox TTS models...")
-device = "cuda" if torch.cuda.is_available() else "cpu"
-try:
-    # Load Chatterbox English model
-    english_model = ChatterboxTTS.from_pretrained(device=device)
-    print("✅ Chatterbox English model loaded!")
-    # Load Chatterbox Multilingual model
-    multilingual_model = ChatterboxMultilingualTTS.from_pretrained(device=device)
-    print("✅ Chatterbox Multilingual model loaded!")
-    models_loaded = True
-except Exception as e:
-    print(f"❌ Error loading Chatterbox models: {e}")
-    english_model = None
-    multilingual_model = None
-    models_loaded = False
-def chatterbox_voice_clone(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
-    """
-    Real Voice-to-Voice cloning using Chatterbox (the model we discussed!)
-    """
-    try:
-        if not reference_audio or not input_audio:
-            return None, "❌ Please upload both reference and input audio files!"
-        if not models_loaded:
-            return None, "❌ Chatterbox models not loaded!"
-        # Extract text from input audio using Whisper
-        import whisper
         try:
-            whisper_model = whisper.load_model("base")
-            result = whisper_model.transcribe(input_audio)
-            input_text = result["text"]
-            print(f"📝 Extracted text: {input_text}")
-        except Exception as e:
-            input_text = "Voice cloning demonstration using Chatterbox AI technology."
-            print(f"⚠️ Whisper failed, using default text: {e}")
-        # Create output file
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-            output_path = tmp_file.name
-        # Use appropriate Chatterbox model based on language
-        if language == "en":
-            # Use English Chatterbox model
-            wav = english_model.generate(
-                input_text,
-                audio_prompt_path=reference_audio,
-                exaggeration=exaggeration,
-                cfg=cfg
-            )
-        else:
-            # Use Multilingual Chatterbox model
-            wav = multilingual_model.generate(
-                input_text,
-                audio_prompt_path=reference_audio,
-                language_id=language,
-                exaggeration=exaggeration,
-                cfg=cfg
-            )
-        # Save generated audio
-        ta.save(output_path, wav, english_model.sr if language == "en" else multilingual_model.sr)
-        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-            return output_path, f"✅ Chatterbox Voice Cloning Complete!\n🎵 Generated: '{input_text[:100]}...'\n🎛️ Settings: Exaggeration={exaggeration}, CFG={cfg}"
-        else:
-            return None, "❌ Failed to generate cloned audio!"
-    except Exception as e:
-        return None, f"❌ Chatterbox Error: {str(e)}"
-def chatterbox_text_to_speech(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5, speed=1.0):
-    """
-    Real Text-to-Speech with voice cloning using Chatterbox
-    """
     try:
         if not reference_audio:
             return None, "❌ Please upload reference audio!"
@@ -93,255 +87,155 @@ def chatterbox_text_to_speech(reference_audio, input_text, language="en", exagge
         if not input_text or not input_text.strip():
             return None, "❌ Please enter text to convert!"
-        if not models_loaded:
-            return None, "❌ Chatterbox models not loaded!"
         print(f"🎤 Generating speech with Chatterbox...")
-        print(f"📝 Text: {input_text}")
-        print(f"🗣️ Language: {language}")
-        print(f"🎛️ Exaggeration: {exaggeration}, CFG: {cfg}")
         # Create output file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
-        # Use appropriate Chatterbox model
-        if language == "en":
-            # English Chatterbox model
-            wav = english_model.generate(
-                input_text,
-                audio_prompt_path=reference_audio,
-                exaggeration=exaggeration,
-                cfg=cfg
-            )
-        else:
-            # Multilingual Chatterbox model
-            wav = multilingual_model.generate(
                 input_text,
                 audio_prompt_path=reference_audio,
-                language_id=language,
-                exaggeration=exaggeration,
-                cfg=cfg
             )
-        # Save generated audio
-        ta.save(output_path, wav, english_model.sr if language == "en" else multilingual_model.sr)
-        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-            return output_path, f"✅ Chatterbox TTS Complete!\n📝 Generated: '{input_text[:100]}...'\n🎛️ Settings: Exaggeration={exaggeration}, CFG={cfg}"
-        else:
-            return None, "❌ Failed to generate speech!"
     except Exception as e:
-        return None, f"❌ Chatterbox Error: {str(e)}"
 # Create Gradio interface
-def create_chatterbox_interface():
-    with gr.Blocks(
-        title="🎭 Chatterbox Voice Cloning Studio",
-        theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
-    ) as demo:
-        # Header
-        gr.HTML("""
-        <div style="text-align: center; padding: 20px;">
-            <h1 style="color: #8B5CF6; margin-bottom: 10px;">🎭 Chatterbox Voice Cloning Studio</h1>
-            <p style="color: #666; font-size: 18px;">Powered by Resemble AI's Chatterbox - The Model We Discussed!</p>
-            <p style="color: #888; font-size: 14px;">✨ Emotion control • 23+ languages • Zero-shot cloning • MIT licensed</p>
-        </div>
-        """)
-        # Model Status
-        gr.HTML(f"""
-        <div style="text-align: center; padding: 10px; background: {'#d4edda' if models_loaded else '#f8d7da'}; border-radius: 10px; margin-bottom: 20px;">
-            <strong>🤖 Chatterbox Status:</strong> {'✅ Models Loaded Successfully!' if models_loaded else '❌ Models Not Loaded'}
-        </div>
-        """)
-        with gr.Row():
-            with gr.Column(scale=1):
-                # Reference Voice Section
-                gr.HTML("<h3 style='color: #8B5CF6;'>🎤 Reference Voice (5+ seconds)</h3>")
-                reference_audio = gr.Audio(
-                    label="Upload Reference Audio",
-                    type="filepath",
-                    sources=["upload", "microphone"]
-                )
-                gr.HTML("<p style='color: #666; font-size: 14px;'>📌 Upload clear speech from the voice you want to clone</p>")
-        with gr.Row():
-            with gr.Column(scale=1):
-                # Voice-to-Voice Cloning
-                gr.HTML("<h3 style='color: #8B5CF6;'>🎵 Voice-to-Voice Cloning</h3>")
-                input_audio = gr.Audio(
-                    label="Input Audio to Transform",
-                    type="filepath",
-                    sources=["upload", "microphone"]
-                )
-                with gr.Row():
-                    voice_language = gr.Dropdown(
-                        choices=[
-                            ("🇺🇸 English", "en"),
-                            ("🇪🇸 Spanish", "es"),
-                            ("🇫🇷 French", "fr"),
-                            ("🇩🇪 German", "de"),
-                            ("🇮🇹 Italian", "it"),
-                            ("🇧🇷 Portuguese", "pt"),
-                            ("🇨🇳 Chinese", "zh"),
-                            ("🇯🇵 Japanese", "ja"),
-                            ("🇰🇷 Korean", "ko"),
-                            ("🇷🇺 Russian", "ru"),
-                            ("🇸🇦 Arabic", "ar"),
-                            ("🇮🇳 Hindi", "hi"),
-                            ("🇳🇱 Dutch", "nl"),
-                            ("🇵🇱 Polish", "pl"),
-                            ("🇹🇷 Turkish", "tr"),
-                            ("🇸🇪 Swedish", "sv"),
-                            ("🇫🇮 Finnish", "fi"),
-                            ("🇩🇰 Danish", "da"),
-                            ("🇳🇴 Norwegian", "no"),
-                            ("🇬🇷 Greek", "el"),
-                            ("🇮🇱 Hebrew", "he"),
-                            ("🇲🇾 Malay", "ms"),
-                            ("🇰🇪 Swahili", "sw")
-                        ],
-                        value="en",
-                        label="Language"
-                    )
-                    voice_exaggeration = gr.Slider(
-                        minimum=0.0,
-                        maximum=1.0,
-                        step=0.1,
-                        value=0.5,
-                        label="🎭 Emotion Exaggeration"
-                    )
-                    voice_cfg = gr.Slider(
-                        minimum=0.0,
-                        maximum=1.0,
-                        step=0.1,
-                        value=0.5,
-                        label="🎛️ CFG Scale"
-                    )
-                voice_clone_btn = gr.Button(
-                    "🎤 Clone Voice with Chatterbox",
-                    variant="primary",
-                    size="lg"
-                )
-            with gr.Column(scale=1):
-                # Text-to-Speech
-                gr.HTML("<h3 style='color: #8B5CF6;'>📝 Text-to-Speech Cloning</h3>")
-                text_input = gr.Textbox(
-                    label="Text to Convert to Speech",
-                    placeholder="Enter text to speak in the cloned voice...",
-                    lines=4,
-                    max_lines=8
-                )
-                with gr.Row():
-                    text_language = gr.Dropdown(
-                        choices=[
-                            ("🇺🇸 English", "en"),
-                            ("🇪🇸 Spanish", "es"),
-                            ("🇫🇷 French", "fr"),
-                            ("🇩🇪 German", "de"),
-                            ("🇮🇹 Italian", "it"),
-                            ("🇧🇷 Portuguese", "pt"),
-                            ("🇨🇳 Chinese", "zh"),
-                            ("🇯🇵 Japanese", "ja")
-                        ],
-                        value="en",
-                        label="Language"
-                    )
-                    text_exaggeration = gr.Slider(
-                        minimum=0.0,
-                        maximum=1.0,
-                        step=0.1,
-                        value=0.5,
-                        label="🎭 Emotion Exaggeration"
-                    )
-                    text_cfg = gr.Slider(
-                        minimum=0.0,
-                        maximum=1.0,
-                        step=0.1,
-                        value=0.5,
-                        label="🎛️ CFG Scale"
-                    )
-                text_clone_btn = gr.Button(
-                    "📝 Generate Speech with Chatterbox",
-                    variant="secondary",
-                    size="lg"
-                )
-        # Output Section
-        gr.HTML("<h3 style='color: #8B5CF6;'>🎵 Chatterbox Generated Audio</h3>")
-        with gr.Row():
             audio_output = gr.Audio(
                 label="Cloned Voice Result",
                 type="filepath"
             )
             status_output = gr.Textbox(
-                label="Processing Status",
-                lines=5,
                 interactive=False
             )
-        # Chatterbox Features
-        with gr.Accordion("🌟 Chatterbox Features", open=False):
-            gr.Markdown("""
-            ### Why Chatterbox is Special
-            **🎭 Emotion Exaggeration Control**
-            - First open source model with emotion control
-            - Adjust from monotone (0.0) to highly expressive (1.0)
-            - Perfect for creative content, games, and dramatic speech
-            **🌍 Multilingual Support (23 Languages)**
-            - Arabic, Chinese, Danish, Dutch, English, Finnish, French
-            - German, Greek, Hebrew, Hindi, Italian, Japanese, Korean
-            - Malay, Norwegian, Polish, Portuguese, Russian, Spanish
-            - Swedish, Swahili, Turkish
-            **⚡ Technical Advantages**
-            - 0.5B parameter Llama backbone
-            - Zero-shot voice cloning with 5+ seconds of audio
-            - Built-in neural watermarking for responsible AI
-            - MIT licensed - free for commercial use
-            - Consistently outperforms ElevenLabs in evaluations
-            **🎛️ Control Parameters**
-            - **Exaggeration**: Controls emotional intensity (0.0 = monotone, 1.0 = very expressive)
-            - **CFG Scale**: Controls adherence to reference voice (lower = more creative, higher = more accurate)
-            """)
-        # Event Handlers
-        voice_clone_btn.click(
-            fn=chatterbox_voice_clone,
-            inputs=[reference_audio, input_audio, voice_language, voice_exaggeration, voice_cfg],
-            outputs=[audio_output, status_output],
-            show_progress=True
-        )
-        text_clone_btn.click(
-            fn=chatterbox_text_to_speech,
-            inputs=[reference_audio, text_input, text_language, text_exaggeration, text_cfg],
-            outputs=[audio_output, status_output],
-            show_progress=True
-        )
-    return demo
 if __name__ == "__main__":
-    demo = create_chatterbox_interface()
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,

 import gradio as gr
 import torch
+import torchaudio
 import tempfile
 import os
+import logging
+import traceback
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Device detection with proper fallback
+DEVICE = "cpu"
+if torch.cuda.is_available():
+    DEVICE = "cuda"
+    logger.info("🚀 Running on CUDA GPU")
+elif torch.backends.mps.is_available():
+    DEVICE = "cpu"  # Force CPU for MPS compatibility
+    logger.info("🍎 Apple Silicon detected - using CPU mode for Chatterbox-TTS compatibility")
+else:
+    logger.info("🚀 Running on CPU")
+print(f"🚀 Running on device: {DEVICE}")
+# Patch torch.load to handle device mapping issues
+original_torch_load = torch.load
+def patched_torch_load(f, map_location=None, **kwargs):
+    """Patched torch.load that automatically maps CUDA tensors to CPU/MPS"""
+    if map_location is None:
+        map_location = 'cpu'  # Default to CPU for compatibility
+    logger.info(f"🔧 Loading with map_location={map_location}")
+    return original_torch_load(f, map_location=map_location, **kwargs)
+# Apply the patch
+torch.load = patched_torch_load
+# Global model variable
+MODEL = None
+def get_or_load_model():
+    """Loads the ChatterboxTTS model with proper error handling"""
+    global MODEL
+    if MODEL is None:
+        print("🔄 Model not loaded, initializing...")
         try:
+            # Try different import paths for chatterbox
+            try:
+                from chatterbox import ChatterboxTTS
+                MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
+                print("✅ Loaded with 'from chatterbox import ChatterboxTTS'")
+            except ImportError:
+                try:
+                    from chatterbox.tts import ChatterboxTTS
+                    MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
+                    print("✅ Loaded with 'from chatterbox.tts import ChatterboxTTS'")
+                except ImportError:
+                    try:
+                        from chatterbox.src.chatterbox.tts import ChatterboxTTS
+                        MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
+                        print("✅ Loaded with 'from chatterbox.src.chatterbox.tts import ChatterboxTTS'")
+                    except ImportError as e:
+                        print(f"❌ All Chatterbox import paths failed: {e}")
+                        return None
+            # Ensure model is on correct device
+            if hasattr(MODEL, 'to') and str(getattr(MODEL, 'device', 'unknown')) != DEVICE:
+                MODEL = MODEL.to(DEVICE)
+            print(f"✅ Model loaded successfully on device: {getattr(MODEL, 'device', 'N/A')}")
+            return MODEL
+        except Exception as e:
+            print(f"❌ Error loading Chatterbox model: {e}")
+            print(f"🔍 Full traceback: {traceback.format_exc()}")
+            return None
+    return MODEL
+def simple_voice_clone(reference_audio, input_text):
+    """Simplified voice cloning function with better error handling"""
     try:
         if not reference_audio:
             return None, "❌ Please upload reference audio!"
         if not input_text or not input_text.strip():
             return None, "❌ Please enter text to convert!"
+        # Try to load model
+        model = get_or_load_model()
+        if model is None:
+            return None, "❌ Chatterbox model failed to load! Check logs for details."
         print(f"🎤 Generating speech with Chatterbox...")
+        print(f"📝 Text: {input_text[:100]}...")
         # Create output file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
+        # Generate speech using Chatterbox
+        try:
+            wav = model.generate(
                 input_text,
                 audio_prompt_path=reference_audio,
+                exaggeration=0.5,
+                cfg=0.5
             )
+            # Save generated audio
+            torchaudio.save(output_path, wav.cpu(), model.sr)
+            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+                return output_path, f"✅ Chatterbox Voice Cloning Complete!\n📝 Generated: '{input_text[:100]}...'"
+            else:
+                return None, "❌ Generated audio file is empty!"
+        except Exception as gen_error:
+            print(f"❌ Generation error: {gen_error}")
+            return None, f"❌ Generation failed: {str(gen_error)}"
     except Exception as e:
+        print(f"❌ Voice cloning error: {e}")
+        return None, f"❌ Error: {str(e)}"
+# Attempt to load model at startup with better error reporting
+try:
+    startup_model = get_or_load_model()
+    if startup_model is not None:
+        models_loaded = True
+        startup_message = "✅ Chatterbox Models Loaded Successfully!"
+    else:
+        models_loaded = False
+        startup_message = "❌ Failed to Load Chatterbox Models - Check Dependencies"
+except Exception as startup_error:
+    models_loaded = False
+    startup_message = f"❌ Startup Error: {str(startup_error)}"
+    print(f"CRITICAL: {startup_message}")
 # Create Gradio interface
+with gr.Blocks(
+    title="🎭 Chatterbox Voice Cloning Studio",
+    theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
+) as demo:
+    # Header
+    gr.HTML("""
+    <div style="text-align: center; padding: 20px;">
+        <h1 style="color: #8B5CF6; margin-bottom: 10px;">🎭 Chatterbox Voice Cloning Studio</h1>
+        <p style="color: #666; font-size: 18px;">Powered by Resemble AI's Chatterbox Model</p>
+        <p style="color: #888; font-size: 14px;">Fixed version with proper device handling</p>
+    </div>
+    """)
+    # Model Status Display
+    status_color = "#d4edda" if models_loaded else "#f8d7da"
+    gr.HTML(f"""
+    <div style="text-align: center; padding: 15px; background: {status_color}; border-radius: 10px; margin-bottom: 20px;">
+        <strong>🤖 Chatterbox Status:</strong> {startup_message}
+    </div>
+    """)
+    with gr.Row():
+        with gr.Column():
+            # Reference Voice
+            gr.HTML("<h3 style='color: #8B5CF6;'>🎤 Reference Voice</h3>")
+            reference_audio = gr.Audio(
+                label="Upload Reference Audio (5+ seconds)",
+                type="filepath",
+                sources=["upload", "microphone"]
+            )
+            # Text Input
+            gr.HTML("<h3 style='color: #8B5CF6;'>📝 Text to Convert</h3>")
+            text_input = gr.Textbox(
+                label="Enter Text",
+                placeholder="Enter the text you want to speak in the cloned voice...",
+                lines=4,
+                max_lines=8
+            )
+            # Generate Button
+            generate_btn = gr.Button(
+                "🎤 Generate Voice Clone",
+                variant="primary",
+                size="lg"
+            )
+        with gr.Column():
+            # Output
+            gr.HTML("<h3 style='color: #8B5CF6;'>🎵 Generated Audio</h3>")
             audio_output = gr.Audio(
                 label="Cloned Voice Result",
                 type="filepath"
             )
             status_output = gr.Textbox(
+                label="Status & Logs",
+                lines=6,
                 interactive=False
             )
+    # Troubleshooting Info
+    with gr.Accordion("🔧 Troubleshooting", open=False):
+        gr.Markdown("""
+        ### Common Issues & Solutions
+        **❌ "Models Not Loaded" Error:**
+        - Check that `chatterbox-tts` is installed: `pip install chatterbox-tts`
+        - Verify internet connection for model download
+        - Try restarting the space if models fail to load
+        **🔧 Device Issues:**
+        - This version forces CPU mode for compatibility
+        - CUDA tensors are automatically mapped to CPU
+        - Apple Silicon (MPS) falls back to CPU mode
+        **📦 Dependencies:**
+        - Ensure all requirements are installed correctly
+        - Check logs for specific import errors
+        - Model downloads may take several minutes on first run
+        **🎤 Audio Issues:**
+        - Use clear, high-quality reference audio (5+ seconds)
+        - Supported formats: WAV, MP3, FLAC, M4A
+        - Avoid background noise in reference audio
+        """)
+    # Event handler
+    generate_btn.click(
+        fn=simple_voice_clone,
+        inputs=[reference_audio, text_input],
+        outputs=[audio_output, status_output],
+        show_progress=True
+    )
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,