Spaces:

crackuser
/

voiceclone-dev

Running

App Files Files Community

crackuser committed on Sep 10, 2025

Commit

5280410

verified ·

1 Parent(s): 30be8ed

Update app.py

Browse files

Files changed (1) hide show

app.py +283 -148

app.py CHANGED Viewed

@@ -4,82 +4,113 @@ import torchaudio
 import tempfile
 import os
 import logging
-import traceback
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Device detection with proper fallback
 DEVICE = "cpu"
 if torch.cuda.is_available():
     DEVICE = "cuda"
     logger.info("🚀 Running on CUDA GPU")
-elif torch.backends.mps.is_available():
-    DEVICE = "cpu"  # Force CPU for MPS compatibility
-    logger.info("🍎 Apple Silicon detected - using CPU mode for Chatterbox-TTS compatibility")
 else:
     logger.info("🚀 Running on CPU")
 print(f"🚀 Running on device: {DEVICE}")
-# Patch torch.load to handle device mapping issues
-original_torch_load = torch.load
-def patched_torch_load(f, map_location=None, **kwargs):
-    """Patched torch.load that automatically maps CUDA tensors to CPU/MPS"""
-    if map_location is None:
-        map_location = 'cpu'  # Default to CPU for compatibility
-    logger.info(f"🔧 Loading with map_location={map_location}")
-    return original_torch_load(f, map_location=map_location, **kwargs)
-# Apply the patch
-torch.load = patched_torch_load
-# Global model variable
-MODEL = None
-def get_or_load_model():
-    """Loads the ChatterboxTTS model with proper error handling"""
-    global MODEL
-    if MODEL is None:
-        print("🔄 Model not loaded, initializing...")
         try:
-            # Try different import paths for chatterbox
-            try:
-                from chatterbox import ChatterboxTTS
-                MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
-                print("✅ Loaded with 'from chatterbox import ChatterboxTTS'")
-            except ImportError:
-                try:
-                    from chatterbox.tts import ChatterboxTTS
-                    MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
-                    print("✅ Loaded with 'from chatterbox.tts import ChatterboxTTS'")
-                except ImportError:
-                    try:
-                        from chatterbox.src.chatterbox.tts import ChatterboxTTS
-                        MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
-                        print("✅ Loaded with 'from chatterbox.src.chatterbox.tts import ChatterboxTTS'")
-                    except ImportError as e:
-                        print(f"❌ All Chatterbox import paths failed: {e}")
-                        return None
-            # Ensure model is on correct device
-            if hasattr(MODEL, 'to') and str(getattr(MODEL, 'device', 'unknown')) != DEVICE:
-                MODEL = MODEL.to(DEVICE)
-            print(f"✅ Model loaded successfully on device: {getattr(MODEL, 'device', 'N/A')}")
-            return MODEL
         except Exception as e:
-            print(f"❌ Error loading Chatterbox model: {e}")
-            print(f"🔍 Full traceback: {traceback.format_exc()}")
-            return None
-    return MODEL
-def simple_voice_clone(reference_audio, input_text):
-    """Simplified voice cloning function with better error handling"""
     try:
         if not reference_audio:
             return None, "❌ Please upload reference audio!"
@@ -87,10 +118,8 @@ def simple_voice_clone(reference_audio, input_text):
         if not input_text or not input_text.strip():
             return None, "❌ Please enter text to convert!"
-        # Try to load model
-        model = get_or_load_model()
-        if model is None:
-            return None, "❌ Chatterbox model failed to load! Check logs for details."
         print(f"🎤 Generating speech with Chatterbox...")
         print(f"📝 Text: {input_text[:100]}...")
@@ -99,61 +128,60 @@ def simple_voice_clone(reference_audio, input_text):
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
-        # Generate speech using Chatterbox
-        try:
             wav = model.generate(
                 input_text,
                 audio_prompt_path=reference_audio,
-                exaggeration=0.5,
-                cfg=0.5
             )
-            # Save generated audio
-            torchaudio.save(output_path, wav.cpu(), model.sr)
-            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-                return output_path, f"✅ Chatterbox Voice Cloning Complete!\n📝 Generated: '{input_text[:100]}...'"
-            else:
-                return None, "❌ Generated audio file is empty!"
-        except Exception as gen_error:
-            print(f"❌ Generation error: {gen_error}")
-            return None, f"❌ Generation failed: {str(gen_error)}"
     except Exception as e:
-        print(f"❌ Voice cloning error: {e}")
-        return None, f"❌ Error: {str(e)}"
-# Attempt to load model at startup with better error reporting
 try:
-    startup_model = get_or_load_model()
-    if startup_model is not None:
-        models_loaded = True
-        startup_message = "✅ Chatterbox Models Loaded Successfully!"
-    else:
-        models_loaded = False
-        startup_message = "❌ Failed to Load Chatterbox Models - Check Dependencies"
-except Exception as startup_error:
     models_loaded = False
-    startup_message = f"❌ Startup Error: {str(startup_error)}"
-    print(f"CRITICAL: {startup_message}")
 # Create Gradio interface
 with gr.Blocks(
-    title="🎭 Chatterbox Voice Cloning Studio",
     theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
 ) as demo:
     # Header
     gr.HTML("""
     <div style="text-align: center; padding: 20px;">
-        <h1 style="color: #8B5CF6; margin-bottom: 10px;">🎭 Chatterbox Voice Cloning Studio</h1>
-        <p style="color: #666; font-size: 18px;">Powered by Resemble AI's Chatterbox Model</p>
-        <p style="color: #888; font-size: 14px;">Fixed version with proper device handling</p>
     </div>
     """)
-    # Model Status Display
     status_color = "#d4edda" if models_loaded else "#f8d7da"
     gr.HTML(f"""
     <div style="text-align: center; padding: 15px; background: {status_color}; border-radius: 10px; margin-bottom: 20px;">
@@ -163,74 +191,181 @@ with gr.Blocks(
     with gr.Row():
         with gr.Column():
-            # Reference Voice
-            gr.HTML("<h3 style='color: #8B5CF6;'>🎤 Reference Voice</h3>")
             reference_audio = gr.Audio(
                 label="Upload Reference Audio (5+ seconds)",
                 type="filepath",
                 sources=["upload", "microphone"]
             )
-            # Text Input
-            gr.HTML("<h3 style='color: #8B5CF6;'>📝 Text to Convert</h3>")
-            text_input = gr.Textbox(
-                label="Enter Text",
-                placeholder="Enter the text you want to speak in the cloned voice...",
-                lines=4,
-                max_lines=8
-            )
-            # Generate Button
-            generate_btn = gr.Button(
-                "🎤 Generate Voice Clone",
-                variant="primary",
-                size="lg"
-            )
-        with gr.Column():
-            # Output
-            gr.HTML("<h3 style='color: #8B5CF6;'>🎵 Generated Audio</h3>")
-            audio_output = gr.Audio(
-                label="Cloned Voice Result",
-                type="filepath"
-            )
-            status_output = gr.Textbox(
-                label="Status & Logs",
-                lines=6,
-                interactive=False
-            )
-    # Troubleshooting Info
-    with gr.Accordion("🔧 Troubleshooting", open=False):
         gr.Markdown("""
-        ### Common Issues & Solutions
-        **❌ "Models Not Loaded" Error:**
-        - Check that `chatterbox-tts` is installed: `pip install chatterbox-tts`
-        - Verify internet connection for model download
-        - Try restarting the space if models fail to load
-        **🔧 Device Issues:**
-        - This version forces CPU mode for compatibility
-        - CUDA tensors are automatically mapped to CPU
-        - Apple Silicon (MPS) falls back to CPU mode
-        **📦 Dependencies:**
-        - Ensure all requirements are installed correctly
-        - Check logs for specific import errors
-        - Model downloads may take several minutes on first run
-        **🎤 Audio Issues:**
-        - Use clear, high-quality reference audio (5+ seconds)
-        - Supported formats: WAV, MP3, FLAC, M4A
-        - Avoid background noise in reference audio
         """)
-    # Event handler
-    generate_btn.click(
-        fn=simple_voice_clone,
-        inputs=[reference_audio, text_input],
         outputs=[audio_output, status_output],
         show_progress=True
     )

 import tempfile
 import os
 import logging
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Device detection
+# Default to CPU and switch to CUDA only when torch reports it as available.
 DEVICE = "cpu"
 if torch.cuda.is_available():
     DEVICE = "cuda"
     logger.info("🚀 Running on CUDA GPU")
 else:
     logger.info("🚀 Running on CPU")
 print(f"🚀 Running on device: {DEVICE}")
+# Global models
+# Both start as None and are assigned by load_chatterbox_models().
+ENGLISH_MODEL = None
+MULTILINGUAL_MODEL = None
+def load_chatterbox_models():
+    """Load the English and multilingual Chatterbox models into module globals.
+
+    Each model is loaded at most once: a model already held in
+    ``ENGLISH_MODEL`` / ``MULTILINGUAL_MODEL`` is left untouched, so a retry
+    after a partial failure only loads the model that is still missing.
+
+    Returns:
+        True when both models are available, False when importing or loading
+        raised (the error is printed and logged with its traceback).
+    """
+    global ENGLISH_MODEL, MULTILINGUAL_MODEL
+    if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
         try:
+            # Imported inside the try so an ImportError is handled like any
+            # other load failure and reported through the return value.
+            from chatterbox.tts import ChatterboxTTS
+            from chatterbox.mtl_tts import ChatterboxMultilingualTTS
+            if ENGLISH_MODEL is None:
+                print("🔄 Loading Chatterbox English model...")
+                ENGLISH_MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
+                print("✅ English model loaded!")
+            if MULTILINGUAL_MODEL is None:
+                print("🔄 Loading Chatterbox Multilingual model...")
+                MULTILINGUAL_MODEL = ChatterboxMultilingualTTS.from_pretrained(device=DEVICE)
+                print("✅ Multilingual model loaded!")
+            return True
         except Exception as e:
+            print(f"❌ Error loading Chatterbox models: {e}")
+            # Keep the traceback in the logs; the printed line only has the message.
+            logger.exception("Chatterbox model loading failed")
+            return False
+    return True
+def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
+    """
+    Voice-to-Voice Cloning: Transform input audio using reference voice
+
+    The input audio is transcribed with Whisper and the transcript is then
+    synthesised by Chatterbox with the reference audio as the voice prompt.
+
+    Args:
+        reference_audio: Path of the audio file whose voice is cloned.
+        input_audio: Path of the audio file whose spoken content is reused.
+        language: "en" selects the English model; any other value selects the
+            multilingual model and is passed to it as ``language_id``.
+        exaggeration: Forwarded to ``generate`` as ``exaggeration``.
+        cfg: Forwarded to ``generate`` as ``cfg_weight``.
+
+    Returns:
+        ``(output_path, status_message)`` on success, or
+        ``(None, error_message)`` when validation, loading or generation fails.
+    """
+    try:
+        if not reference_audio:
+            return None, "❌ Please upload reference audio (voice to clone)!"
+        if not input_audio:
+            return None, "❌ Please upload input audio (content to transform)!"
+        if not load_chatterbox_models():
+            return None, "❌ Chatterbox models failed to load!"
+        # Extract text from input audio using Whisper (for content)
+        # Best effort: if Whisper is missing or fails, a fixed demo sentence
+        # is synthesised instead of aborting the request.
+        try:
+            import whisper
+            whisper_model = whisper.load_model("base")
+            result = whisper_model.transcribe(input_audio)
+            extracted_text = result["text"]
+            print(f"📝 Extracted text from input audio: {extracted_text}")
+        except Exception as e:
+            print(f"⚠️ Whisper transcription failed: {e}")
+            extracted_text = "Voice cloning demonstration using the uploaded audio content."
+        # Create output file
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            output_path = tmp_file.name
+        # Use appropriate model based on language
+        # Chatterbox's generate() names the guidance argument ``cfg_weight``;
+        # passing ``cfg=`` raises TypeError (unexpected keyword argument).
+        if language == "en":
+            model = ENGLISH_MODEL
+            wav = model.generate(
+                extracted_text,
+                audio_prompt_path=reference_audio,
+                exaggeration=exaggeration,
+                cfg_weight=cfg
+            )
+        else:
+            model = MULTILINGUAL_MODEL
+            wav = model.generate(
+                extracted_text,
+                audio_prompt_path=reference_audio,
+                language_id=language,
+                exaggeration=exaggeration,
+                cfg_weight=cfg
+            )
+        # Save generated audio
+        torchaudio.save(output_path, wav.cpu(), model.sr)
+        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+            return output_path, f"✅ Voice-to-Voice Cloning Complete!\n🎤 Reference voice applied to: '{extracted_text[:100]}...'\n🎛️ Settings: Exaggeration={exaggeration}, CFG={cfg}"
+        else:
+            return None, "❌ Generated audio file is empty!"
+    except Exception as e:
+        # The UI only shows the message; keep the traceback in the logs.
+        logger.exception("Voice-to-voice cloning failed")
+        return None, f"❌ Voice-to-Voice cloning error: {str(e)}"
+def text_to_voice_cloning(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5, speed=1.0):
+    """
+    Text-to-Voice Cloning: Generate speech from text using reference voice
+
+    Args:
+        reference_audio: Path of the audio file whose voice is cloned.
+        input_text: Text to synthesise; empty or whitespace-only text is rejected.
+        language: "en" selects the English model; any other value selects the
+            multilingual model and is passed to it as ``language_id``.
+        exaggeration: Forwarded to ``generate`` as ``exaggeration``.
+        cfg: Forwarded to ``generate`` as ``cfg_weight``.
+        speed: Currently unused; kept so existing callers keep working.
+
+    Returns:
+        ``(output_path, status_message)`` on success, or
+        ``(None, error_message)`` when validation, loading or generation fails.
+    """
     try:
         if not reference_audio:
             return None, "❌ Please upload reference audio!"
         if not input_text or not input_text.strip():
             return None, "❌ Please enter text to convert!"
+        if not load_chatterbox_models():
+            return None, "❌ Chatterbox models failed to load!"
         print(f"🎤 Generating speech with Chatterbox...")
         print(f"📝 Text: {input_text[:100]}...")
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
+        # Use appropriate model based on language
+        # Chatterbox's generate() names the guidance argument ``cfg_weight``;
+        # passing ``cfg=`` raises TypeError (unexpected keyword argument).
+        if language == "en":
+            model = ENGLISH_MODEL
             wav = model.generate(
                 input_text,
                 audio_prompt_path=reference_audio,
+                exaggeration=exaggeration,
+                cfg_weight=cfg
             )
+        else:
+            model = MULTILINGUAL_MODEL
+            wav = model.generate(
+                input_text,
+                audio_prompt_path=reference_audio,
+                language_id=language,
+                exaggeration=exaggeration,
+                cfg_weight=cfg
+            )
+        # Save generated audio
+        torchaudio.save(output_path, wav.cpu(), model.sr)
+        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+            return output_path, f"✅ Text-to-Voice Cloning Complete!\n📝 Generated: '{input_text[:100]}...'\n🎛️ Settings: Exaggeration={exaggeration}, CFG={cfg}"
+        else:
+            return None, "❌ Generated audio file is empty!"
     except Exception as e:
+        # The UI only shows the message; keep the traceback in the logs.
+        logger.exception("Text-to-voice cloning failed")
+        return None, f"❌ Text-to-Voice cloning error: {str(e)}"
+# Try to load models at startup
+# models_loaded selects the status banner colour when the UI is built below;
+# startup_message is presumably the banner text (its use is not visible here).
 try:
+    models_loaded = load_chatterbox_models()
+    startup_message = "✅ Chatterbox Models Loaded Successfully!" if models_loaded else "❌ Failed to Load Chatterbox Models"
+except Exception as e:
     models_loaded = False
+    startup_message = f"❌ Startup Error: {str(e)}"
 # Create Gradio interface
 with gr.Blocks(
+    title="🎭 Complete Chatterbox Voice Cloning Studio",
     theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
 ) as demo:
     # Header
     gr.HTML("""
     <div style="text-align: center; padding: 20px;">
+        <h1 style="color: #8B5CF6; margin-bottom: 10px;">🎭 Complete Chatterbox Voice Cloning Studio</h1>
+        <p style="color: #666; font-size: 18px;">Voice-to-Voice & Text-to-Speech with Emotion Control</p>
+        <p style="color: #888; font-size: 14px;">Powered by Resemble AI's Chatterbox - The Model We Discussed!</p>
     </div>
     """)
+    # Model Status
     status_color = "#d4edda" if models_loaded else "#f8d7da"
     gr.HTML(f"""
     <div style="text-align: center; padding: 15px; background: {status_color}; border-radius: 10px; margin-bottom: 20px;">
     with gr.Row():
         with gr.Column():
+            # Reference Voice Section
+            gr.HTML("<h3 style='color: #8B5CF6;'>🎤 Reference Voice (Voice to Clone)</h3>")
             reference_audio = gr.Audio(
                 label="Upload Reference Audio (5+ seconds)",
                 type="filepath",
                 sources=["upload", "microphone"]
             )
+            gr.HTML("<p style='color: #666; font-size: 14px;'>📌 This is the voice that will be cloned and applied to your content</p>")
+    # Tabs for different input methods
+    with gr.Tabs():
+        # Tab 1: Voice-to-Voice Cloning
+        with gr.TabItem("🎵 Voice-to-Voice Cloning"):
+            gr.HTML("<p style='margin-bottom: 15px;'>Upload audio content and transform it using the reference voice</p>")
+            with gr.Row():
+                with gr.Column():
+                    input_audio = gr.Audio(
+                        label="Input Audio (Content to Transform)",
+                        type="filepath",
+                        sources=["upload", "microphone"]
+                    )
+                    with gr.Row():
+                        voice_language = gr.Dropdown(
+                            choices=[
+                                ("🇺🇸 English", "en"),
+                                ("🇪🇸 Spanish", "es"),
+                                ("🇫🇷 French", "fr"),
+                                ("🇩🇪 German", "de"),
+                                ("🇮🇹 Italian", "it"),
+                                ("🇧🇷 Portuguese", "pt"),
+                                ("🇨🇳 Chinese", "zh"),
+                                ("🇯🇵 Japanese", "ja"),
+                                ("🇰🇷 Korean", "ko"),
+                                ("🇷🇺 Russian", "ru")
+                            ],
+                            value="en",
+                            label="Language"
+                        )
+                        voice_exaggeration = gr.Slider(
+                            minimum=0.0,
+                            maximum=2.0,
+                            step=0.1,
+                            value=0.5,
+                            label="🎭 Emotion Exaggeration"
+                        )
+                        voice_cfg = gr.Slider(
+                            minimum=0.2,
+                            maximum=1.0,
+                            step=0.1,
+                            value=0.5,
+                            label="🎛️ CFG Scale"
+                        )
+                    voice_clone_btn = gr.Button(
+                        "🎤 Transform Voice (Audio → Cloned Audio)",
+                        variant="primary",
+                        size="lg"
+                    )
+        # Tab 2: Text-to-Voice Cloning
+        with gr.TabItem("📝 Text-to-Speech Cloning"):
+            gr.HTML("<p style='margin-bottom: 15px;'>Enter text and generate speech using the reference voice</p>")
+            with gr.Row():
+                with gr.Column():
+                    text_input = gr.Textbox(
+                        label="Text to Convert to Speech",
+                        placeholder="Enter the text you want to speak in the cloned voice...",
+                        lines=4,
+                        max_lines=8
+                    )
+                    with gr.Row():
+                        text_language = gr.Dropdown(
+                            choices=[
+                                ("🇺🇸 English", "en"),
+                                ("🇪🇸 Spanish", "es"),
+                                ("🇫🇷 French", "fr"),
+                                ("🇩🇪 German", "de"),
+                                ("🇮🇹 Italian", "it"),
+                                ("🇧🇷 Portuguese", "pt"),
+                                ("🇨🇳 Chinese", "zh"),
+                                ("🇯🇵 Japanese", "ja")
+                            ],
+                            value="en",
+                            label="Language"
+                        )
+                        text_exaggeration = gr.Slider(
+                            minimum=0.0,
+                            maximum=2.0,
+                            step=0.1,
+                            value=0.5,
+                            label="🎭 Emotion Exaggeration"
+                        )
+                        text_cfg = gr.Slider(
+                            minimum=0.2,
+                            maximum=1.0,
+                            step=0.1,
+                            value=0.5,
+                            label="🎛️ CFG Scale"
+                        )
+                    text_clone_btn = gr.Button(
+                        "📝 Generate Speech (Text → Cloned Audio)",
+                        variant="secondary",
+                        size="lg"
+                    )
+    # Output Section
+    gr.HTML("<h3 style='color: #8B5CF6;'>🎵 Generated Audio Output</h3>")
+    with gr.Row():
+        audio_output = gr.Audio(
+            label="Cloned Voice Result",
+            type="filepath"
+        )
+        status_output = gr.Textbox(
+            label="Processing Status & Details",
+            lines=6,
+            interactive=False
+        )
+    # Examples Section
+    with gr.Accordion("💡 Example Texts for Testing", open=False):
+        examples = [
+            "Hello, this is a demonstration of real voice cloning technology using Chatterbox.",
+            "The weather is beautiful today, perfect for a walk in the park with friends.",
+            "Artificial intelligence is revolutionizing how we create and interact with digital content.",
+            "This advanced voice cloning system can generate natural speech in multiple languages."
+        ]
+        gr.Examples(
+            examples=examples,
+            inputs=text_input,
+            label="Click to try these example texts:"
+        )
+    # How It Works Section
+    with gr.Accordion("🔍 How Voice Cloning Works", open=False):
         gr.Markdown("""
+        ### Voice-to-Voice Cloning Process
+        1. **🎤 Upload Reference Voice**: The voice you want to clone (5+ seconds)
+        2. **📥 Upload Input Audio**: Audio content you want to transform
+        3. **🧠 Content Extraction**: AI extracts speech content from input audio
+        4. **🎭 Voice Application**: Reference voice characteristics applied to content
+        5. **🎵 Generate Output**: New audio with original content in cloned voice
+        ### Text-to-Speech Process
+        1. **🎤 Upload Reference Voice**: The voice you want to clone
+        2. **📝 Enter Text**: Type the content to convert to speech
+        3. **🎛️ Adjust Controls**: Set emotion and speech parameters
+        4. **🎵 Generate Speech**: Create natural speech in the cloned voice
+        ### Chatterbox Controls
+        - **Emotion Exaggeration**: 0.0 = monotone, 2.0 = very expressive
+        - **CFG Scale**: 0.2 = creative, 1.0 = accurate to reference
+        - **Language Support**: 23+ languages with multilingual model
         """)
+    # Event Handlers
+    voice_clone_btn.click(
+        fn=voice_to_voice_cloning,
+        inputs=[reference_audio, input_audio, voice_language, voice_exaggeration, voice_cfg],
+        outputs=[audio_output, status_output],
+        show_progress=True
+    )
+    text_clone_btn.click(
+        fn=text_to_voice_cloning,
+        inputs=[reference_audio, text_input, text_language, text_exaggeration, text_cfg],
         outputs=[audio_output, status_output],
         show_progress=True
     )