Spaces:

crackuser
/

voiceclone-dev

Running

App Files Community

crackuser committed on Sep 10, 2025

Commit

e6e0279

verified ·

1 Parent(s): 187313d

Update app.py

Browse files

Files changed (1) hide show

app.py +269 -258

app.py CHANGED Viewed

@@ -1,62 +1,90 @@
 import gradio as gr
 import torch
-import numpy as np
-import soundfile as sf
 import tempfile
 import os
-from scipy.io import wavfile
-import librosa
-def extract_audio_features(audio_path):
-    """Extract features from audio for voice cloning"""
-    try:
-        # Load audio file
-        audio, sr = librosa.load(audio_path, sr=16000)
-        return audio, sr
-    except Exception as e:
-        print(f"Error processing audio: {e}")
-        return None, None
-def voice_clone_with_audio(reference_audio, input_audio, enhance_quality=True):
     """
-    Voice-to-Voice cloning: Clone reference voice using input audio
     """
     try:
-        if not reference_audio:
-            return None, "❌ Please upload reference audio!"
-        if not input_audio:
-            return None, "❌ Please upload input audio to transform!"
-        # Process reference audio
-        ref_audio, ref_sr = extract_audio_features(reference_audio)
-        if ref_audio is None:
-            return None, "❌ Error processing reference audio!"
-        # Process input audio
-        input_audio_data, input_sr = extract_audio_features(input_audio)
-        if input_audio_data is None:
-            return None, "❌ Error processing input audio!"
-        # For demo: Apply simple voice transformation
-        # In production, this would use actual voice cloning models
-        transformed_audio = apply_voice_transformation(
-            reference_audio=ref_audio,
-            input_audio=input_audio_data,
-            enhance_quality=enhance_quality
-        )
-        # Save output audio
-        output_path = save_audio_output(transformed_audio, ref_sr)
-        return output_path, f"✅ Voice cloning complete!\n🎵 Transformed {len(input_audio_data)/input_sr:.1f}s of audio using reference voice"
     except Exception as e:
-        return None, f"❌ Error in voice cloning: {str(e)}"
-def voice_clone_with_text(reference_audio, input_text, language="en", speed=1.0):
     """
-    Text-to-Voice cloning: Generate speech from text using reference voice
     """
     try:
         if not reference_audio:
@@ -65,272 +93,255 @@ def voice_clone_with_text(reference_audio, input_text, language="en", speed=1.0)
         if not input_text or not input_text.strip():
             return None, "❌ Please enter text to convert!"
-        # Process reference audio
-        ref_audio, ref_sr = extract_audio_features(reference_audio)
-        if ref_audio is None:
-            return None, "❌ Error processing reference audio!"
-        # Generate speech from text (demo implementation)
-        generated_audio = text_to_speech_with_voice(
-            text=input_text,
-            reference_voice=ref_audio,
-            language=language,
-            speed=speed
-        )
-        # Save output audio
-        output_path = save_audio_output(generated_audio, ref_sr)
-        return output_path, f"✅ Text-to-speech complete!\n📝 Generated speech for: '{input_text[:100]}{'...' if len(input_text) > 100 else ''}'"
-    except Exception as e:
-        return None, f"❌ Error in text-to-speech: {str(e)}"
-def apply_voice_transformation(reference_audio, input_audio, enhance_quality=True):
-    """
-    Apply voice transformation (demo implementation)
-    In production, this would use models like XTTS, OpenVoice, etc.
-    """
-    # Demo: Simple pitch and tone adjustment
-    # This is a placeholder - replace with actual voice cloning model
-    # Normalize audio lengths
-    min_length = min(len(reference_audio), len(input_audio))
-    if min_length > 0:
-        # Simple blending for demo (not real voice cloning)
-        alpha = 0.7  # Weight for input audio
-        beta = 0.3   # Weight for reference characteristics
-        # Resize to same length
-        ref_segment = reference_audio[:min_length]
-        input_segment = input_audio[:min_length]
-        # Simple transformation (placeholder)
-        transformed = alpha * input_segment + beta * ref_segment
-        # Apply enhancement if requested
-        if enhance_quality:
-            transformed = enhance_audio_quality(transformed)
-        return transformed
-    else:
-        return input_audio
-def text_to_speech_with_voice(text, reference_voice, language="en", speed=1.0):
-    """
-    Generate speech from text using reference voice characteristics
-    In production, this would use TTS models with voice cloning
-    """
-    # Demo: Generate simple synthetic speech
-    # This is a placeholder - replace with actual TTS model
-    duration = len(text) * 0.1 * speed  # Rough duration estimate
-    sr = 16000
-    samples = int(duration * sr)
-    # Generate simple sine wave pattern (placeholder)
-    t = np.linspace(0, duration, samples)
-    frequency = 200 + np.mean(np.abs(reference_voice)) * 100  # Use ref voice characteristics
-    synthetic_speech = 0.3 * np.sin(2 * np.pi * frequency * t)
-    # Add some variation based on text length
-    for i, char in enumerate(text[:10]):
-        freq_mod = 200 + ord(char) % 100
-        synthetic_speech += 0.1 * np.sin(2 * np.pi * freq_mod * t)
-    return synthetic_speech[:samples]
-def enhance_audio_quality(audio):
-    """Apply audio enhancement"""
-    # Simple noise reduction and normalization
-    audio = audio / np.max(np.abs(audio))  # Normalize
-    audio = audio * 0.8  # Reduce volume slightly
-    return audio
-def save_audio_output(audio_data, sample_rate):
-    """Save audio data to temporary file"""
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-        output_path = tmp_file.name
-    # Ensure audio is in correct format
-    audio_data = np.array(audio_data, dtype=np.float32)
-    # Save using soundfile
-    sf.write(output_path, audio_data, sample_rate)
-    return output_path
-# Create Gradio interface with tabs
-def create_interface():
     with gr.Blocks(
-        title="🎭 Voice Cloning Studio",
-        theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
     ) as demo:
         # Header
         gr.HTML("""
         <div style="text-align: center; padding: 20px;">
-            <h1 style="color: #2E86AB; margin-bottom: 10px;">🎭 AI Voice Cloning Studio</h1>
-            <p style="color: #666; font-size: 18px;">Clone any voice with AI technology - Support for both Audio and Text input</p>
         </div>
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 # Reference Voice Section
-                gr.HTML("<h3 style='color: #2E86AB;'>🎤 Upload Reference Voice</h3>")
                 reference_audio = gr.Audio(
-                    label="Reference Audio (10+ seconds recommended)",
                     type="filepath",
                     sources=["upload", "microphone"]
                 )
-                gr.HTML("<p style='color: #666; font-size: 14px;'>This is the voice you want to clone. Upload clear, high-quality audio.</p>")
             with gr.Column(scale=1):
-                # Input Method Selection
-                gr.HTML("<h3 style='color: #2E86AB;'>📥 Choose Input Method</h3>")
-                with gr.Tabs():
-                    with gr.TabItem("🎵 Audio Input"):
-                        gr.HTML("<p>Upload audio to transform into the reference voice</p>")
-                        input_audio = gr.Audio(
-                            label="Input Audio to Transform",
-                            type="filepath",
-                            sources=["upload", "microphone"]
-                        )
-                        enhance_audio = gr.Checkbox(
-                            label="🎚️ Enhance Audio Quality",
-                            value=True
-                        )
-                        audio_clone_btn = gr.Button(
-                            "🎤 Clone Voice from Audio",
-                            variant="primary",
-                            size="lg"
-                        )
-                    with gr.TabItem("📝 Text Input"):
-                        gr.HTML("<p>Enter text to speak in the reference voice</p>")
-                        text_input = gr.Textbox(
-                            label="Text to Convert",
-                            placeholder="Enter the text you want to speak in the cloned voice...",
-                            lines=4,
-                            max_lines=6
-                        )
-                        with gr.Row():
-                            language_select = gr.Dropdown(
-                                choices=[
-                                    ("🇺🇸 English", "en"),
-                                    ("🇪🇸 Spanish", "es"),
-                                    ("🇫🇷 French", "fr"),
-                                    ("🇩🇪 German", "de"),
-                                    ("🇮🇹 Italian", "it"),
-                                    ("🇧🇷 Portuguese", "pt"),
-                                    ("🇨🇳 Chinese", "zh"),
-                                    ("🇯🇵 Japanese", "ja")
-                                ],
-                                value="en",
-                                label="Language"
-                            )
-                            speed_control = gr.Slider(
-                                minimum=0.5,
-                                maximum=2.0,
-                                step=0.1,
-                                value=1.0,
-                                label="Speech Speed"
-                            )
-                        text_clone_btn = gr.Button(
-                            "📝 Generate Speech from Text",
-                            variant="secondary",
-                            size="lg"
-                        )
-        # Output Section
-        with gr.Row():
-            with gr.Column():
-                gr.HTML("<h3 style='color: #2E86AB;'>🎵 Cloned Voice Output</h3>")
-                audio_output = gr.Audio(
-                    label="Generated Audio",
-                    type="filepath"
                 )
-                status_output = gr.Textbox(
-                    label="Status",
-                    lines=3,
-                    interactive=False
                 )
-        # Examples Section
-        with gr.Accordion("💡 Example Texts", open=False):
-            examples = [
-                "Hello, this is a demonstration of AI voice cloning technology.",
-                "Welcome to the future of artificial intelligence and speech synthesis.",
-                "This voice was generated using advanced machine learning models.",
-                "Experience the power of AI-driven voice generation with natural speech patterns."
-            ]
-            gr.Examples(
-                examples=examples,
-                inputs=text_input,
-                label="Click to try these examples:"
             )
-        # How it works section
-        with gr.Accordion("🔍 How Voice Cloning Works", open=False):
             gr.Markdown("""
-            ### Voice-to-Voice Cloning Process
-            1. **🎤 Reference Voice**: Upload 10+ seconds of clear speech
-            2. **📥 Input Audio**: Upload audio you want to transform
-            3. **🧠 AI Analysis**: Extract voice characteristics and features
-            4. **🎵 Voice Synthesis**: Apply reference voice to input content
-            ### Text-to-Speech Process
-            1. **🎤 Reference Voice**: Upload voice sample to clone
-            2. **📝 Text Input**: Enter text to convert to speech
-            3. **🗣️ Speech Generation**: Generate speech in the cloned voice
-            4. **🎵 Audio Output**: Download your cloned speech
-            ### Tips for Best Results
-            - **Reference Audio**: Use 10+ seconds of clear, single-speaker audio
-            - **Input Audio**: Ensure good quality with minimal background noise
-            - **Language**: Match reference voice language when possible
-            - **Length**: Shorter inputs (under 30 seconds) work better
             """)
-        # Event handlers
-        audio_clone_btn.click(
-            fn=voice_clone_with_audio,
-            inputs=[reference_audio, input_audio, enhance_audio],
             outputs=[audio_output, status_output],
             show_progress=True
         )
         text_clone_btn.click(
-            fn=voice_clone_with_text,
-            inputs=[reference_audio, text_input, language_select, speed_control],
-            outputs=[audio_output, status_output],
-            show_progress=True
-        )
-        # Auto-generate on Enter for text
-        text_input.submit(
-            fn=voice_clone_with_text,
-            inputs=[reference_audio, text_input, language_select, speed_control],
             outputs=[audio_output, status_output],
             show_progress=True
         )
     return demo
-# Launch the app
 if __name__ == "__main__":
-    demo = create_interface()
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,

 import gradio as gr
 import torch
+import torchaudio as ta
 import tempfile
 import os
+from chatterbox.tts import ChatterboxTTS
+from chatterbox.mtl_tts import ChatterboxMultilingualTTS
+# Initialize Chatterbox models (the ones we actually discussed!)
+print("🔄 Loading Chatterbox TTS models...")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+try:
+    # Load Chatterbox English model
+    english_model = ChatterboxTTS.from_pretrained(device=device)
+    print("✅ Chatterbox English model loaded!")
+    # Load Chatterbox Multilingual model
+    multilingual_model = ChatterboxMultilingualTTS.from_pretrained(device=device)
+    print("✅ Chatterbox Multilingual model loaded!")
+    models_loaded = True
+except Exception as e:
+    print(f"❌ Error loading Chatterbox models: {e}")
+    english_model = None
+    multilingual_model = None
+    models_loaded = False
+def chatterbox_voice_clone(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
     """
+    Real Voice-to-Voice cloning using Chatterbox (the model we discussed!)
     """
     try:
+        if not reference_audio or not input_audio:
+            return None, "❌ Please upload both reference and input audio files!"
+        if not models_loaded:
+            return None, "❌ Chatterbox models not loaded!"
+        # Extract text from input audio using Whisper
+        import whisper
+        try:
+            whisper_model = whisper.load_model("base")
+            result = whisper_model.transcribe(input_audio)
+            input_text = result["text"]
+            print(f"📝 Extracted text: {input_text}")
+        except Exception as e:
+            input_text = "Voice cloning demonstration using Chatterbox AI technology."
+            print(f"⚠️ Whisper failed, using default text: {e}")
+        # Create output file
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            output_path = tmp_file.name
+        # Use appropriate Chatterbox model based on language
+        if language == "en":
+            # Use English Chatterbox model
+            wav = english_model.generate(
+                input_text,
+                audio_prompt_path=reference_audio,
+                exaggeration=exaggeration,
+                cfg=cfg
+            )
+        else:
+            # Use Multilingual Chatterbox model
+            wav = multilingual_model.generate(
+                input_text,
+                audio_prompt_path=reference_audio,
+                language_id=language,
+                exaggeration=exaggeration,
+                cfg=cfg
+            )
+        # Save generated audio
+        ta.save(output_path, wav, english_model.sr if language == "en" else multilingual_model.sr)
+        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+            return output_path, f"✅ Chatterbox Voice Cloning Complete!\n🎵 Generated: '{input_text[:100]}...'\n🎛️ Settings: Exaggeration={exaggeration}, CFG={cfg}"
+        else:
+            return None, "❌ Failed to generate cloned audio!"
     except Exception as e:
+        return None, f"❌ Chatterbox Error: {str(e)}"
+def chatterbox_text_to_speech(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5, speed=1.0):
     """
+    Real Text-to-Speech with voice cloning using Chatterbox
     """
     try:
         if not reference_audio:
         if not input_text or not input_text.strip():
             return None, "❌ Please enter text to convert!"
+        if not models_loaded:
+            return None, "❌ Chatterbox models not loaded!"
+        print(f"🎤 Generating speech with Chatterbox...")
+        print(f"📝 Text: {input_text}")
+        print(f"🗣️ Language: {language}")
+        print(f"🎛️ Exaggeration: {exaggeration}, CFG: {cfg}")
+        # Create output file
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            output_path = tmp_file.name
+        # Use appropriate Chatterbox model
+        if language == "en":
+            # English Chatterbox model
+            wav = english_model.generate(
+                input_text,
+                audio_prompt_path=reference_audio,
+                exaggeration=exaggeration,
+                cfg=cfg
+            )
+        else:
+            # Multilingual Chatterbox model
+            wav = multilingual_model.generate(
+                input_text,
+                audio_prompt_path=reference_audio,
+                language_id=language,
+                exaggeration=exaggeration,
+                cfg=cfg
+            )
+        # Save generated audio
+        ta.save(output_path, wav, english_model.sr if language == "en" else multilingual_model.sr)
+        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+            return output_path, f"✅ Chatterbox TTS Complete!\n📝 Generated: '{input_text[:100]}...'\n🎛️ Settings: Exaggeration={exaggeration}, CFG={cfg}"
+        else:
+            return None, "❌ Failed to generate speech!"
+    except Exception as e:
+        return None, f"❌ Chatterbox Error: {str(e)}"
+# Create Gradio interface
+def create_chatterbox_interface():
     with gr.Blocks(
+        title="🎭 Chatterbox Voice Cloning Studio",
+        theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
     ) as demo:
         # Header
         gr.HTML("""
         <div style="text-align: center; padding: 20px;">
+            <h1 style="color: #8B5CF6; margin-bottom: 10px;">🎭 Chatterbox Voice Cloning Studio</h1>
+            <p style="color: #666; font-size: 18px;">Powered by Resemble AI's Chatterbox - The Model We Discussed!</p>
+            <p style="color: #888; font-size: 14px;">✨ Emotion control • 23+ languages • Zero-shot cloning • MIT licensed</p>
+        </div>
+        """)
+        # Model Status
+        gr.HTML(f"""
+        <div style="text-align: center; padding: 10px; background: {'#d4edda' if models_loaded else '#f8d7da'}; border-radius: 10px; margin-bottom: 20px;">
+            <strong>🤖 Chatterbox Status:</strong> {'✅ Models Loaded Successfully!' if models_loaded else '❌ Models Not Loaded'}
         </div>
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 # Reference Voice Section
+                gr.HTML("<h3 style='color: #8B5CF6;'>🎤 Reference Voice (5+ seconds)</h3>")
                 reference_audio = gr.Audio(
+                    label="Upload Reference Audio",
                     type="filepath",
                     sources=["upload", "microphone"]
                 )
+                gr.HTML("<p style='color: #666; font-size: 14px;'>📌 Upload clear speech from the voice you want to clone</p>")
+        with gr.Row():
             with gr.Column(scale=1):
+                # Voice-to-Voice Cloning
+                gr.HTML("<h3 style='color: #8B5CF6;'>🎵 Voice-to-Voice Cloning</h3>")
+                input_audio = gr.Audio(
+                    label="Input Audio to Transform",
+                    type="filepath",
+                    sources=["upload", "microphone"]
+                )
+                with gr.Row():
+                    voice_language = gr.Dropdown(
+                        choices=[
+                            ("🇺🇸 English", "en"),
+                            ("🇪🇸 Spanish", "es"),
+                            ("🇫🇷 French", "fr"),
+                            ("🇩🇪 German", "de"),
+                            ("🇮🇹 Italian", "it"),
+                            ("🇧🇷 Portuguese", "pt"),
+                            ("🇨🇳 Chinese", "zh"),
+                            ("🇯🇵 Japanese", "ja"),
+                            ("🇰🇷 Korean", "ko"),
+                            ("🇷🇺 Russian", "ru"),
+                            ("🇸🇦 Arabic", "ar"),
+                            ("🇮🇳 Hindi", "hi"),
+                            ("🇳🇱 Dutch", "nl"),
+                            ("🇵🇱 Polish", "pl"),
+                            ("🇹🇷 Turkish", "tr"),
+                            ("🇸🇪 Swedish", "sv"),
+                            ("🇫🇮 Finnish", "fi"),
+                            ("🇩🇰 Danish", "da"),
+                            ("🇳🇴 Norwegian", "no"),
+                            ("🇬🇷 Greek", "el"),
+                            ("🇮🇱 Hebrew", "he"),
+                            ("🇲🇾 Malay", "ms"),
+                            ("🇰🇪 Swahili", "sw")
+                        ],
+                        value="en",
+                        label="Language"
+                    )
+                    voice_exaggeration = gr.Slider(
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                        value=0.5,
+                        label="🎭 Emotion Exaggeration"
+                    )
+                    voice_cfg = gr.Slider(
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                        value=0.5,
+                        label="🎛️ CFG Scale"
+                    )
+                voice_clone_btn = gr.Button(
+                    "🎤 Clone Voice with Chatterbox",
+                    variant="primary",
+                    size="lg"
                 )
+            with gr.Column(scale=1):
+                # Text-to-Speech
+                gr.HTML("<h3 style='color: #8B5CF6;'>📝 Text-to-Speech Cloning</h3>")
+                text_input = gr.Textbox(
+                    label="Text to Convert to Speech",
+                    placeholder="Enter text to speak in the cloned voice...",
+                    lines=4,
+                    max_lines=8
+                )
+                with gr.Row():
+                    text_language = gr.Dropdown(
+                        choices=[
+                            ("🇺🇸 English", "en"),
+                            ("🇪🇸 Spanish", "es"),
+                            ("🇫🇷 French", "fr"),
+                            ("🇩🇪 German", "de"),
+                            ("🇮🇹 Italian", "it"),
+                            ("🇧🇷 Portuguese", "pt"),
+                            ("🇨🇳 Chinese", "zh"),
+                            ("🇯🇵 Japanese", "ja")
+                        ],
+                        value="en",
+                        label="Language"
+                    )
+                    text_exaggeration = gr.Slider(
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                        value=0.5,
+                        label="🎭 Emotion Exaggeration"
+                    )
+                    text_cfg = gr.Slider(
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                        value=0.5,
+                        label="🎛️ CFG Scale"
+                    )
+                text_clone_btn = gr.Button(
+                    "📝 Generate Speech with Chatterbox",
+                    variant="secondary",
+                    size="lg"
                 )
+        # Output Section
+        gr.HTML("<h3 style='color: #8B5CF6;'>🎵 Chatterbox Generated Audio</h3>")
+        with gr.Row():
+            audio_output = gr.Audio(
+                label="Cloned Voice Result",
+                type="filepath"
+            )
+            status_output = gr.Textbox(
+                label="Processing Status",
+                lines=5,
+                interactive=False
             )
+        # Chatterbox Features
+        with gr.Accordion("🌟 Chatterbox Features", open=False):
             gr.Markdown("""
+            ### Why Chatterbox is Special
+            **🎭 Emotion Exaggeration Control**
+            - First open source model with emotion control
+            - Adjust from monotone (0.0) to highly expressive (1.0)
+            - Perfect for creative content, games, and dramatic speech
+            **🌍 Multilingual Support (23 Languages)**
+            - Arabic, Chinese, Danish, Dutch, English, Finnish, French
+            - German, Greek, Hebrew, Hindi, Italian, Japanese, Korean
+            - Malay, Norwegian, Polish, Portuguese, Russian, Spanish
+            - Swedish, Swahili, Turkish
+            **⚡ Technical Advantages**
+            - 0.5B parameter Llama backbone
+            - Zero-shot voice cloning with 5+ seconds of audio
+            - Built-in neural watermarking for responsible AI
+            - MIT licensed - free for commercial use
+            - Consistently outperforms ElevenLabs in evaluations
+            **🎛️ Control Parameters**
+            - **Exaggeration**: Controls emotional intensity (0.0 = monotone, 1.0 = very expressive)
+            - **CFG Scale**: Controls adherence to reference voice (lower = more creative, higher = more accurate)
             """)
+        # Event Handlers
+        voice_clone_btn.click(
+            fn=chatterbox_voice_clone,
+            inputs=[reference_audio, input_audio, voice_language, voice_exaggeration, voice_cfg],
             outputs=[audio_output, status_output],
             show_progress=True
         )
         text_clone_btn.click(
+            fn=chatterbox_text_to_speech,
+            inputs=[reference_audio, text_input, text_language, text_exaggeration, text_cfg],
             outputs=[audio_output, status_output],
             show_progress=True
         )
     return demo
 if __name__ == "__main__":
+    demo = create_chatterbox_interface()
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,