Update app.py
app.py
CHANGED
@@ -3,47 +3,49 @@ import torch
 import torchaudio
 import tempfile
 import os
-import logging
-
-# Setup logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 
 # Device detection
-DEVICE = "cpu"
-if torch.cuda.is_available():
-    DEVICE = "cuda"
-    logger.info("🚀 Running on CUDA GPU")
-else:
-    logger.info("🚀 Running on CPU")
-
-print(f"🚀 Running on device: {DEVICE}")
 
-# Global models
-ENGLISH_MODEL = None
-MULTILINGUAL_MODEL = None
 
-def load_chatterbox_models():
-    """Load Chatterbox models"""
-    global ENGLISH_MODEL, MULTILINGUAL_MODEL
 
-    ...
 
-def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
     """
-    🎤 VOICE-TO-VOICE CLONING
-    ...
     """
     try:
         if not reference_audio:
@@ -52,64 +54,50 @@ def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
         if not input_audio:
             return None, "❌ Please upload input audio (content to transform)!"
 
-        ...
 
         # Step 1: Extract text from input audio using Whisper
-        try:
-            ...
-            whisper_model = whisper.load_model("base")
-            result = whisper_model.transcribe(input_audio)
             extracted_text = result["text"]
-            print(f"...")
-        except Exception as e:
-            print(f"⚠️ Whisper failed: {e}")
             extracted_text = "Voice cloning demonstration using uploaded audio content."
 
-        # Step 2: Load Chatterbox models
-        ...
-        if not load_chatterbox_models():
-            return None, "❌ Chatterbox models failed to load!"
 
-        # Step 3: Generate voice using Chatterbox
-        print("🎭 Generating cloned voice...")
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
 
-        # Use the appropriate Chatterbox model
-        if language == "en":
-            model = ENGLISH_MODEL
-            wav = model.generate(
-                extracted_text,
-                audio_prompt_path=reference_audio,
-                exaggeration=exaggeration,
-                cfg=cfg
-            )
-        else:
-            model = MULTILINGUAL_MODEL
-            wav = model.generate(
-                extracted_text,
-                audio_prompt_path=reference_audio,
-                language_id=language,
-                exaggeration=exaggeration,
-                cfg=cfg
-            )
-
-        # Step 4: Save generated audio
-        torchaudio.save(output_path, wav.cpu(), model.sr)
 
         if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-            return output_path, f"✅ Voice-to-Voice Cloning Complete!\n🎤 ..."
         else:
             return None, "❌ Generated audio file is empty!"
 
     except Exception as e:
-        ...
 
-def text_to_voice_cloning(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5):
     """
-    📝 TEXT-TO-VOICE CLONING
-    Generates speech from text using reference voice
     """
     try:
         if not reference_audio:
@@ -118,99 +106,79 @@ def text_to_voice_cloning(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5):
         if not input_text or not input_text.strip():
             return None, "❌ Please enter text to convert!"
 
-        ...
 
-        if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
-            if not load_chatterbox_models():
-                return None, "❌ Chatterbox models failed to load!"
 
-        # Generate speech using Chatterbox
-        print("🎭 Generating speech...")
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
 
-        # Use the appropriate Chatterbox model
-        if language == "en":
-            model = ENGLISH_MODEL
-            wav = model.generate(
-                input_text,
-                audio_prompt_path=reference_audio,
-                exaggeration=exaggeration,
-                cfg=cfg
-            )
-        else:
-            model = MULTILINGUAL_MODEL
-            wav = model.generate(
-                input_text,
-                audio_prompt_path=reference_audio,
-                language_id=language,
-                exaggeration=exaggeration,
-                cfg=cfg
-            )
-
-        # Save generated audio
-        torchaudio.save(output_path, wav.cpu(), model.sr)
 
         if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-            return output_path, f"✅ Text-to-Voice Complete!\n📝 Generated ..."
         else:
             return None, "❌ Generated audio file is empty!"
 
     except Exception as e:
-        ...
 
-# Try loading models at startup
-try:
-    models_loaded = load_chatterbox_models()
-    startup_message = "✅ Chatterbox Models Ready!" if models_loaded else "⚠️ Models will load on first use"
-except Exception as e:
-    models_loaded = False
-    startup_message = f"⚠️ Model loading will be attempted on first use: {str(e)}"
 
-# Create Gradio interface with both functionalities
 with gr.Blocks(
-    title="🎭 ...",
-    theme=gr.themes.Soft(primary_hue="...")
 ) as demo:
 
-    # Header
     gr.HTML("""
     <div style="text-align: center; padding: 20px;">
-        <h1 style="color: #...">...</h1>
-        <p style="color: #666; font-size: 18px;">Voice-to-Voice & Text-to-Speech ...</p>
-        <p style="color: #888; font-size: 14px;">...</p>
     </div>
     """)
 
-    # Status
     gr.HTML(f"""
-    <div style="text-align: center; padding: 15px; background: ...">
-        <strong>🤖 ...</strong> {startup_message}
     </div>
     """)
 
-    # Reference Voice (shared)
-    gr.HTML("<h3 style='color: #...'>...</h3>")
     reference_audio = gr.Audio(
-        label="Upload Reference Audio (...)",
         type="filepath",
         sources=["upload", "microphone"]
     )
-    gr.HTML("<p style='color: #666; text-align: center; margin-bottom: 20px;'>🎯 This voice will be cloned and applied to your content</p>")
 
-    # Tabs for different modes
     with gr.Tabs():
-        # ...
         with gr.TabItem("🎵 Voice-to-Voice Cloning"):
             gr.HTML("""
-            <div style="padding: 15px; background: #...">
-                <h4>...</h4>
-                <p>1. ...<br>
-                2. ...<br>
-                3. ...<br>
-                4. ...</p>
             </div>
             """)
 
@@ -222,162 +190,109 @@ with gr.Blocks(
                         sources=["upload", "microphone"]
                     )
 
-                    ...
-                    )
-
-                    voice_exaggeration = gr.Slider(
-                        minimum=0.0,
-                        maximum=2.0,
-                        step=0.1,
-                        value=0.5,
-                        label="🎭 Emotion Exaggeration"
-                    )
-
-                    voice_cfg = gr.Slider(
-                        minimum=0.1,
-                        maximum=1.0,
-                        step=0.1,
-                        value=0.5,
-                        label="🎛️ CFG Scale (Accuracy)"
-                    )
 
-                    ...
                         "🎤 Transform Voice (Audio → Cloned Audio)",
                         variant="primary",
                         size="lg"
                     )
 
                 with gr.Column():
-                    ...
-                        label="Voice-to-Voice Result",
-                        type="filepath"
-                    )
-
                     voice_status = gr.Textbox(
                         label="Voice-to-Voice Status",
                         lines=6,
                         interactive=False
                     )
 
-        # ...
         with gr.TabItem("📝 Text-to-Speech Cloning"):
             gr.HTML("""
             <div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
-                <h4>...</h4>
-                <p>1. ...<br>
-                2. ...<br>
-                3. ...<br>
-                4. ...</p>
             </div>
             """)
 
             with gr.Row():
                 with gr.Column():
                     text_input = gr.Textbox(
-                        label="Text to Convert ...",
-                        placeholder="Enter ...",
-                        lines=5,
-                        max_lines=8
                     )
 
-                    ...
-                    )
-
-                    text_exaggeration = gr.Slider(
-                        minimum=0.0,
-                        maximum=2.0,
-                        step=0.1,
-                        value=0.5,
-                        label="🎭 Emotion Exaggeration"
-                    )
-
-                    text_cfg = gr.Slider(
-                        minimum=0.1,
-                        maximum=1.0,
-                        step=0.1,
-                        value=0.5,
-                        label="🎛️ CFG Scale (Accuracy)"
-                    )
 
-                    ...
                         "📝 Generate Speech (Text → Cloned Audio)",
                         variant="secondary",
                         size="lg"
                     )
 
                 with gr.Column():
-                    ...
-                        label="Text-to-Speech Result",
-                        type="filepath"
-                    )
-
                     text_status = gr.Textbox(
                         label="Text-to-Speech Status",
                         lines=6,
                         interactive=False
                     )
 
-    # Examples
     with gr.Accordion("💡 Example Texts", open=False):
         examples = [
-            "Hello, this is a demonstration of AI voice cloning ...",
-            "The weather is beautiful ...",
-            "Artificial intelligence ...",
-            "This advanced voice cloning system can generate natural speech in multiple languages."
         ]
-
-        gr.Examples(
-            examples=examples,
-            inputs=text_input,
-            label="Click to use these example texts:"
-        )
 
-    # ...
-    ...
-        fn=voice_to_voice_cloning,
-        inputs=[reference_audio, input_audio, ...],
-        outputs=[...],
         show_progress=True
     )
 
-    ...
-        fn=text_to_voice_cloning,
-        inputs=[reference_audio, text_input, ...],
-        outputs=[...],
         show_progress=True
     )
 
 if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False
-    )
@@ -3,47 +3,49 @@ import torch
 import torchaudio
 import tempfile
 import os
 
 # Device detection
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"🚀 Using device: {DEVICE}")
 
+# Global models
+TTS_MODEL = None
+WHISPER_MODEL = None
 
+def load_models():
+    """Load TTS models with proper error handling"""
+    global TTS_MODEL, WHISPER_MODEL
 
+    print("🚀 Loading models...")
+
+    # Load XTTS-v2 (most reliable for voice cloning)
+    if TTS_MODEL is None:
+        try:
+            from TTS.api import TTS
+            os.environ["COQUI_TOS_AGREED"] = "1"
+            print("📦 Loading XTTS-v2...")
+            TTS_MODEL = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(DEVICE)
+            print("✅ XTTS-v2 loaded successfully!")
+        except Exception as e:
+            print(f"❌ XTTS-v2 failed: {e}")
+            return False
+
+    # Load Whisper for voice-to-voice
+    if WHISPER_MODEL is None:
+        try:
+            import whisper
+            print("📦 Loading Whisper...")
+            WHISPER_MODEL = whisper.load_model("base")
+            print("✅ Whisper loaded successfully!")
+        except Exception as e:
+            print(f"❌ Whisper failed: {e}")
+
+    return TTS_MODEL is not None
 
+def voice_to_voice_clone(reference_audio, input_audio, language="en"):
     """
+    🎤 VOICE-TO-VOICE CLONING - Real Implementation
+    Transform input audio content using reference voice characteristics
     """
     try:
         if not reference_audio:
@@ -52,64 +54,50 @@
         if not input_audio:
             return None, "❌ Please upload input audio (content to transform)!"
 
+        # Load models
+        if not load_models():
+            return None, "❌ XTTS-v2 model failed to load!"
+
+        print("🎤 Starting Voice-to-Voice Cloning...")
 
         # Step 1: Extract text from input audio using Whisper
+        if WHISPER_MODEL:
+            print("📝 Transcribing input audio...")
+            result = WHISPER_MODEL.transcribe(input_audio)
             extracted_text = result["text"]
+            print(f"✅ Extracted: {extracted_text[:100]}...")
+        else:
             extracted_text = "Voice cloning demonstration using uploaded audio content."
+            print("⚠️ Using fallback text")
 
+        # Step 2: Generate new audio with reference voice using XTTS-v2
+        print("🎭 Generating speech with cloned voice...")
 
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
 
+        # Use XTTS-v2 for voice cloning
+        TTS_MODEL.tts_to_file(
+            text=extracted_text,
+            speaker_wav=reference_audio,
+            language=language,
+            file_path=output_path
+        )
 
+        # Verify output
         if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+            return output_path, f"✅ Voice-to-Voice Cloning Complete!\n🎤 Original content: '{extracted_text[:100]}...'\n🎭 Applied reference voice characteristics\n🌍 Language: {language}\n🤖 Model: XTTS-v2"
         else:
             return None, "❌ Generated audio file is empty!"
 
     except Exception as e:
+        error_msg = f"❌ Voice-to-Voice Error: {str(e)}"
+        print(error_msg)
+        return None, error_msg
 
+def text_to_voice_clone(reference_audio, input_text, language="en"):
     """
+    📝 TEXT-TO-VOICE CLONING - Real Implementation
     """
     try:
         if not reference_audio:
@@ -118,99 +106,79 @@
         if not input_text or not input_text.strip():
             return None, "❌ Please enter text to convert!"
 
+        # Load models
+        if not load_models():
+            return None, "❌ XTTS-v2 model failed to load!"
 
+        print("📝 Starting Text-to-Voice Cloning...")
 
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
 
+        # Generate speech using XTTS-v2
+        TTS_MODEL.tts_to_file(
+            text=input_text,
+            speaker_wav=reference_audio,
+            language=language,
+            file_path=output_path
+        )
 
+        # Verify output
         if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+            return output_path, f"✅ Text-to-Voice Complete!\n📝 Generated: '{input_text[:100]}...'\n🎭 Using reference voice characteristics\n🌍 Language: {language}\n🤖 Model: XTTS-v2"
         else:
             return None, "❌ Generated audio file is empty!"
 
     except Exception as e:
+        error_msg = f"❌ Text-to-Voice Error: {str(e)}"
+        print(error_msg)
+        return None, error_msg
 
+# Try loading models at startup
+startup_success = load_models()
+startup_msg = "✅ XTTS-v2 Ready for Voice Cloning!" if startup_success else "⚠️ Models will load on first use"
 
+# Create Gradio interface with BOTH functionalities
 with gr.Blocks(
+    title="🎭 Voice Cloning Studio - XTTS-v2",
+    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
 ) as demo:
 
     gr.HTML("""
     <div style="text-align: center; padding: 20px;">
+        <h1 style="color: #2E86AB;">🎭 Voice Cloning Studio</h1>
+        <p style="color: #666; font-size: 18px;">Real Voice-to-Voice & Text-to-Speech Cloning</p>
+        <p style="color: #888; font-size: 14px;">Powered by XTTS-v2 - Production Ready Open Source Model</p>
     </div>
     """)
 
+    # Status
+    status_color = "#d4edda" if startup_success else "#fff3cd"
     gr.HTML(f"""
+    <div style="text-align: center; padding: 15px; background: {status_color}; border-radius: 10px; margin-bottom: 20px;">
+        <strong>🤖 Model Status:</strong> {startup_msg}
     </div>
     """)
 
+    # Reference Voice (shared)
+    gr.HTML("<h3 style='color: #2E86AB; text-align: center;'>🎤 Reference Voice (Voice to Clone)</h3>")
     reference_audio = gr.Audio(
+        label="Upload Reference Audio (6+ seconds recommended)",
         type="filepath",
         sources=["upload", "microphone"]
     )
 
+    # Tabs for different modes
     with gr.Tabs():
+        # VOICE-TO-VOICE CLONING TAB
         with gr.TabItem("🎵 Voice-to-Voice Cloning"):
             gr.HTML("""
+            <div style="padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 15px;">
+                <h4>🎤 Voice-to-Voice Process:</h4>
+                <p><strong>1.</strong> Upload reference voice (person to clone)<br>
+                <strong>2.</strong> Upload input audio (speech content to transform)<br>
+                <strong>3.</strong> AI extracts text from input audio using Whisper<br>
+                <strong>4.</strong> XTTS-v2 generates new audio with reference voice + extracted content</p>
             </div>
             """)
 
@@ -222,162 +190,109 @@ with gr.Blocks(
                         sources=["upload", "microphone"]
                     )
 
+                    voice_lang = gr.Dropdown(
+                        choices=[
+                            ("🇺🇸 English", "en"),
+                            ("🇪🇸 Spanish", "es"),
+                            ("🇫🇷 French", "fr"),
+                            ("🇩🇪 German", "de"),
+                            ("🇮🇹 Italian", "it"),
+                            ("🇧🇷 Portuguese", "pt"),
+                            ("🇨🇳 Chinese", "zh"),
+                            ("🇯🇵 Japanese", "ja"),
+                            ("🇰🇷 Korean", "ko"),
+                            ("🇷🇺 Russian", "ru")
+                        ],
+                        value="en",
+                        label="Language"
+                    )
 
+                    voice_btn = gr.Button(
                         "🎤 Transform Voice (Audio → Cloned Audio)",
                         variant="primary",
                         size="lg"
                     )
 
                 with gr.Column():
+                    voice_output = gr.Audio(label="Voice-to-Voice Result")
                     voice_status = gr.Textbox(
                         label="Voice-to-Voice Status",
                         lines=6,
                         interactive=False
                     )
 
+
# TEXT-TO-VOICE CLONING TAB
|
| 225 |
with gr.TabItem("๐ Text-to-Speech Cloning"):
|
| 226 |
gr.HTML("""
|
| 227 |
<div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
|
| 228 |
+
<h4>๐ Text-to-Speech Process:</h4>
|
| 229 |
+
<p><strong>1.</strong> Upload reference voice (person to clone)<br>
|
| 230 |
+
<strong>2.</strong> Enter text to convert to speech<br>
|
| 231 |
+
<strong>3.</strong> XTTS-v2 generates speech directly in the cloned voice<br>
|
| 232 |
+
<strong>4.</strong> Download high-quality result</p>
|
| 233 |
</div>
|
| 234 |
""")
|
| 235 |
|
| 236 |
with gr.Row():
|
| 237 |
with gr.Column():
|
| 238 |
text_input = gr.Textbox(
|
| 239 |
+
label="Text to Convert",
|
| 240 |
+
placeholder="Enter text to speak in the cloned voice...",
|
| 241 |
+
lines=5
|
|
|
|
| 242 |
)
|
| 243 |
|
| 244 |
+
text_lang = gr.Dropdown(
|
| 245 |
+
choices=[
|
| 246 |
+
("๐บ๐ธ English", "en"),
|
| 247 |
+
("๐ช๐ธ Spanish", "es"),
|
| 248 |
+
("๐ซ๐ท French", "fr"),
|
| 249 |
+
("๐ฉ๐ช German", "de"),
|
| 250 |
+
("๐ฎ๐น Italian", "it"),
|
| 251 |
+
("๐ง๐ท Portuguese", "pt"),
|
| 252 |
+
("๐จ๐ณ Chinese", "zh"),
|
| 253 |
+
("๐ฏ๐ต Japanese", "ja")
|
| 254 |
+
],
|
| 255 |
+
value="en",
|
| 256 |
+
label="Language"
|
| 257 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
|
| 259 |
+
text_btn = gr.Button(
|
| 260 |
"๐ Generate Speech (Text โ Cloned Audio)",
|
| 261 |
variant="secondary",
|
| 262 |
size="lg"
|
| 263 |
)
|
| 264 |
|
| 265 |
with gr.Column():
|
| 266 |
+
text_output = gr.Audio(label="Text-to-Speech Result")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
text_status = gr.Textbox(
|
| 268 |
label="Text-to-Speech Status",
|
| 269 |
lines=6,
|
| 270 |
interactive=False
|
| 271 |
)
|
| 272 |
|
+    # Examples
     with gr.Accordion("💡 Example Texts", open=False):
         examples = [
+            "Hello, this is a demonstration of AI voice cloning using XTTS-v2.",
+            "The weather today is absolutely beautiful, perfect for a walk in the park.",
+            "Artificial intelligence continues to revolutionize how we create and share content."
         ]
+        gr.Examples(examples=examples, inputs=text_input)
 
+    # Connect both functions - VOICE-TO-VOICE AND TEXT-TO-SPEECH
+    voice_btn.click(
+        fn=voice_to_voice_clone,
+        inputs=[reference_audio, input_audio, voice_lang],
+        outputs=[voice_output, voice_status],
         show_progress=True
     )
 
+    text_btn.click(
+        fn=text_to_voice_clone,
+        inputs=[reference_audio, text_input, text_lang],
+        outputs=[text_output, text_status],
         show_progress=True
     )
 
 if __name__ == "__main__":
+    demo.launch()
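
The two entry points this commit adds, voice_to_voice_clone and text_to_voice_clone, can also be exercised outside the Gradio UI. Below is a minimal local smoke-test sketch, assuming the new file above is saved as app.py, its dependencies (gradio, torch, torchaudio, Coqui TTS, openai-whisper) are installed, and ref_voice.wav / content.wav are placeholder paths you supply yourself:

# smoke_test.py - hypothetical local check of the functions defined in app.py above
import app  # importing app runs load_models() at module level; the UI only launches under __main__

# Text -> cloned speech: each helper returns (output_wav_path, status_message)
out_path, status = app.text_to_voice_clone(
    reference_audio="ref_voice.wav",   # assumed path: short clip of the voice to clone
    input_text="Hello, this is a quick local test of the XTTS-v2 pipeline.",
    language="en",
)
print(status)

# Audio -> cloned speech: Whisper transcribes content.wav, then XTTS-v2 re-voices the text
out_path, status = app.voice_to_voice_clone(
    reference_audio="ref_voice.wav",
    input_audio="content.wav",         # assumed path: audio whose spoken content gets re-voiced
    language="en",
)
print(status)

On a CPU-only Space the first call is slow because both XTTS-v2 and Whisper are loaded at import time; a failure surfaces as a (None, "❌ ...") return rather than an exception, matching the error handling in the diff.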