Spaces:

crackuser
/

voiceclone-dev

Running

App Files Community

crackuser committed on Sep 10, 2025

Commit

930a8ef

verified ·

1 Parent(s): d6ad7c4

Update app.py

Browse files

Files changed (1) hide show

app.py +119 -114

app.py CHANGED Viewed

@@ -19,38 +19,31 @@ else:
 print(f"🚀 Running on device: {DEVICE}")
-# Global models
 ENGLISH_MODEL = None
 MULTILINGUAL_MODEL = None
 def load_chatterbox_models():
-    """Load Chatterbox models with proper error handling"""
     global ENGLISH_MODEL, MULTILINGUAL_MODEL
-    if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
-        try:
-            from chatterbox.tts import ChatterboxTTS
-            from chatterbox.mtl_tts import ChatterboxMultilingualTTS
-            print("🔄 Loading Chatterbox English model...")
-            ENGLISH_MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
-            print("✅ English model loaded!")
-            print("🔄 Loading Chatterbox Multilingual model...")
-            MULTILINGUAL_MODEL = ChatterboxMultilingualTTS.from_pretrained(device=DEVICE)
-            print("✅ Multilingual model loaded!")
-            return True
-        except Exception as e:
-            print(f"❌ Error loading Chatterbox models: {e}")
-            return False
-    return True
 def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
     """
-    Voice-to-Voice Cloning: Transform input audio using reference voice
     """
     try:
         if not reference_audio:
@@ -59,21 +52,27 @@ def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggera
         if not input_audio:
             return None, "❌ Please upload input audio (content to transform)!"
-        if not load_chatterbox_models():
-            return None, "❌ Chatterbox models failed to load!"
-        # Extract text from input audio using Whisper (for content)
         try:
             import whisper
             whisper_model = whisper.load_model("base")
             result = whisper_model.transcribe(input_audio)
             extracted_text = result["text"]
-            print(f"📝 Extracted text from input audio: {extracted_text}")
         except Exception as e:
-            print(f"⚠️ Whisper transcription failed: {e}")
-            extracted_text = "Voice cloning demonstration using the uploaded audio content."
-        # Create output file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
@@ -96,20 +95,21 @@ def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggera
                 cfg=cfg
             )
-        # Save generated audio
         torchaudio.save(output_path, wav.cpu(), model.sr)
         if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-            return output_path, f"✅ Voice-to-Voice Cloning Complete!\n🎤 Reference voice applied to: '{extracted_text[:100]}...'\n🎛️ Settings: Exaggeration={exaggeration}, CFG={cfg}"
         else:
             return None, "❌ Generated audio file is empty!"
     except Exception as e:
         return None, f"❌ Voice-to-Voice cloning error: {str(e)}"
-def text_to_voice_cloning(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5, speed=1.0):
     """
-    Text-to-Voice Cloning: Generate speech from text using reference voice
     """
     try:
         if not reference_audio:
@@ -118,13 +118,16 @@ def text_to_voice_cloning(reference_audio, input_text, language="en", exaggerati
         if not input_text or not input_text.strip():
             return None, "❌ Please enter text to convert!"
-        if not load_chatterbox_models():
-            return None, "❌ Chatterbox models failed to load!"
-        print(f"🎤 Generating speech with Chatterbox...")
-        print(f"📝 Text: {input_text[:100]}...")
-        # Create output file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
@@ -151,60 +154,65 @@ def text_to_voice_cloning(reference_audio, input_text, language="en", exaggerati
         torchaudio.save(output_path, wav.cpu(), model.sr)
         if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-            return output_path, f"✅ Text-to-Voice Cloning Complete!\n📝 Generated: '{input_text[:100]}...'\n🎛️ Settings: Exaggeration={exaggeration}, CFG={cfg}"
         else:
             return None, "❌ Generated audio file is empty!"
     except Exception as e:
-        return None, f"❌ Text-to-Voice cloning error: {str(e)}"
 # Try to load models at startup
 try:
     models_loaded = load_chatterbox_models()
-    startup_message = "✅ Chatterbox Models Loaded Successfully!" if models_loaded else "❌ Failed to Load Chatterbox Models"
 except Exception as e:
     models_loaded = False
-    startup_message = f"❌ Startup Error: {str(e)}"
-# Create Gradio interface
 with gr.Blocks(
-    title="🎭 Complete Chatterbox Voice Cloning Studio",
     theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
 ) as demo:
     # Header
     gr.HTML("""
     <div style="text-align: center; padding: 20px;">
-        <h1 style="color: #8B5CF6; margin-bottom: 10px;">🎭 Complete Chatterbox Voice Cloning Studio</h1>
-        <p style="color: #666; font-size: 18px;">Voice-to-Voice & Text-to-Speech with Emotion Control</p>
-        <p style="color: #888; font-size: 14px;">Powered by Resemble AI's Chatterbox - The Model We Discussed!</p>
     </div>
     """)
     # Model Status
-    status_color = "#d4edda" if models_loaded else "#f8d7da"
     gr.HTML(f"""
-    <div style="text-align: center; padding: 15px; background: {status_color}; border-radius: 10px; margin-bottom: 20px;">
         <strong>🤖 Chatterbox Status:</strong> {startup_message}
     </div>
     """)
-    with gr.Row():
-        with gr.Column():
-            # Reference Voice Section
-            gr.HTML("<h3 style='color: #8B5CF6;'>🎤 Reference Voice (Voice to Clone)</h3>")
-            reference_audio = gr.Audio(
-                label="Upload Reference Audio (5+ seconds)",
-                type="filepath",
-                sources=["upload", "microphone"]
-            )
-            gr.HTML("<p style='color: #666; font-size: 14px;'>📌 This is the voice that will be cloned and applied to your content</p>")
-    # Tabs for different input methods
     with gr.Tabs():
-        # Tab 1: Voice-to-Voice Cloning
         with gr.TabItem("🎵 Voice-to-Voice Cloning"):
-            gr.HTML("<p style='margin-bottom: 15px;'>Upload audio content and transform it using the reference voice</p>")
             with gr.Row():
                 with gr.Column():
@@ -229,7 +237,7 @@ with gr.Blocks(
                                 ("🇷🇺 Russian", "ru")
                             ],
                             value="en",
-                            label="Language"
                         )
                         voice_exaggeration = gr.Slider(
@@ -241,11 +249,11 @@ with gr.Blocks(
                         )
                         voice_cfg = gr.Slider(
-                            minimum=0.2,
                             maximum=1.0,
                             step=0.1,
                             value=0.5,
-                            label="🎛️ CFG Scale"
                         )
                     voice_clone_btn = gr.Button(
@@ -253,17 +261,37 @@ with gr.Blocks(
                         variant="primary",
                         size="lg"
                     )
-        # Tab 2: Text-to-Voice Cloning
         with gr.TabItem("📝 Text-to-Speech Cloning"):
-            gr.HTML("<p style='margin-bottom: 15px;'>Enter text and generate speech using the reference voice</p>")
             with gr.Row():
                 with gr.Column():
                     text_input = gr.Textbox(
                         label="Text to Convert to Speech",
                         placeholder="Enter the text you want to speak in the cloned voice...",
-                        lines=4,
                         max_lines=8
                     )
@@ -280,7 +308,7 @@ with gr.Blocks(
                                 ("🇯🇵 Japanese", "ja")
                             ],
                             value="en",
-                            label="Language"
                         )
                         text_exaggeration = gr.Slider(
@@ -292,11 +320,11 @@ with gr.Blocks(
                         )
                         text_cfg = gr.Slider(
-                            minimum=0.2,
                             maximum=1.0,
                             step=0.1,
                             value=0.5,
-                            label="🎛️ CFG Scale"
                         )
                     text_clone_btn = gr.Button(
@@ -304,69 +332,46 @@ with gr.Blocks(
                         variant="secondary",
                         size="lg"
                     )
-    # Output Section
-    gr.HTML("<h3 style='color: #8B5CF6;'>🎵 Generated Audio Output</h3>")
-    with gr.Row():
-        audio_output = gr.Audio(
-            label="Cloned Voice Result",
-            type="filepath"
-        )
-        status_output = gr.Textbox(
-            label="Processing Status & Details",
-            lines=6,
-            interactive=False
-        )
     # Examples Section
-    with gr.Accordion("💡 Example Texts for Testing", open=False):
         examples = [
-            "Hello, this is a demonstration of real voice cloning technology using Chatterbox.",
             "The weather is beautiful today, perfect for a walk in the park with friends.",
-            "Artificial intelligence is revolutionizing how we create and interact with digital content.",
             "This advanced voice cloning system can generate natural speech in multiple languages."
         ]
         gr.Examples(
             examples=examples,
             inputs=text_input,
-            label="Click to try these example texts:"
         )
-    # How It Works Section
-    with gr.Accordion("🔍 How Voice Cloning Works", open=False):
-        gr.Markdown("""
-        ### Voice-to-Voice Cloning Process
-        1. **🎤 Upload Reference Voice**: The voice you want to clone (5+ seconds)
-        2. **📥 Upload Input Audio**: Audio content you want to transform
-        3. **🧠 Content Extraction**: AI extracts speech content from input audio
-        4. **🎭 Voice Application**: Reference voice characteristics applied to content
-        5. **🎵 Generate Output**: New audio with original content in cloned voice
-        ### Text-to-Speech Process
-        1. **🎤 Upload Reference Voice**: The voice you want to clone
-        2. **📝 Enter Text**: Type the content to convert to speech
-        3. **🎛️ Adjust Controls**: Set emotion and speech parameters
-        4. **🎵 Generate Speech**: Create natural speech in the cloned voice
-        ### Chatterbox Controls
-        - **Emotion Exaggeration**: 0.0 = monotone, 2.0 = very expressive
-        - **CFG Scale**: 0.2 = creative, 1.0 = accurate to reference
-        - **Language Support**: 23+ languages with multilingual model
-        """)
-    # Event Handlers
     voice_clone_btn.click(
         fn=voice_to_voice_cloning,
         inputs=[reference_audio, input_audio, voice_language, voice_exaggeration, voice_cfg],
-        outputs=[audio_output, status_output],
         show_progress=True
     )
     text_clone_btn.click(
         fn=text_to_voice_cloning,
         inputs=[reference_audio, text_input, text_language, text_exaggeration, text_cfg],
-        outputs=[audio_output, status_output],
         show_progress=True
     )

 print(f"🚀 Running on device: {DEVICE}")
+# Global model variables
 ENGLISH_MODEL = None
 MULTILINGUAL_MODEL = None
 def load_chatterbox_models():
+    """Load Chatterbox models"""
     global ENGLISH_MODEL, MULTILINGUAL_MODEL
+    try:
+        from chatterbox import ChatterboxTTS
+        from chatterbox.tts import ChatterboxMultilingualTTS
+        print("🔄 Loading Chatterbox models...")
+        ENGLISH_MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
+        MULTILINGUAL_MODEL = ChatterboxMultilingualTTS.from_pretrained(device=DEVICE)
+        print("✅ Models loaded successfully!")
+        return True
+    except Exception as e:
+        print(f"❌ Failed to load Chatterbox models: {e}")
+        return False
 def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
     """
+    🎤 VOICE-TO-VOICE CLONING FUNCTION
+    Takes input audio content and transforms it using reference voice
     """
     try:
         if not reference_audio:
         if not input_audio:
             return None, "❌ Please upload input audio (content to transform)!"
+        print("🔄 Starting Voice-to-Voice cloning...")
+        # Step 1: Extract text from input audio using Whisper
         try:
             import whisper
+            print("🎤 Transcribing input audio...")
             whisper_model = whisper.load_model("base")
             result = whisper_model.transcribe(input_audio)
             extracted_text = result["text"]
+            print(f"📝 Extracted text: {extracted_text}")
         except Exception as e:
+            print(f"⚠️ Whisper failed: {e}")
+            extracted_text = "Voice cloning demonstration using uploaded audio content."
+        # Step 2: Load Chatterbox models if not loaded
+        if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
+            if not load_chatterbox_models():
+                return None, "❌ Chatterbox models failed to load!"
+        # Step 3: Generate voice using Chatterbox
+        print("🎭 Generating cloned voice...")
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
                 cfg=cfg
             )
+        # Step 4: Save generated audio
         torchaudio.save(output_path, wav.cpu(), model.sr)
         if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+            return output_path, f"✅ Voice-to-Voice Cloning Complete!\n🎤 Transformed audio content: '{extracted_text[:100]}...'\n🎛️ Settings: Emotion={exaggeration}, CFG={cfg}\n📊 Language: {language}"
         else:
             return None, "❌ Generated audio file is empty!"
     except Exception as e:
         return None, f"❌ Voice-to-Voice cloning error: {str(e)}"
+def text_to_voice_cloning(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5):
     """
+    📝 TEXT-TO-VOICE CLONING FUNCTION
+    Generates speech from text using reference voice
     """
     try:
         if not reference_audio:
         if not input_text or not input_text.strip():
             return None, "❌ Please enter text to convert!"
+        print("🔄 Starting Text-to-Voice cloning...")
+        print(f"📝 Text to convert: {input_text}")
+        # Load Chatterbox models if not loaded
+        if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
+            if not load_chatterbox_models():
+                return None, "❌ Chatterbox models failed to load!"
+        # Generate speech using Chatterbox
+        print("🎭 Generating speech...")
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
         torchaudio.save(output_path, wav.cpu(), model.sr)
         if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+            return output_path, f"✅ Text-to-Voice Complete!\n📝 Generated speech: '{input_text[:100]}...'\n🎛️ Settings: Emotion={exaggeration}, CFG={cfg}\n📊 Language: {language}"
         else:
             return None, "❌ Generated audio file is empty!"
     except Exception as e:
+        return None, f"❌ Text-to-Voice error: {str(e)}"
 # Try to load models at startup
 try:
     models_loaded = load_chatterbox_models()
+    startup_message = "✅ Chatterbox Models Ready!" if models_loaded else "⚠️ Models will load on first use"
 except Exception as e:
     models_loaded = False
+    startup_message = f"⚠️ Model loading will be attempted on first use: {str(e)}"
+# Create Gradio interface with tabs
 with gr.Blocks(
+    title="🎭 Complete Voice Cloning Studio",
     theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
 ) as demo:
     # Header
     gr.HTML("""
     <div style="text-align: center; padding: 20px;">
+        <h1 style="color: #8B5CF6; margin-bottom: 10px;">🎭 Complete Voice Cloning Studio</h1>
+        <p style="color: #666; font-size: 18px;">Voice-to-Voice & Text-to-Speech with Chatterbox AI</p>
+        <p style="color: #888; font-size: 14px;">Both functionalities included - Choose your input method below</p>
     </div>
     """)
     # Model Status
     gr.HTML(f"""
+    <div style="text-align: center; padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 20px;">
         <strong>🤖 Chatterbox Status:</strong> {startup_message}
     </div>
     """)
+    # Reference Voice (shared across both tabs)
+    gr.HTML("<h3 style='color: #8B5CF6; text-align: center;'>🎤 Reference Voice (Voice to Clone)</h3>")
+    reference_audio = gr.Audio(
+        label="Upload Reference Audio (5+ seconds of clear speech)",
+        type="filepath",
+        sources=["upload", "microphone"]
+    )
+    gr.HTML("<p style='color: #666; text-align: center; margin-bottom: 20px;'>📌 This voice will be cloned and applied to your content</p>")
+    # Tabs for different input methods
     with gr.Tabs():
+        # TAB 1: VOICE-TO-VOICE CLONING
         with gr.TabItem("🎵 Voice-to-Voice Cloning"):
+            gr.HTML("""
+            <div style="padding: 15px; background: #f0f8ff; border-radius: 10px; margin-bottom: 15px;">
+                <h4 style="color: #4169E1; margin-bottom: 10px;">🎤 Voice-to-Voice Process:</h4>
+                <p style="margin: 0;">1. Upload reference voice (person to clone)<br>
+                2. Upload input audio (content to transform)<br>
+                3. AI extracts speech content from input<br>
+                4. Reference voice applied to extracted content</p>
+            </div>
+            """)
             with gr.Row():
                 with gr.Column():
                                 ("🇷🇺 Russian", "ru")
                             ],
                             value="en",
+                            label="Output Language"
                         )
                         voice_exaggeration = gr.Slider(
                         )
                         voice_cfg = gr.Slider(
+                            minimum=0.1,
                             maximum=1.0,
                             step=0.1,
                             value=0.5,
+                            label="🎛️ CFG Scale (Accuracy)"
                         )
                     voice_clone_btn = gr.Button(
                         variant="primary",
                         size="lg"
                     )
+                with gr.Column():
+                    voice_output_audio = gr.Audio(
+                        label="Voice-to-Voice Result",
+                        type="filepath"
+                    )
+                    voice_status = gr.Textbox(
+                        label="Voice-to-Voice Status",
+                        lines=6,
+                        interactive=False
+                    )
+        # TAB 2: TEXT-TO-VOICE CLONING
         with gr.TabItem("📝 Text-to-Speech Cloning"):
+            gr.HTML("""
+            <div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
+                <h4 style="color: #228B22; margin-bottom: 10px;">📝 Text-to-Speech Process:</h4>
+                <p style="margin: 0;">1. Upload reference voice (person to clone)<br>
+                2. Enter text to convert to speech<br>
+                3. AI generates speech in cloned voice<br>
+                4. Download high-quality audio result</p>
+            </div>
+            """)
             with gr.Row():
                 with gr.Column():
                     text_input = gr.Textbox(
                         label="Text to Convert to Speech",
                         placeholder="Enter the text you want to speak in the cloned voice...",
+                        lines=5,
                         max_lines=8
                     )
                                 ("🇯🇵 Japanese", "ja")
                             ],
                             value="en",
+                            label="Speech Language"
                         )
                         text_exaggeration = gr.Slider(
                         )
                         text_cfg = gr.Slider(
+                            minimum=0.1,
                             maximum=1.0,
                             step=0.1,
                             value=0.5,
+                            label="🎛️ CFG Scale (Accuracy)"
                         )
                     text_clone_btn = gr.Button(
                         variant="secondary",
                         size="lg"
                     )
+                with gr.Column():
+                    text_output_audio = gr.Audio(
+                        label="Text-to-Speech Result",
+                        type="filepath"
+                    )
+                    text_status = gr.Textbox(
+                        label="Text-to-Speech Status",
+                        lines=6,
+                        interactive=False
+                    )
     # Examples Section
+    with gr.Accordion("💡 Example Texts", open=False):
         examples = [
+            "Hello, this is a demonstration of AI voice cloning technology using Chatterbox.",
             "The weather is beautiful today, perfect for a walk in the park with friends.",
+            "Artificial intelligence is revolutionizing the way we create and share content.",
             "This advanced voice cloning system can generate natural speech in multiple languages."
         ]
         gr.Examples(
             examples=examples,
             inputs=text_input,
+            label="Click to use these example texts:"
         )
+    # Event Handlers - BOTH FUNCTIONS CONNECTED
     voice_clone_btn.click(
         fn=voice_to_voice_cloning,
         inputs=[reference_audio, input_audio, voice_language, voice_exaggeration, voice_cfg],
+        outputs=[voice_output_audio, voice_status],
         show_progress=True
     )
     text_clone_btn.click(
         fn=text_to_voice_cloning,
         inputs=[reference_audio, text_input, text_language, text_exaggeration, text_cfg],
+        outputs=[text_output_audio, text_status],
         show_progress=True
     )