codewithjarair commited on
Commit
d996a8f
·
verified ·
1 Parent(s): 621e6b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +205 -308
app.py CHANGED
@@ -1,339 +1,236 @@
1
- import gradio as gr
2
  import os
3
- from voice_cloning_engine import VoiceCloningEngine
 
 
 
 
 
4
  import tempfile
5
- from pathlib import Path
6
-
7
 
8
- # Initialize the voice cloning engine
9
- print("Initializing Voice Cloning Engine...")
10
- engine = VoiceCloningEngine()
11
- print("Engine ready!")
12
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- def generate_voice(
15
- text: str,
16
- reference_audio,
17
- exaggeration: float,
18
- cfg: float,
19
- seed: int,
20
- max_words_per_chunk: int,
21
- use_seed: bool,
22
- language: str
23
- ):
24
  """
25
- Generate cloned voice audio using Chatterbox
26
-
27
- Args:
28
- text: Text to synthesize
29
- reference_audio: Uploaded reference audio file
30
- exaggeration: Emotion exaggeration (0.0-1.0+)
31
- cfg: Classifier-Free Guidance weight (0.0-1.0)
32
- seed: Random seed
33
- max_words_per_chunk: Max words per chunk
34
- use_seed: Whether to use the seed value
35
- language: Language code for multilingual model
36
-
37
- Returns:
38
- Tuple of (audio_path, info_text)
39
  """
40
- try:
41
- # Validate inputs
42
- if not text or text.strip() == "":
43
- return None, "❌ Error: Please enter some text to synthesize."
44
-
45
- if reference_audio is None:
46
- return None, "❌ Error: Please upload a reference audio file."
47
-
48
- # Get reference audio path
49
- if isinstance(reference_audio, str):
50
- ref_audio_path = reference_audio
51
  else:
52
- ref_audio_path = reference_audio.name if hasattr(reference_audio, 'name') else reference_audio
53
-
54
- # Word count
55
- word_count = len(text.split())
56
-
57
- # Prepare seed
58
- actual_seed = seed if use_seed else None
59
-
60
- # Generate status message
61
- status = f"🎙️ **Generating speech with Chatterbox...**\n\n"
62
- status += f"📝 Words: {word_count}\n"
63
-
64
- if word_count > max_words_per_chunk:
65
- num_chunks = (word_count // max_words_per_chunk) + 1
66
- status += f"📦 Will be split into ~{num_chunks} chunks\n"
67
-
68
- status += f"🎭 Exaggeration: {exaggeration}\n"
69
- status += f"🎚️ CFG: {cfg}\n"
70
- status += f"🎲 Seed: {seed if use_seed else 'Random'}\n"
71
- status += f"🌍 Language: {language.upper()}\n"
72
-
73
- print(status)
74
-
75
- # Generate audio
76
- output_path = engine.generate_speech(
77
- text=text,
78
- reference_audio_path=ref_audio_path,
79
- exaggeration=exaggeration,
80
- cfg=cfg,
81
- seed=actual_seed,
82
- max_words_per_chunk=max_words_per_chunk,
83
- language=language
84
- )
85
-
86
- # Get duration
87
- duration = engine.get_audio_duration(output_path)
88
-
89
- # Success message
90
- success_msg = f"✅ **Generation Complete!**\n\n"
91
- success_msg += f"📊 Audio Duration: {duration:.2f} seconds\n"
92
- success_msg += f"📝 Words Synthesized: {word_count}\n"
93
- success_msg += f"⚡ Speed: {word_count/duration:.1f} words/second\n"
94
- success_msg += f"\n💧 *Audio includes Perth watermark for authentication*"
95
-
96
- return output_path, success_msg
97
 
98
- except Exception as e:
99
- error_msg = f"❌ **Error during generation:**\n\n{str(e)}"
100
- print(error_msg)
101
- return None, error_msg
102
-
103
-
104
- def update_seed_visibility(use_seed):
105
- """Toggle seed input visibility"""
106
- return gr.update(visible=use_seed)
107
 
 
 
 
 
 
 
 
 
 
108
 
109
- def estimate_chunks(text, max_words):
110
- """Estimate number of chunks for given text"""
111
- if not text:
112
- return "📦 Chunks: 0"
 
 
 
 
 
113
 
114
- word_count = len(text.split())
115
- if word_count <= max_words:
116
- return f"📦 Chunks: 1 (Text is within limit)"
117
- else:
118
- num_chunks = (word_count // max_words) + 1
119
- return f"📦 Estimated Chunks: {num_chunks}"
120
-
121
-
122
- # Create Gradio interface
123
- with gr.Blocks(
124
- title="🎙️ Chatterbox TTS Voice Cloning",
125
- theme=gr.themes.Soft(
126
- primary_hue="blue",
127
- secondary_hue="slate",
128
- )
129
- ) as app:
130
 
131
- gr.Markdown(
132
- """
133
- # 🎙️ Resemble AI Chatterbox Voice Cloning
134
-
135
- Clone any voice by providing a reference audio sample! Powered by **Chatterbox Turbo** - the state-of-the-art, open-source TTS model.
136
 
137
- ### 📋 Instructions:
138
- 1. **Upload** a reference audio file (10+ seconds recommended, WAV preferred)
139
- 2. **Enter** the text you want to synthesize
140
- 3. **Adjust** emotion exaggeration and CFG if needed (optional)
141
- 4. **Click** Generate to create the cloned voice
142
-
143
- ### ✨ Special Features:
144
- - 🎭 **Paralinguistic Tags**: Use [laugh], [chuckle], [cough], [sigh] in your text
145
- - 🎚️ **Emotion Control**: Adjust exaggeration from monotone to expressive
146
- - ⚡ **Auto-chunking**: Long texts automatically split for better quality
147
- - 💧 **Perth Watermark**: All outputs include imperceptible authentication watermark
148
 
149
- **Note**: Outperforms ElevenLabs in blind evaluations • MIT Licensed • Open Source
150
- """
151
- )
152
 
153
- with gr.Row():
154
- with gr.Column(scale=1):
155
- # Reference Audio Upload
156
- gr.Markdown("### 🎵 Reference Audio")
157
- reference_audio = gr.Audio(
158
- label="Upload Reference Audio",
159
- type="filepath",
160
- sources=["upload", "microphone"]
161
- )
162
 
163
- gr.Markdown(
164
- """
165
- 💡 **Tip**: Use clear audio (10-30 seconds) with minimal background noise for best results.
166
- """
 
 
 
 
167
  )
168
 
169
- # Text Input
170
- gr.Markdown("### 📝 Text to Synthesize")
171
- text_input = gr.Textbox(
172
- label="Enter Text",
173
- placeholder="Type or paste the text you want to convert to speech...",
174
- lines=8,
175
- max_lines=20
176
- )
177
 
178
- chunk_estimate = gr.Markdown("📦 Chunks: 0")
179
 
180
- with gr.Column(scale=1):
181
- # Parameters
182
- gr.Markdown("### ⚙️ Generation Parameters")
183
 
184
- with gr.Accordion("🎛️ Basic Settings", open=True):
185
- exaggeration = gr.Slider(
186
- minimum=0.0,
187
- maximum=1.5,
188
- value=0.5,
189
- step=0.1,
190
- label="Emotion Exaggeration",
191
- info="Controls emotion intensity (0.0=monotone, 0.5=natural, 1.0+=expressive)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  )
193
-
194
- cfg = gr.Slider(
195
- minimum=0.0,
196
- maximum=1.0,
197
- value=0.5,
198
- step=0.1,
199
- label="CFG (Classifier-Free Guidance)",
200
- info="Lower values for faster speech, higher for more deliberate pacing"
201
  )
202
 
203
- max_words_per_chunk = gr.Slider(
204
- minimum=50,
205
- maximum=500,
206
- value=300,
207
- step=50,
208
- label="Max Words Per Chunk",
209
- info="Texts longer than this will be auto-chunked"
210
- )
211
-
212
- with gr.Accordion("🔧 Advanced Settings", open=False):
213
- language = gr.Dropdown(
214
- choices=[
215
- "en", "es", "fr", "de", "it", "pt", "ru", "zh",
216
- "ja", "ko", "ar", "hi", "nl", "pl", "tr", "sv",
217
- "no", "da", "fi", "el", "he", "ms", "sw"
218
- ],
219
- value="en",
220
- label="Language",
221
- info="For multilingual model (English by default)"
222
- )
223
 
224
- use_seed = gr.Checkbox(
225
- label="Use Fixed Seed (for reproducibility)",
226
- value=False
227
- )
 
 
 
 
 
 
 
 
228
 
229
- seed_input = gr.Number(
230
- label="Random Seed",
231
- value=42,
232
- precision=0,
233
- visible=False
234
- )
235
-
236
- # Generate Button
237
- generate_btn = gr.Button(
238
- "🎙️ Generate Voice",
239
- variant="primary",
240
- size="lg"
241
- )
242
-
243
- # Output
244
- gr.Markdown("### 🔊 Generated Audio")
245
- output_audio = gr.Audio(
246
- label="Generated Speech",
247
- type="filepath"
248
- )
249
-
250
- output_info = gr.Markdown("")
251
-
252
- # Examples
253
- gr.Markdown("### 📚 Example Texts")
254
- gr.Examples(
255
- examples=[
256
- ["Hello! This is a test of the Chatterbox voice cloning system. I hope it sounds natural and clear."],
257
- ["Hi there [chuckle], thanks for calling! How can I help you today?"],
258
- ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet."],
259
- ["Artificial intelligence has made remarkable progress in recent years, particularly in the field of natural language processing and speech synthesis."],
260
- ["Once upon a time, in a land far away, there lived a curious inventor who dreamed of creating machines that could speak with human voices."],
261
- ["Oh wow [laugh], that's amazing! I can't believe it actually works this well [chuckle]."]
262
- ],
263
- inputs=text_input,
264
- label="Click to load example text (Turbo model supports [laugh], [chuckle], [cough], etc.)"
265
- )
266
-
267
- # Event handlers
268
- use_seed.change(
269
- fn=update_seed_visibility,
270
- inputs=use_seed,
271
- outputs=seed_input
272
- )
273
-
274
- text_input.change(
275
- fn=estimate_chunks,
276
- inputs=[text_input, max_words_per_chunk],
277
- outputs=chunk_estimate
278
- )
279
-
280
- max_words_per_chunk.change(
281
- fn=estimate_chunks,
282
- inputs=[text_input, max_words_per_chunk],
283
- outputs=chunk_estimate
284
- )
285
-
286
- generate_btn.click(
287
- fn=generate_voice,
288
- inputs=[
289
- text_input,
290
- reference_audio,
291
- exaggeration,
292
- cfg,
293
- seed_input,
294
- max_words_per_chunk,
295
- use_seed,
296
- language
297
- ],
298
- outputs=[output_audio, output_info]
299
- )
300
-
301
- gr.Markdown(
302
- """
303
- ---
304
- ### ℹ️ About Chatterbox
305
-
306
- This app uses **Resemble AI's Chatterbox Turbo** - the fastest open-source TTS model. It automatically handles:
307
- - ✅ Voice cloning with just 5-30 seconds of audio
308
- - ✅ Text chunking for long inputs (auto-concatenation)
309
- - ✅ Emotion exaggeration control (unique to Chatterbox)
310
- - ✅ Paralinguistic tags: [laugh], [chuckle], [cough], [sigh]
311
- - ✅ Perth watermarking for audio authentication
312
-
313
- **Models Available**:
314
- - 🚀 **Turbo**: Fastest, supports paralinguistic tags
315
- - 🎯 **Standard**: High quality with emotion control
316
- - 🌍 **Multilingual**: 23 languages supported
317
-
318
- **Source**: [GitHub - Resemble AI Chatterbox](https://github.com/resemble-ai/chatterbox)
319
 
320
- 💡 **Tips for best results**:
321
- - Use 10-30 seconds of clear reference audio
322
- - WAV format at 24kHz+ recommended
323
- - Single speaker, minimal background noise
324
- - Try exaggeration=0.7+ for more expressive output
325
- - Lower CFG (~0.3) for faster speaking pace
326
- - Use paralinguistic tags like [chuckle] for reactions
327
 
328
- 🏆 **Consistently outperforms ElevenLabs** in blind evaluations ([Podonos testing](https://www.resemble.ai/chatterbox/))
329
- """
330
- )
331
-
332
 
333
- # Launch the app
334
  if __name__ == "__main__":
335
- app.launch(
336
- server_name="0.0.0.0",
337
- server_port=7860,
338
- share=False
339
- )
 
 
1
  import os
2
+ import random
3
+ import numpy as np
4
+ import torch
5
+ import torchaudio
6
+ import gradio as gr
7
+ import re
8
  import tempfile
9
+ from chatterbox.tts import ChatterboxTTS
 
10
 
11
# Set device: prefer CUDA when a GPU is available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
13
 
14
def set_seed(seed: int) -> int:
    """Seed every RNG (torch, CUDA, random, numpy) for reproducibility.

    Args:
        seed: Seed value; 0 means "pick a fresh random seed".

    Returns:
        The seed that was actually applied (useful when 0 was passed).
    """
    if seed == 0:
        seed = random.randint(1, 1000000)
    torch.manual_seed(seed)
    # Only touch the CUDA RNGs when a GPU is actually present; this avoids
    # needless CUDA context initialization / warnings on CPU-only hosts.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
    return seed
24
 
25
def split_text(text, max_chars=250):
    """
    Intelligent text chunking with sentence boundary detection.

    Greedily packs whole sentences into chunks of at most ``max_chars``
    characters. A sentence that is itself longer than the limit is broken
    up on commas / whitespace as a fallback.
    """
    pieces = []
    pending = ""

    # Sentence boundaries: terminal punctuation ([.!?]) followed by whitespace.
    for sentence in re.split(r'(?<=[.!?])\s+', text.strip()):
        if len(pending) + len(sentence) <= max_chars:
            # The sentence still fits into the chunk under construction.
            pending += sentence + " "
            continue

        # Flush whatever was accumulated so far.
        if pending:
            pieces.append(pending.strip())

        if len(sentence) <= max_chars:
            # Start a new chunk with this sentence.
            pending = sentence + " "
            continue

        # Oversized sentence: split further on commas or plain spaces.
        fragment = ""
        for word in re.split(r'(?<=,)\s+|\s+', sentence):
            if len(fragment) + len(word) <= max_chars:
                fragment += word + " "
            else:
                if fragment:
                    pieces.append(fragment.strip())
                fragment = word + " "
        # Leftover fragment becomes the chunk under construction.
        pending = fragment

    if pending:
        pieces.append(pending.strip())

    return pieces
 
 
 
 
 
 
 
 
62
 
63
def load_model():
    """Load the Chatterbox TTS model onto DEVICE; return None on failure."""
    try:
        print(f"Loading Chatterbox TTS model on {DEVICE}...")
        return ChatterboxTTS.from_pretrained(DEVICE)
    except Exception as e:
        # Loading can fail (missing weights, no GPU, bad env) — report and
        # signal failure to the caller instead of crashing the UI.
        print(f"Error loading model: {e}")
        return None
72
 
73
def generate_tts(model, text, ref_audio, exaggeration, cfg_weight, temperature, seed, progress=gr.Progress()):
    """
    Generate TTS audio from text, handling long scripts via chunking.

    Args:
        model: Loaded ChatterboxTTS instance, or None (a load is attempted).
        text: Script to synthesize.
        ref_audio: Filepath of the reference audio for voice cloning.
        exaggeration: Voice-trait exaggeration passed through to the model.
        cfg_weight: Classifier-free-guidance / pace weight.
        temperature: Sampling temperature.
        seed: RNG seed; 0 selects a random seed.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Tuple of (output_wav_path_or_None, status_message).
    """
    if model is None:
        # Try to load if not already loaded (for HF Spaces persistence).
        # NOTE(review): the reloaded model stays local to this call — it is
        # never written back to the gr.State, so a failed startup load means
        # a reload on every click. Confirm whether that is intended.
        model = load_model()
        if model is None:
            return None, "Error: Model could not be loaded. Check your environment/GPU."

    # Guard against None as well as empty/whitespace-only text
    # (text.strip() on None would raise AttributeError).
    if not text or not text.strip():
        return None, "Error: Please enter some text."

    if ref_audio is None:
        return None, "Error: Please upload a reference audio file for voice cloning."

    # Seed every RNG so a fixed seed reproduces the same output.
    actual_seed = set_seed(int(seed))

    # Split the script into sentence-aligned chunks the model can handle.
    chunks = split_text(text)
    total_chunks = len(chunks)

    if total_chunks == 0:
        return None, "Error: No valid text to process."

    all_wavs = []

    try:
        for i, chunk in enumerate(chunks):
            progress((i / total_chunks), desc=f"Processing chunk {i+1}/{total_chunks}")

            # Generate audio for this chunk.
            # Chatterbox.generate expects: text, audio_prompt_path,
            # exaggeration, temperature, cfg_weight, etc.
            wav = model.generate(
                chunk,
                audio_prompt_path=ref_audio,
                exaggeration=exaggeration,
                temperature=temperature,
                cfg_weight=cfg_weight
            )

            # wav is usually a torch tensor [1, T] or [T];
            # normalize to [1, T] so all chunks concatenate cleanly.
            if wav.dim() == 1:
                wav = wav.unsqueeze(0)

            all_wavs.append(wav.cpu())

        if not all_wavs:
            return None, "Error: No audio was generated."

        # Concatenate all audio chunks along the time dimension (last dim).
        final_wav = torch.cat(all_wavs, dim=-1)

        # Create the temp file and close its handle BEFORE torchaudio writes
        # to the path: writing through a second handle while the first is
        # still open fails on Windows. delete=False keeps the file around
        # for Gradio to serve.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        torchaudio.save(output_path, final_wav, model.sr)

        return output_path, f"Successfully generated audio with seed {actual_seed}. Total chunks: {total_chunks}."

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Error during generation: {str(e)}"
138
+
139
+ # Define the Gradio Interface
140
# Define the Gradio Interface
def create_ui():
    """Build and return the Gradio Blocks UI for the voice-cloning app."""
    # Model is loaded once and stored in state.
    # NOTE(review): this gr.State is instantiated *outside* the Blocks
    # context below — confirm the installed Gradio version registers it
    # correctly when used as an event input.
    model_state = gr.State(None)

    with gr.Blocks(theme=gr.themes.Soft(), title="Chatterbox Voice Clone TTS") as demo:
        gr.Markdown("# 🗣️ Voice Cloning TTS Chatterbox")
        gr.Markdown("""
        Clone any voice using a short reference audio clip. This application is optimized for long scripts
        through intelligent sentence-based chunking and sequential processing.
        """)

        with gr.Row():
            # Left column: inputs (script, reference audio, parameters).
            with gr.Column(scale=1):
                text_input = gr.Textbox(
                    label="Script",
                    placeholder="Enter your long script here. The app will automatically handle chunking...",
                    lines=10,
                    value="Welcome to the Chatterbox voice cloning application. This tool allows you to generate high-quality speech from long scripts by automatically splitting them into manageable segments. Simply upload a reference audio clip of the voice you want to clone, and adjust the parameters to your liking."
                )
                ref_audio = gr.Audio(
                    label="Reference Audio (Voice to Clone)",
                    type="filepath",
                    sources=["upload", "microphone"]
                )

                with gr.Row():
                    exaggeration = gr.Slider(
                        0.1, 1.0, value=0.5, step=0.05,
                        label="Exaggeration",
                        info="Default 0.5. Extreme values (>0.8) may be unstable."
                    )
                    cfg_weight = gr.Slider(
                        0.0, 1.0, value=0.5, step=0.05,
                        label="CFG/Pace",
                        info="Control the pace and guidance scale."
                    )

                with gr.Accordion("Advanced Options", open=False):
                    seed = gr.Number(
                        label="Seed",
                        value=0,
                        precision=0,
                        info="Set to 0 for random seed each time."
                    )
                    temperature = gr.Slider(
                        0.1, 2.0, value=1.0, step=0.05,
                        label="Temperature",
                        info="Higher values increase randomness and expressiveness."
                    )

                generate_btn = gr.Button("Generate Audio", variant="primary")

            # Right column: generated audio, status, and usage documentation.
            with gr.Column(scale=1):
                audio_output = gr.Audio(label="Generated Speech", type="filepath")
                status_msg = gr.Textbox(label="Status", interactive=False)

                gr.Markdown("### 📖 Documentation")
                gr.Markdown("""
                ### Features
                - **Voice Cloning**: Provide a clear 5-10 second reference clip.
                - **Intelligent Chunking**: Scripts are split at sentence boundaries (approx. 250 chars) to ensure smooth transitions and avoid memory issues.
                - **Sequential Processing**: Audio chunks are generated one-by-one and concatenated for long-form content.
                - **Parameter Control**:
                - **Exaggeration**: Intensity of cloned voice traits.
                - **CFG/Pace**: Balance between text adherence and reference voice speed.
                - **Temperature**: Randomness of the output.

                ### Tips
                - Use a high-quality, noise-free reference audio for best results.
                - For dramatic speech, try higher **Exaggeration** and lower **CFG**.
                - If the output sounds unnatural, try a different **Seed** or adjust **Temperature**.
                """)

        # Event handling: wire the button to the synthesis function.
        generate_btn.click(
            fn=generate_tts,
            inputs=[
                model_state,
                text_input,
                ref_audio,
                exaggeration,
                cfg_weight,
                temperature,
                seed
            ],
            outputs=[audio_output, status_msg]
        )

        # Load model on startup.
        # NOTE(review): demo.load fires on every page load / session, so the
        # model is reloaded per visitor — consider loading once at module
        # level if that proves too heavy.
        demo.load(fn=load_model, outputs=model_state)

    return demo
 
 
 
232
 
 
233
# Script entry point: build the UI and launch the server.
if __name__ == "__main__":
    demo_app = create_ui()
    # Bind to all interfaces so the app is reachable from inside containers
    # (Docker / Hugging Face Spaces deployments).
    demo_app.launch(server_name="0.0.0.0")