Spaces:

codewithjarair
/

Chatterbox_tts

Running

App Files Community

codewithjarair committed 1 day ago

Commit

c26c2ec

verified ·

1 Parent(s): 8b9660d

Update app.py

Browse files

Files changed (1) hide show

app.py +126 -202

app.py CHANGED Viewed

@@ -1,110 +1,99 @@
 import os
 import random
-import numpy as np
 import torch
 import torchaudio
 import gradio as gr
-import re
-import tempfile
 from chatterbox.tts import ChatterboxTTS
-# Set device
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-def set_seed(seed: int):
-    """Set random seed for reproducibility."""
-    if seed == 0:
-        seed = random.randint(1, 1000000)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
-    random.seed(seed)
-    np.random.seed(seed)
-    return seed
-def split_text(text, max_chars=250):
     """
-    Intelligent text chunking with sentence boundary detection.
-    Splits text into chunks of approximately max_chars, trying to stay on sentence boundaries.
     """
-    # Simple sentence boundary detection using regex
-    # Split by periods, question marks, and exclamation marks followed by whitespace
-    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
-    chunks = []
-    current_chunk = ""
-    for sentence in sentences:
-        if len(current_chunk) + len(sentence) <= max_chars:
-            current_chunk += (sentence + " ")
-        else:
-            if current_chunk:
-                chunks.append(current_chunk.strip())
-            # If a single sentence is longer than max_chars, we have to split it
-            if len(sentence) > max_chars:
-                # Further split long sentences by commas or spaces as fallback
-                sub_parts = re.split(r'(?<=,)\s+|\s+', sentence)
-                temp_chunk = ""
-                for part in sub_parts:
-                    if len(temp_chunk) + len(part) <= max_chars:
-                        temp_chunk += (part + " ")
-                    else:
-                        if temp_chunk:
-                            chunks.append(temp_chunk.strip())
-                        temp_chunk = part + " "
-                current_chunk = temp_chunk
-            else:
-                current_chunk = sentence + " "
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-    return chunks
-def load_model():
-    """Load the Chatterbox TTS model."""
-    try:
-        print(f"Loading Chatterbox TTS model on {DEVICE}...")
-        model = ChatterboxTTS.from_pretrained(DEVICE)
-        return model
-    except Exception as e:
-        print(f"Error loading model: {e}")
-        return None
-def generate_tts(model, text, ref_audio, exaggeration, cfg_weight, temperature, seed, progress=gr.Progress()):
-    """
-    Generate TTS audio from text, handling long scripts via chunking.
-    """
-    if model is None:
-        # Try to load if not already loaded (for HF Spaces persistence)
-        model = load_model()
-        if model is None:
-            return None, "Error: Model could not be loaded. Check your environment/GPU."
-    if not text.strip():
-        return None, "Error: Please enter some text."
-    if ref_audio is None:
-        return None, "Error: Please upload a reference audio file for voice cloning."
-    # Set seed
-    actual_seed = set_seed(int(seed))
-    # Chunk the text
-    chunks = split_text(text)
-    total_chunks = len(chunks)
-    if total_chunks == 0:
-        return None, "Error: No valid text to process."
-    all_wavs = []
-    try:
         for i, chunk in enumerate(chunks):
-            progress((i / total_chunks), desc=f"Processing chunk {i+1}/{total_chunks}")
-            # Generate audio for this chunk
-            # Chatterbox.generate expects: text, audio_prompt_path, exaggeration, temperature, cfg_weight, etc.
-            wav = model.generate(
                 chunk,
                 audio_prompt_path=ref_audio,
                 exaggeration=exaggeration,
@@ -112,125 +101,60 @@ def generate_tts(model, text, ref_audio, exaggeration, cfg_weight, temperature,
                 cfg_weight=cfg_weight
             )
-            # wav is usually a torch tensor [1, T] or [T]
             if wav.dim() == 1:
                 wav = wav.unsqueeze(0)
             all_wavs.append(wav.cpu())
-        # Concatenate all audio chunks along the time dimension (last dim)
-        if not all_wavs:
-            return None, "Error: No audio was generated."
         final_wav = torch.cat(all_wavs, dim=-1)
-        # Save to a temporary file
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-            output_path = tmp_file.name
-            torchaudio.save(output_path, final_wav, model.sr)
-        return output_path, f"Successfully generated audio with seed {actual_seed}. Total chunks: {total_chunks}."
     except Exception as e:
-        import traceback
-        traceback.print_exc()
-        return None, f"Error during generation: {str(e)}"
-# Define the Gradio Interface
-def create_ui():
-    # Model is loaded once and stored in state
-    model_state = gr.State(None)
-    with gr.Blocks(theme=gr.themes.Soft(), title="Chatterbox Voice Clone TTS") as demo:
-        gr.Markdown("# 🗣️ Voice Cloning TTS Chatterbox")
-        gr.Markdown("""
-        Clone any voice using a short reference audio clip. This application is optimized for long scripts
-        through intelligent sentence-based chunking and sequential processing.
-        """)
-        with gr.Row():
-            with gr.Column(scale=1):
-                text_input = gr.Textbox(
-                    label="Script",
-                    placeholder="Enter your long script here. The app will automatically handle chunking...",
-                    lines=10,
-                    value="Welcome to the Chatterbox voice cloning application. This tool allows you to generate high-quality speech from long scripts by automatically splitting them into manageable segments. Simply upload a reference audio clip of the voice you want to clone, and adjust the parameters to your liking."
-                )
-                ref_audio = gr.Audio(
-                    label="Reference Audio (Voice to Clone)",
-                    type="filepath",
-                    sources=["upload", "microphone"]
-                )
-                with gr.Row():
-                    exaggeration = gr.Slider(
-                        0.1, 1.0, value=0.5, step=0.05,
-                        label="Exaggeration",
-                        info="Default 0.5. Extreme values (>0.8) may be unstable."
-                    )
-                    cfg_weight = gr.Slider(
-                        0.0, 1.0, value=0.5, step=0.05,
-                        label="CFG/Pace",
-                        info="Control the pace and guidance scale."
-                    )
-                with gr.Accordion("Advanced Options", open=False):
-                    seed = gr.Number(
-                        label="Seed",
-                        value=0,
-                        precision=0,
-                        info="Set to 0 for random seed each time."
-                    )
-                    temperature = gr.Slider(
-                        0.1, 2.0, value=1.0, step=0.05,
-                        label="Temperature",
-                        info="Higher values increase randomness and expressiveness."
-                    )
-                generate_btn = gr.Button("Generate Audio", variant="primary")
-            with gr.Column(scale=1):
-                audio_output = gr.Audio(label="Generated Speech", type="filepath")
-                status_msg = gr.Textbox(label="Status", interactive=False)
-                gr.Markdown("### 📖 Documentation")
-                gr.Markdown("""
-                ### Features
-                - **Voice Cloning**: Provide a clear 5-10 second reference clip.
-                - **Intelligent Chunking**: Scripts are split at sentence boundaries (approx. 250 chars) to ensure smooth transitions and avoid memory issues.
-                - **Sequential Processing**: Audio chunks are generated one-by-one and concatenated for long-form content.
-                - **Parameter Control**:
-                  - **Exaggeration**: Intensity of cloned voice traits.
-                  - **CFG/Pace**: Balance between text adherence and reference voice speed.
-                  - **Temperature**: Randomness of the output.
-                ### Tips
-                - Use a high-quality, noise-free reference audio for best results.
-                - For dramatic speech, try higher **Exaggeration** and lower **CFG**.
-                - If the output sounds unnatural, try a different **Seed** or adjust **Temperature**.
-                """)
-        # Event handling
-        generate_btn.click(
-            fn=generate_tts,
-            inputs=[
-                model_state,
-                text_input,
-                ref_audio,
-                exaggeration,
-                cfg_weight,
-                temperature,
-                seed
-            ],
-            outputs=[audio_output, status_msg]
-        )
-        # Load model on startup
-        demo.load(fn=load_model, outputs=model_state)
-    return demo
 if __name__ == "__main__":
-    ui = create_ui()
-    # Use server_name="0.0.0.0" for deployment compatibility
-    ui.launch(server_name="0.0.0.0")

 import os
 import random
+import re
+import tempfile
 import torch
 import torchaudio
+import numpy as np
 import gradio as gr
 from chatterbox.tts import ChatterboxTTS
+# Constants
+MAX_CHUNK_CHARS = 250
+DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+class VoiceCloningEngine:
     """
+    A dedicated engine to handle Chatterbox TTS operations including
+    model management, text chunking, and audio generation.
     """
+    def __init__(self, device=DEFAULT_DEVICE):
+        self.device = device
+        self.model = None
+        self.sr = 24000 # Default Chatterbox SR
+    def load_model(self):
+        """Loads the model into memory if not already present."""
+        if self.model is None:
+            print(f"Loading Chatterbox TTS on {self.device}...")
+            self.model = ChatterboxTTS.from_pretrained(self.device)
+            self.sr = self.model.sr
+        return self.model
+    def set_seed(self, seed: int):
+        """Sets deterministic seeds for reproducibility."""
+        if seed == 0:
+            seed = random.randint(1, 1000000)
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+        random.seed(seed)
+        np.random.seed(seed)
+        return seed
+    def chunk_text(self, text):
+        """
+        Splits text into chunks at sentence boundaries for long script handling.
+        """
+        # Split by punctuation followed by space
+        sentences = re.split(r'(?<=[.!?])\s+', text.strip())
+        chunks = []
+        current_chunk = ""
+        for sentence in sentences:
+            if len(current_chunk) + len(sentence) <= MAX_CHUNK_CHARS:
+                current_chunk += (sentence + " ")
+            else:
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+                # Handle single sentences longer than MAX_CHUNK_CHARS
+                if len(sentence) > MAX_CHUNK_CHARS:
+                    sub_parts = re.split(r'(?<=,)\s+|\s+', sentence)
+                    temp = ""
+                    for part in sub_parts:
+                        if len(temp) + len(part) <= MAX_CHUNK_CHARS:
+                            temp += (part + " ")
+                        else:
+                            if temp: chunks.append(temp.strip())
+                            temp = part + " "
+                    current_chunk = temp
+                else:
+                    current_chunk = sentence + " "
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+        return chunks
+    def generate(self, text, ref_audio, exaggeration, cfg_weight, temperature, seed, progress=None):
+        """
+        Processes the full script by chunking and concatenating results.
+        """
+        self.load_model()
+        actual_seed = self.set_seed(int(seed))
+        chunks = self.chunk_text(text)
+        if not chunks:
+            raise ValueError("No valid text provided.")
+        if ref_audio is None:
+            raise ValueError("Reference audio is required for cloning.")
+        all_wavs = []
         for i, chunk in enumerate(chunks):
+            if progress:
+                progress((i / len(chunks)), desc=f"Processing chunk {i+1}/{len(chunks)}")
+            wav = self.model.generate(
                 chunk,
                 audio_prompt_path=ref_audio,
                 exaggeration=exaggeration,
                 cfg_weight=cfg_weight
             )
             if wav.dim() == 1:
                 wav = wav.unsqueeze(0)
             all_wavs.append(wav.cpu())
         final_wav = torch.cat(all_wavs, dim=-1)
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            output_path = tmp.name
+            torchaudio.save(output_path, final_wav, self.sr)
+        return output_path, actual_seed
+# Initialize the engine
+engine = VoiceCloningEngine()
+def process_tts(text, ref_audio, exaggeration, cfg_weight, temperature, seed, progress=gr.Progress()):
+    try:
+        path, used_seed = engine.generate(text, ref_audio, exaggeration, cfg_weight, temperature, seed, progress)
+        return path, f"Success! Seed used: {used_seed}"
     except Exception as e:
+        return None, f"Error: {str(e)}"
+# UI Construction
+with gr.Blocks(theme=gr.themes.Soft(), title="Chatterbox Voice Clone") as demo:
+    gr.Markdown("# 🗣️ Voice Cloning TTS Engine")
+    gr.Markdown("Optimized for long scripts with intelligent chunking and smooth concatenation.")
+    with gr.Row():
+        with gr.Column(scale=1):
+            text_input = gr.Textbox(label="Script", lines=8, placeholder="Enter long text here...")
+            ref_audio = gr.Audio(label="Reference Voice", type="filepath")
+            with gr.Row():
+                exag = gr.Slider(0.1, 1.0, value=0.5, label="Exaggeration", info="Warning: >0.8 can be unstable")
+                cfg = gr.Slider(0.0, 1.0, value=0.5, label="CFG/Pace")
+            with gr.Accordion("Advanced Options", open=False):
+                seed_val = gr.Number(label="Seed", value=0, precision=0, info="0 for random")
+                temp_val = gr.Slider(0.1, 2.0, value=1.0, label="Temperature")
+            btn = gr.Button("Generate", variant="primary")
+        with gr.Column(scale=1):
+            audio_out = gr.Audio(label="Generated Audio", type="filepath")
+            status = gr.Textbox(label="Status", interactive=False)
+            gr.Markdown("### 📖 Quick Guide")
+            gr.Markdown("""
+            - **Chunking**: Sentences are automatically split at ~250 chars.
+            - **Secrets**: Use HF Secrets for API keys if needed.
+            - **Pacing**: Lower CFG for slower, more deliberate speech.
+            """)
+    btn.click(process_tts, [text_input, ref_audio, exag, cfg, temp_val, seed_val], [audio_out, status])
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0")