peterlllmm committed on
Commit
ecc440e
·
verified ·
1 Parent(s): 2492250

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -221
app.py CHANGED
@@ -1,24 +1,13 @@
1
- import random
2
  import numpy as np
3
  import torch
4
  from chatterbox.src.chatterbox.tts import ChatterboxTTS
5
  import gradio as gr
6
  import spaces
7
- import re
8
- from typing import List, Tuple
9
 
10
- # Force CPU usage and patch torch.load to handle CUDA tensors on CPU
11
- DEVICE = "cpu" # Force CPU since you don't have GPU access
12
  print(f"🚀 Running on device: {DEVICE}")
13
 
14
- # Patch torch.load to automatically map CUDA tensors to CPU
15
- original_load = torch.load
16
- def patched_load(f, map_location=None, **kwargs):
17
- if map_location is None:
18
- map_location = 'cpu' # Always map to CPU
19
- return original_load(f, map_location=map_location, **kwargs)
20
- torch.load = patched_load
21
-
22
  # --- Global Model Initialization ---
23
  MODEL = None
24
 
@@ -29,43 +18,12 @@ def get_or_load_model():
29
  if MODEL is None:
30
  print("Model not loaded, initializing...")
31
  try:
32
- # Try multiple loading strategies for CPU
33
- print("Attempting to load model on CPU...")
34
-
35
- # Strategy 1: Direct CPU loading
36
- try:
37
- MODEL = ChatterboxTTS.from_pretrained("cpu")
38
- print("✅ Model loaded successfully with direct CPU method")
39
- except Exception as e1:
40
- print(f"Direct CPU loading failed: {e1}")
41
-
42
- # Strategy 2: Try with explicit map_location if supported
43
- try:
44
- MODEL = ChatterboxTTS.from_pretrained(DEVICE, map_location='cpu')
45
- print("✅ Model loaded successfully with map_location method")
46
- except Exception as e2:
47
- print(f"map_location method failed: {e2}")
48
-
49
- # Strategy 3: Load with default then move to CPU
50
- try:
51
- MODEL = ChatterboxTTS.from_pretrained()
52
- if hasattr(MODEL, 'to'):
53
- MODEL = MODEL.to('cpu')
54
- print("✅ Model loaded successfully with default then CPU move")
55
- except Exception as e3:
56
- print(f"All loading strategies failed. Last error: {e3}")
57
- raise e3
58
-
59
- # Ensure model is on CPU
60
- if hasattr(MODEL, 'to'):
61
- MODEL = MODEL.to('cpu')
62
- if hasattr(MODEL, 'device'):
63
- print(f"Model device: {MODEL.device}")
64
-
65
- print(f"Model loaded successfully on CPU")
66
-
67
  except Exception as e:
68
- print(f"CRITICAL: All model loading attempts failed: {e}")
69
  raise
70
  return MODEL
71
 
@@ -78,149 +36,81 @@ except Exception as e:
78
  def set_seed(seed: int):
79
  """Sets the random seed for reproducibility across torch, numpy, and random."""
80
  torch.manual_seed(seed)
81
- # Remove CUDA seed setting since we're on CPU only
 
 
82
  random.seed(seed)
83
  np.random.seed(seed)
84
 
85
- def intelligent_text_chunking(text: str, max_chunk_size: int = 250) -> List[str]:
86
  """
87
- Split text into chunks intelligently, preserving sentence boundaries and meaning.
88
 
89
  Args:
90
- text (str): The input text to chunk
91
- max_chunk_size (int): Maximum characters per chunk (default 250 for safety margin)
92
-
 
93
  Returns:
94
- List[str]: List of text chunks
95
  """
96
  if len(text) <= max_chunk_size:
97
  return [text]
98
 
99
  chunks = []
 
100
 
101
- # First, split by paragraphs
102
- paragraphs = text.split('\n\n')
103
-
104
- current_chunk = ""
105
-
106
- for paragraph in paragraphs:
107
- # If the paragraph itself is too long, split by sentences
108
- if len(paragraph) > max_chunk_size:
109
- sentences = re.split(r'(?<=[.!?])\s+', paragraph)
110
-
111
- for sentence in sentences:
112
- # If even a single sentence is too long, split by clauses
113
- if len(sentence) > max_chunk_size:
114
- clauses = re.split(r'(?<=[,;:])\s+', sentence)
115
-
116
- for clause in clauses:
117
- # If clause is still too long, force split at word boundaries
118
- if len(clause) > max_chunk_size:
119
- words = clause.split()
120
- temp_chunk = ""
121
-
122
- for word in words:
123
- if len(temp_chunk + " " + word) <= max_chunk_size:
124
- temp_chunk += (" " + word) if temp_chunk else word
125
- else:
126
- if temp_chunk:
127
- chunks.append(temp_chunk.strip())
128
- temp_chunk = word
129
-
130
- if temp_chunk:
131
- if len(current_chunk + " " + temp_chunk) <= max_chunk_size:
132
- current_chunk += (" " + temp_chunk) if current_chunk else temp_chunk
133
- else:
134
- if current_chunk:
135
- chunks.append(current_chunk.strip())
136
- current_chunk = temp_chunk
137
- else:
138
- # Add clause to current chunk if it fits
139
- if len(current_chunk + " " + clause) <= max_chunk_size:
140
- current_chunk += (" " + clause) if current_chunk else clause
141
- else:
142
- if current_chunk:
143
- chunks.append(current_chunk.strip())
144
- current_chunk = clause
145
- else:
146
- # Add sentence to current chunk if it fits
147
- if len(current_chunk + " " + sentence) <= max_chunk_size:
148
- current_chunk += (" " + sentence) if current_chunk else sentence
149
- else:
150
- if current_chunk:
151
- chunks.append(current_chunk.strip())
152
- current_chunk = sentence
153
- else:
154
- # Add paragraph to current chunk if it fits
155
- if len(current_chunk + "\n\n" + paragraph) <= max_chunk_size:
156
- current_chunk += ("\n\n" + paragraph) if current_chunk else paragraph
157
  else:
158
- if current_chunk:
159
- chunks.append(current_chunk.strip())
160
- current_chunk = paragraph
161
-
162
- # Add any remaining text
163
- if current_chunk:
164
- chunks.append(current_chunk.strip())
165
-
166
- return [chunk for chunk in chunks if chunk.strip()]
167
-
168
- def concatenate_audio_chunks(audio_chunks: List[Tuple[int, np.ndarray]],
169
- silence_duration: float = 0.3) -> Tuple[int, np.ndarray]:
170
- """
171
- Concatenate multiple audio chunks with silence between them.
172
-
173
- Args:
174
- audio_chunks: List of (sample_rate, audio_array) tuples
175
- silence_duration: Duration of silence between chunks in seconds
176
-
177
- Returns:
178
- Tuple[int, np.ndarray]: Combined (sample_rate, audio_array)
179
- """
180
- if not audio_chunks:
181
- return None
182
-
183
- sample_rate = audio_chunks[0][0]
184
- silence_samples = int(sample_rate * silence_duration)
185
- silence = np.zeros(silence_samples, dtype=audio_chunks[0][1].dtype)
186
-
187
- combined_audio = []
188
- for i, (sr, audio) in enumerate(audio_chunks):
189
- combined_audio.append(audio)
190
- # Add silence between chunks (but not after the last one)
191
- if i < len(audio_chunks) - 1:
192
- combined_audio.append(silence)
193
-
194
- return sample_rate, np.concatenate(combined_audio)
195
 
196
- @spaces.GPU # This decorator might not work on CPU, but keeping it for compatibility
197
- def generate_tts_audio_chunked(
198
  text_input: str,
199
  audio_prompt_path_input: str = None,
200
  exaggeration_input: float = 0.5,
201
  temperature_input: float = 0.8,
202
  seed_num_input: int = 0,
203
  cfgw_input: float = 0.5,
204
- chunk_size: int = 250,
205
- silence_between_chunks: float = 0.3
206
  ) -> tuple[int, np.ndarray]:
207
  """
208
- Generate high-quality speech audio from text using ChatterboxTTS model with intelligent chunking.
 
209
 
210
- This tool synthesizes natural-sounding speech from input text of any length by intelligently
211
- splitting long text into chunks. When a reference audio file is provided, it captures the
212
- speaker's voice characteristics and speaking style. The generated audio maintains consistency
213
- across chunks while avoiding hallucination issues.
214
 
215
  Args:
216
- text_input (str): The text to synthesize into speech (any length)
217
- audio_prompt_path_input (str, optional): File path or URL to the reference audio file. Defaults to None.
218
- exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0). Defaults to 0.5.
219
- temperature_input (float, optional): Controls randomness in generation (0.05-5.0). Defaults to 0.8.
220
- seed_num_input (int, optional): Random seed for reproducible results (0 for random). Defaults to 0.
221
  cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5.
222
- chunk_size (int, optional): Maximum characters per chunk. Defaults to 250.
223
- silence_between_chunks (float, optional): Silence duration between chunks in seconds. Defaults to 0.3.
224
 
225
  Returns:
226
  tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
@@ -230,20 +120,12 @@ def generate_tts_audio_chunked(
230
  if current_model is None:
231
  raise RuntimeError("TTS model is not loaded.")
232
 
233
- if not text_input.strip():
234
- raise ValueError("Text input cannot be empty.")
235
-
236
  if seed_num_input != 0:
237
  set_seed(int(seed_num_input))
238
 
239
- print(f"Processing text of {len(text_input)} characters")
240
 
241
- # Split text into intelligent chunks
242
- text_chunks = intelligent_text_chunking(text_input, chunk_size)
243
- print(f"Split into {len(text_chunks)} chunks")
244
-
245
- # Generate audio for each chunk
246
- audio_chunks = []
247
  generate_kwargs = {
248
  "exaggeration": exaggeration_input,
249
  "temperature": temperature_input,
@@ -253,47 +135,49 @@ def generate_tts_audio_chunked(
253
  if audio_prompt_path_input:
254
  generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
255
 
 
 
 
 
 
 
 
 
256
  for i, chunk in enumerate(text_chunks):
257
- print(f"Generating audio for chunk {i+1}/{len(text_chunks)}: '{chunk[:50]}...'")
258
 
259
- try:
260
- wav = current_model.generate(chunk, **generate_kwargs)
261
- audio_chunks.append((current_model.sr, wav.squeeze(0).numpy()))
262
- except Exception as e:
263
- print(f"Error generating audio for chunk {i+1}: {e}")
264
- # Continue with remaining chunks instead of failing completely
265
- continue
266
-
267
- if not audio_chunks:
268
- raise RuntimeError("Failed to generate audio for any chunks.")
269
 
270
- # Concatenate all audio chunks
271
- print("Concatenating audio chunks...")
272
- final_sample_rate, final_audio = concatenate_audio_chunks(audio_chunks, silence_between_chunks)
 
 
273
 
274
- print(f"Audio generation complete. Total duration: {len(final_audio) / final_sample_rate:.2f} seconds")
275
- return (final_sample_rate, final_audio)
276
 
277
  with gr.Blocks() as demo:
278
  gr.Markdown(
279
  """
280
- # Chatterbox TTS Demo with Intelligent Chunking
281
- Generate high-quality speech from text of any length with reference audio styling.
282
-
283
- **Features:**
284
- - ✅ No character limit - process text of any length
285
- - ✅ Intelligent chunking preserves sentence boundaries
286
- - ✅ Consistent voice across chunks
287
- - ✅ Prevents hallucination through proper segmentation
288
  """
289
  )
290
  with gr.Row():
291
  with gr.Column():
292
  text = gr.Textbox(
293
- value="Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible. This is just the beginning of our culinary adventure. We're going to explore flavors that have never been combined before, creating a symphony of taste that will revolutionize the way we think about cooking.",
294
- label="Text to synthesize (any length)",
295
- max_lines=10,
296
- lines=5
297
  )
298
  ref_wav = gr.Audio(
299
  sources=["upload", "microphone"],
@@ -302,36 +186,23 @@ with gr.Blocks() as demo:
302
  value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
303
  )
304
  exaggeration = gr.Slider(
305
- 0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5)", value=.5
306
  )
307
  cfg_weight = gr.Slider(
308
  0.2, 1, step=.05, label="CFG/Pace", value=0.5
309
  )
310
 
311
- with gr.Accordion("Advanced options", open=False):
312
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
313
  temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
314
- chunk_size = gr.Slider(150, 300, step=10, label="Chunk size (characters)", value=250)
315
- silence_duration = gr.Slider(0.1, 1.0, step=0.1, label="Silence between chunks (seconds)", value=0.3)
316
 
317
  run_btn = gr.Button("Generate", variant="primary")
318
 
319
  with gr.Column():
320
  audio_output = gr.Audio(label="Output Audio")
321
-
322
- with gr.Row():
323
- gr.Markdown(
324
- """
325
- **Tips:**
326
- - Longer texts are automatically split into chunks at natural boundaries (sentences, clauses)
327
- - Adjust chunk size if you notice quality issues
328
- - Increase silence duration for clearer separation between chunks
329
- - Use consistent reference audio for better voice continuity
330
- """
331
- )
332
 
333
  run_btn.click(
334
- fn=generate_tts_audio_chunked,
335
  inputs=[
336
  text,
337
  ref_wav,
@@ -339,8 +210,6 @@ with gr.Blocks() as demo:
339
  temp,
340
  seed_num,
341
  cfg_weight,
342
- chunk_size,
343
- silence_duration,
344
  ],
345
  outputs=[audio_output],
346
  )
 
1
+ import random
2
  import numpy as np
3
  import torch
4
  from chatterbox.src.chatterbox.tts import ChatterboxTTS
5
  import gradio as gr
6
  import spaces
 
 
7
 
8
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
9
  print(f"🚀 Running on device: {DEVICE}")
10
 
 
 
 
 
 
 
 
 
11
  # --- Global Model Initialization ---
12
  MODEL = None
13
 
 
18
  if MODEL is None:
19
  print("Model not loaded, initializing...")
20
  try:
21
+ MODEL = ChatterboxTTS.from_pretrained(DEVICE)
22
+ if hasattr(MODEL, 'to') and str(MODEL.device) != DEVICE:
23
+ MODEL.to(DEVICE)
24
+ print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  except Exception as e:
26
+ print(f"Error loading model: {e}")
27
  raise
28
  return MODEL
29
 
 
36
def set_seed(seed: int):
    """Sets the random seed for reproducibility across torch, numpy, and random."""
    torch.manual_seed(seed)
    if DEVICE == "cuda":
        # Seed the current CUDA device and all visible devices alike.
        for cuda_seeder in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seeder(seed)
    random.seed(seed)
    np.random.seed(seed)
44
 
45
def chunk_text(text: str, max_chunk_size: int = 300, overlap: int = 50) -> list[str]:
    """
    Split text into chunks with optional overlap for better continuity.

    Chunks are cut preferentially at the last sentence terminator (. ! ?)
    inside the window, falling back to the last space, and finally to a hard
    cut at ``max_chunk_size``. The scan position always moves forward, so the
    function terminates for every input (the original version could loop
    forever when a sentence boundary fell within ``overlap`` characters of
    the chunk start, which made ``end - overlap`` step backwards).

    Args:
        text (str): The text to chunk
        max_chunk_size (int): Maximum characters per chunk
        overlap (int): Number of characters to overlap between chunks

    Returns:
        list[str]: List of non-empty, stripped text chunks
    """
    if len(text) <= max_chunk_size:
        return [text]

    chunks = []
    start = 0
    n = len(text)

    while start < n:
        end = min(start + max_chunk_size, n)

        # If this isn't the last chunk, try to break at a sentence or word boundary
        if end < n:
            # Prefer the last sentence terminator inside the window
            boundary = max(text.rfind(ch, start, end) for ch in ".!?")
            if boundary > start:
                end = boundary + 1
            else:
                # No usable sentence boundary: fall back to the last space
                last_space = text.rfind(" ", start, end)
                if last_space > start:
                    end = last_space

        chunk = text[start:end].strip()
        if chunk:  # skip whitespace-only slices so callers never see "" chunks
            chunks.append(chunk)

        if end >= n:
            break
        # Overlap with the previous chunk, but always advance to guarantee termination
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
+ @spaces.GPU
89
+ def generate_tts_audio(
90
  text_input: str,
91
  audio_prompt_path_input: str = None,
92
  exaggeration_input: float = 0.5,
93
  temperature_input: float = 0.8,
94
  seed_num_input: int = 0,
95
  cfgw_input: float = 0.5,
96
+ chunk_size: int = 300
 
97
  ) -> tuple[int, np.ndarray]:
98
  """
99
+ Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
100
+ For long texts, automatically chunks the input for better processing.
101
 
102
+ This tool synthesizes natural-sounding speech from input text. When a reference audio file
103
+ is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
104
+ maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
 
105
 
106
  Args:
107
+ text_input (str): The text to synthesize into speech
108
+ audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
109
+ exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
110
+ temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
111
+ seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
112
  cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5.
113
+ chunk_size (int, optional): Maximum characters per chunk for long texts. Defaults to 300.
 
114
 
115
  Returns:
116
  tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
 
120
  if current_model is None:
121
  raise RuntimeError("TTS model is not loaded.")
122
 
 
 
 
123
  if seed_num_input != 0:
124
  set_seed(int(seed_num_input))
125
 
126
+ print(f"Generating audio for text: '{text_input[:50]}...' (Length: {len(text_input)} chars)")
127
 
128
+ # Handle optional audio prompt
 
 
 
 
 
129
  generate_kwargs = {
130
  "exaggeration": exaggeration_input,
131
  "temperature": temperature_input,
 
135
  if audio_prompt_path_input:
136
  generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
137
 
138
+ # Chunk the text if it's longer than chunk_size
139
+ text_chunks = chunk_text(text_input, chunk_size)
140
+ print(f"Processing {len(text_chunks)} chunk(s)")
141
+
142
+ # Generate audio for each chunk
143
+ audio_segments = []
144
+ sample_rate = None
145
+
146
  for i, chunk in enumerate(text_chunks):
147
+ print(f"Processing chunk {i+1}/{len(text_chunks)}: '{chunk[:30]}...'")
148
 
149
+ wav = current_model.generate(
150
+ chunk,
151
+ **generate_kwargs
152
+ )
153
+
154
+ if sample_rate is None:
155
+ sample_rate = current_model.sr
156
+
157
+ audio_segments.append(wav.squeeze(0).numpy())
 
158
 
159
+ # Concatenate all audio segments
160
+ if len(audio_segments) == 1:
161
+ final_audio = audio_segments[0]
162
+ else:
163
+ final_audio = np.concatenate(audio_segments, axis=0)
164
 
165
+ print("Audio generation complete.")
166
+ return (sample_rate, final_audio)
167
 
168
  with gr.Blocks() as demo:
169
  gr.Markdown(
170
  """
171
+ # Chatterbox TTS Demo
172
+ Generate high-quality speech from text with reference audio styling.
 
 
 
 
 
 
173
  """
174
  )
175
  with gr.Row():
176
  with gr.Column():
177
  text = gr.Textbox(
178
+ value="Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.",
179
+ label="Text to synthesize", # Removed "max chars 300" from label
180
+ max_lines=5
 
181
  )
182
  ref_wav = gr.Audio(
183
  sources=["upload", "microphone"],
 
186
  value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
187
  )
188
  exaggeration = gr.Slider(
189
+ 0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5
190
  )
191
  cfg_weight = gr.Slider(
192
  0.2, 1, step=.05, label="CFG/Pace", value=0.5
193
  )
194
 
195
+ with gr.Accordion("More options", open=False):
196
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
197
  temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
 
 
198
 
199
  run_btn = gr.Button("Generate", variant="primary")
200
 
201
  with gr.Column():
202
  audio_output = gr.Audio(label="Output Audio")
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  run_btn.click(
205
+ fn=generate_tts_audio,
206
  inputs=[
207
  text,
208
  ref_wav,
 
210
  temp,
211
  seed_num,
212
  cfg_weight,
 
 
213
  ],
214
  outputs=[audio_output],
215
  )