Chatterbox

Sleeping

App Files Files Community

oicui committed on Nov 25, 2025

Commit

6fc220a

verified ·

1 Parent(s): 1afd111

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -52

app.py CHANGED Viewed

@@ -1,40 +1,44 @@
 import random
 import numpy as np
 import torch
-from chatterbox.src.chatterbox.tts import ChatterboxTTS
 import gradio as gr
 import spaces
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🚀 Running on device: {DEVICE}")
-# --- Global Model Initialization ---
 MODEL = None
 def get_or_load_model():
-    """Loads the ChatterboxTTS model if it hasn't been loaded already,
-    and ensures it's on the correct device."""
     global MODEL
     if MODEL is None:
         print("Model not loaded, initializing...")
         try:
             MODEL = ChatterboxTTS.from_pretrained(DEVICE)
-            if hasattr(MODEL, 'to') and str(MODEL.device) != DEVICE:
                 MODEL.to(DEVICE)
-            print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
         except Exception as e:
             print(f"Error loading model: {e}")
             raise
     return MODEL
-# Attempt to load the model at startup.
 try:
     get_or_load_model()
 except Exception as e:
-    print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {e}")
 def set_seed(seed: int):
-    """Sets the random seed for reproducibility across torch, numpy, and random."""
     torch.manual_seed(seed)
     if DEVICE == "cuda":
         torch.cuda.manual_seed(seed)
@@ -42,6 +46,19 @@ def set_seed(seed: int):
     random.seed(seed)
     np.random.seed(seed)
 @spaces.GPU
 def generate_tts_audio(
     text_input: str,
@@ -51,85 +68,120 @@ def generate_tts_audio(
     seed_num_input: int = 0,
     cfgw_input: float = 0.5,
     vad_trim_input: bool = False,
-) -> tuple[int, np.ndarray]:
-    """
-    Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
-    This tool synthesizes natural-sounding speech from input text. When a reference audio file
-    is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
-    maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
-    Args:
-        text_input (str): The text to synthesize into speech (maximum 300 characters)
-        audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
-        exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
-        temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
-        seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
-        cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5.
-    Returns:
-        tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
-    """
-    current_model = get_or_load_model()
     if current_model is None:
         raise RuntimeError("TTS model is not loaded.")
-    if seed_num_input != 0:
-        set_seed(int(seed_num_input))
-    print(f"Generating audio for text: '{text_input[:50]}...'")
-    # Handle optional audio prompt
     generate_kwargs = {
         "exaggeration": exaggeration_input,
         "temperature": temperature_input,
         "cfg_weight": cfgw_input,
         "vad_trim": vad_trim_input,
     }
     if audio_prompt_path_input:
         generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
-    wav = current_model.generate(
-        text_input[:300],  # Truncate text to max chars
-        **generate_kwargs
-    )
     print("Audio generation complete.")
-    return (current_model.sr, wav.squeeze(0).numpy())
 with gr.Blocks() as demo:
     gr.Markdown(
         """
-        # Chatterbox TTS Demo
-        Generate high-quality speech from text with reference audio styling.
         """
     )
     with gr.Row():
         with gr.Column():
             text = gr.Textbox(
-                value="Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.",
-                label="Text to synthesize (max chars 300)",
-                max_lines=5
             )
             ref_wav = gr.Audio(
                 sources=["upload", "microphone"],
                 type="filepath",
                 label="Reference Audio File (Optional)",
                 value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
             )
-            exaggeration = gr.Slider(
-                0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5
-            )
-            cfg_weight = gr.Slider(
-                0.2, 1, step=.05, label="CFG/Pace", value=0.5
-            )
             with gr.Accordion("More options", open=False):
-                seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                 temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
                 vad_trim = gr.Checkbox(label="Ref VAD trimming", value=False)
             run_btn = gr.Button("Generate", variant="primary")
         with gr.Column():
@@ -145,8 +197,13 @@ with gr.Blocks() as demo:
             seed_num,
             cfg_weight,
             vad_trim,
         ],
-        outputs=[audio_output],
     )
 demo.launch(mcp_server=True)

 import random
 import numpy as np
 import torch
 import gradio as gr
 import spaces
+from chatterbox.src.chatterbox.tts import ChatterboxTTS
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🚀 Running on device: {DEVICE}")
+# ---------------------------------------
+# GLOBAL MODEL LOAD
+# ---------------------------------------
 MODEL = None
 def get_or_load_model():
     """Return the shared ChatterboxTTS instance, loading it on first use.

     The model is built with ``ChatterboxTTS.from_pretrained(DEVICE)`` and, when
     it exposes ``to`` and reports a different device, moved to ``DEVICE``.  The
     module-level ``MODEL`` is only assigned once both steps have succeeded, so
     a failure part-way through is retried on the next call instead of leaving
     a partially initialised model cached.

     Returns:
         The loaded model (the value stored in the global ``MODEL``).

     Raises:
         Exception: Whatever the load or device move raised; it is logged and
             re-raised unchanged.
     """
     global MODEL
     if MODEL is None:
         print("Model not loaded, initializing...")
         try:
             # Work on a local so MODEL stays None if anything below fails.
             loaded = ChatterboxTTS.from_pretrained(DEVICE)
             if hasattr(loaded, "to") and str(loaded.device) != DEVICE:
                 loaded.to(DEVICE)
             print("Model loaded successfully.")
         except Exception as e:
             print(f"Error loading model: {e}")
             raise
         MODEL = loaded
     return MODEL
 try:
     get_or_load_model()
 except Exception as e:
+    print(f"CRITICAL startup load failed: {e}")
+# ---------------------------------------
+# UTILITIES
+# ---------------------------------------
 def set_seed(seed: int):
     """Seed torch, Python's ``random`` and NumPy's legacy RNG from one integer.

     Args:
         seed: Any integer.  Values outside NumPy's accepted range
             (``0 <= seed < 2**32``) are reduced modulo ``2**32`` for the NumPy
             call only; seeds already inside that range are passed through
             unchanged, so previously valid seeds give the same streams.
     """
     torch.manual_seed(seed)
     # Same condition the module uses to pick its device string.
     if torch.cuda.is_available():
         torch.cuda.manual_seed_all(seed)
     random.seed(seed)
     # np.random.seed raises ValueError for negative or >= 2**32 seeds, which a
     # free-form numeric input can easily supply; fold the value into range.
     np.random.seed(seed % (2**32))
def chunk_text(text: str, chunk_size: int):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    When a plain cut would land in the middle of a word, the cut is moved back
    to just after the last whitespace character inside the current window, so
    pieces end on a word boundary where one is available.  A run longer than
    *chunk_size* that contains no whitespace is still cut at exactly
    *chunk_size* characters.  No characters are added or removed:
    ``"".join(chunk_text(t, n)) == t``.

    Args:
        text: The string to split.  An empty string yields an empty list.
        chunk_size: Maximum length of each piece; must be positive.

    Returns:
        list[str]: The pieces, in their original order.

    Raises:
        ValueError: If *chunk_size* is zero or negative.
    """
    if chunk_size <= 0:
        # With a non-positive size the window below would never advance.
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    pieces = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        # Only back up when the cut would split a word: there is more text
        # and the character right after the cut is not whitespace.
        if end < total and not text[end].isspace():
            for idx in range(end - 1, start, -1):
                if text[idx].isspace():
                    end = idx + 1  # keep the whitespace with the left piece
                    break
        pieces.append(text[start:end])
        start = end
    return pieces
def concat_audio(chunks):
    """Join a sequence of audio arrays end to end along their last axis.

    Args:
        chunks: Sequence of NumPy arrays to concatenate.

    Returns:
        The concatenated array, or ``None`` when *chunks* is empty or falsy.
    """
    joined = np.concatenate(chunks, axis=-1) if chunks else None
    return joined
+# ---------------------------------------
+# MAIN TTS FUNCTION
+# ---------------------------------------
 @spaces.GPU
 def generate_tts_audio(
     text_input: str,
     # NOTE(review): the diff view elides the next three parameters. Their names
     # come from their uses in the body; the defaults come from the previous
     # docstring. Confirm against the real app.py.
     audio_prompt_path_input: str = None,
     exaggeration_input: float = 0.5,
     temperature_input: float = 0.8,
     seed_num_input: int = 0,
     cfgw_input: float = 0.5,
     vad_trim_input: bool = False,
     enable_chunking: bool = False,
     chunk_size_value: int = 250,
 ):
     """Synthesize speech for a text and report the random seed that was used.

     Args:
         text_input: The text to synthesize.
         audio_prompt_path_input: Optional reference audio; only forwarded to
             the model when truthy.
         exaggeration_input: Forwarded to the model as ``exaggeration``.
         temperature_input: Forwarded to the model as ``temperature``.
         seed_num_input: Seed to use; ``0`` (or an empty value) picks a random
             seed in ``[1, 2**31 - 1]``.
         cfgw_input: Forwarded to the model as ``cfg_weight``.
         vad_trim_input: Forwarded to the model as ``vad_trim``.
         enable_chunking: When true, the text is split with ``chunk_text`` and
             each piece is rendered separately, then concatenated.
         chunk_size_value: Chunk size passed to ``chunk_text`` (cast to int).

     Returns:
         tuple: ``(audio, used_seed)`` with one entry per output component.
         ``audio`` is ``(sample_rate, waveform)``, or ``None`` when no audio
         segment was produced; ``used_seed`` is the integer seed applied.

     Raises:
         RuntimeError: If no model is available.
     """
     current_model = get_or_load_model()
     if current_model is None:
         raise RuntimeError("TTS model is not loaded.")

     # An empty numeric field arrives as None; treat it like 0 ("random").
     requested_seed = int(seed_num_input or 0)
     if requested_seed == 0:
         used_seed = random.randint(1, 2**31 - 1)
     else:
         used_seed = requested_seed
     print(f"Using seed: {used_seed}")
     set_seed(used_seed)

     print(f"Generating audio for text (preview): '{text_input[:50]}...'")
     generate_kwargs = {
         "exaggeration": exaggeration_input,
         "temperature": temperature_input,
         "cfg_weight": cfgw_input,
         "vad_trim": vad_trim_input,
     }
     if audio_prompt_path_input:
         generate_kwargs["audio_prompt_path"] = audio_prompt_path_input

     if enable_chunking:
         print(f"Chunking enabled — chunk size = {chunk_size_value}")
         text_chunks = chunk_text(text_input, int(chunk_size_value))
     else:
         text_chunks = [text_input]

     audio_segments = []
     for i, chunk in enumerate(text_chunks):
         print(f"Rendering chunk {i+1}/{len(text_chunks)}...")
         wav = current_model.generate(chunk, **generate_kwargs)
         audio_segments.append(wav.squeeze(0).numpy())
     final_audio = concat_audio(audio_segments)
     print("Audio generation complete.")

     # The click handler declares two outputs (audio, seed). The audio component
     # takes a single (sample_rate, waveform) pair, so the pair must be nested
     # rather than flattened into three return values.
     audio_out = None if final_audio is None else (current_model.sr, final_audio)
     return audio_out, used_seed
+# ---------------------------------------
+# UI
+# ---------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown(
         """
+        # Chatterbox TTS Demo — Enhanced Version
+        Supports unlimited text, chunking & random seed viewer.
         """
     )
     with gr.Row():
         with gr.Column():
+            # MAIN TEXT
             text = gr.Textbox(
+                value="Now let's make my mum's favourite...",
+                label="Text to synthesize",
+                max_lines=10
             )
+            # REFERENCE AUDIO
             ref_wav = gr.Audio(
                 sources=["upload", "microphone"],
                 type="filepath",
                 label="Reference Audio File (Optional)",
                 value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
             )
+            exaggeration = gr.Slider(0.25, 2, step=.05, label="Exaggeration", value=.5)
+            cfg_weight = gr.Slider(0.2, 1, step=.05, label="CFG/Pace", value=0.5)
+            # ADVANCED OPTIONS
             with gr.Accordion("More options", open=False):
+                seed_num = gr.Number(value=0, label="Random seed (0 = random)")
+                # NEW — SEED DISPLAY (READ ONLY)
+                seed_display = gr.Textbox(
+                    value="",
+                    label="Seed Used (auto-filled)",
+                    interactive=False
+                )
                 temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
                 vad_trim = gr.Checkbox(label="Ref VAD trimming", value=False)
+                # NEW — ENABLE CHUNKING
+                enable_chunking = gr.Checkbox(
+                    label="Enable Text Chunking (split long text)",
+                    value=False
+                )
+                # NEW — CHUNK SIZE SLIDER
+                chunk_size = gr.Slider(
+                    minimum=100,
+                    maximum=300,
+                    value=250,
+                    step=10,
+                    label="Chunk Size (characters) — Text chunking for long conversations"
+                )
             run_btn = gr.Button("Generate", variant="primary")
         with gr.Column():
             seed_num,
             cfg_weight,
             vad_trim,
+            enable_chunking,
+            chunk_size,
+        ],
+        outputs=[
+            audio_output,
+            seed_display,   # NEW: seed returned to UI
         ],
     )
 demo.launch(mcp_server=True)