Chatterbox

Runtime error

App Files Files Community

peterlllmm committed 26 days ago

Commit

3ff6b5d

verified ·

1 Parent(s): d4020c8

Update app.py

Browse files

Files changed (1) hide show

app.py +291 -81

app.py CHANGED Viewed

@@ -1,211 +1,421 @@
 import nltk
-nltk.download("punkt", quiet=True)
 import random
 import numpy as np
 import torch
 import io
 import os
 import soundfile as sf
 from nltk.tokenize import sent_tokenize
-from pydub import AudioSegment, silence
 import gradio as gr
 from chatterbox.src.chatterbox.tts import ChatterboxTTS
 # ===============================
 # DEVICE
 # ===============================
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"🚀 Running on device: {DEVICE}")
 # ===============================
 # LOAD MODEL ONCE
 # ===============================
 MODEL = None
 def get_model():
     """Return the module-wide ChatterboxTTS instance, loading it on the first call."""
     global MODEL
     # MODEL starts as None; later calls skip the load and reuse the cached object.
     if MODEL is None:
         print("Loading Chatterbox model...")
         MODEL = ChatterboxTTS.from_pretrained(DEVICE)
         # Only move the model when it exposes .to() and reports a different device.
-        if hasattr(MODEL, "to") and str(MODEL.device) != DEVICE:
             MODEL.to(DEVICE)
-        print("✅ Model ready.")
     return MODEL
 get_model()
 # ===============================
 # SEED
 # ===============================
 def set_seed(seed):
     """Seed the torch, CUDA (when DEVICE is "cuda"), `random` and NumPy generators with `seed`."""
     torch.manual_seed(seed)
     if DEVICE == "cuda":
-        torch.cuda.manual_seed(seed)
         torch.cuda.manual_seed_all(seed)
     random.seed(seed)
     np.random.seed(seed)
 # ===============================
-# UNLIMITED CHUNKING SETTINGS
 # ===============================
-MAX_CHARS = 250
-SILENCE_MS = 350   # 350ms breath between chunks to help slow the pacing naturally
-CROSSFADE_MS = 25
 # ===============================
 # MAIN TTS FUNCTION
 # ===============================
 def generate_tts(
     text,
     ref_audio=None,
-    exaggeration=0.5,
-    temperature=0.8,
     seed=0,
-    cfg_weight=0.5,
-    vad_trim=True
 ):
     """
     Synthesize `text` chunk by chunk and export the merged audio as an MP3.

     Args:
         text: Script to speak; split into sentences and regrouped into chunks.
         ref_audio: Optional path to a reference voice clip, re-exported as WAV
             and passed to the model as `audio_prompt_path`.
         exaggeration, temperature, cfg_weight: Forwarded to `model.generate`.
         seed: Any value other than 0 is passed to `set_seed`.
         vad_trim: When true, trims silence around the reference clip and after
             the last non-silent range of every generated chunk.

     Returns:
         The output file path ("story_voice_clean.mp3").
     """
     model = get_model()
     if seed != 0:
         set_seed(int(seed))
-    # --------------------------------
-    # HF Official Kwargs (Speed parameter removed to stop crash)
-    # --------------------------------
-    generate_kwargs = {
         "exaggeration": exaggeration,
         "temperature": temperature,
         "cfg_weight": cfg_weight,
     }
     temp_prompt = None
     if ref_audio:
         try:
             audio = AudioSegment.from_file(ref_audio)
-            # MANUAL REF VAD TRIMMING
-            if vad_trim:
-                print("✂️ Sanitizing reference audio...")
-                non_silent_ranges = silence.detect_nonsilent(audio, min_silence_len=100, silence_thresh=-45)
-                if non_silent_ranges:
-                    start_trim = non_silent_ranges[0][0]
-                    end_trim = non_silent_ranges[-1][1]
-                    audio = audio[start_trim:end_trim]
             temp_prompt = "voice_prompt.wav"
             audio.export(temp_prompt, format="wav")
-            generate_kwargs["audio_prompt_path"] = temp_prompt
-        except Exception as e:
-            print(f"⚠️ Reference audio failed: {e} — using default voice.")
     # --------------------------------
     # Sentence chunking
     # --------------------------------
     sentences = sent_tokenize(text)
     chunks = []
     current = ""
     # Greedily pack whole sentences into a chunk until adding one more would
     # reach MAX_CHARS; a single sentence longer than that becomes its own chunk.
     for s in sentences:
         if len(current) + len(s) < MAX_CHARS:
             current += " " + s
         else:
-            if current.strip():
-                chunks.append(current.strip())
             current = s
     if current.strip():
         chunks.append(current.strip())
-    print(f"\n📝 Total unlimited chunks: {len(chunks)}")
     # --------------------------------
     # Generate audio per chunk
     # --------------------------------
     final_audio = AudioSegment.empty()
     clean_pause = AudioSegment.silent(duration=SILENCE_MS)
     for i, chunk in enumerate(chunks):
-        print(f"➡️ Generating chunk [{i+1}/{len(chunks)}]: {chunk[:50]}...")
-        wav = model.generate(chunk, **generate_kwargs)
         wav_np = wav.squeeze(0).cpu().numpy()
         # Round-trip through an in-memory WAV so pydub can load the model output.
         buffer = io.BytesIO()
         sf.write(buffer, wav_np, model.sr, format="WAV")
         buffer.seek(0)
         segment = AudioSegment.from_wav(buffer)
         # Cut everything after the last non-silent range, keeping a 50 ms tail.
-        if vad_trim:
-             out_silent = silence.detect_nonsilent(segment, min_silence_len=100, silence_thresh=-45)
-             if out_silent:
-                 segment = segment[:out_silent[-1][1] + 50]
-        if len(final_audio) > 0:
-            final_audio = final_audio.append(segment, crossfade=CROSSFADE_MS)
-        else:
-            final_audio = segment
-        final_audio += clean_pause
     # --------------------------------
     # Export
     # --------------------------------
-    output_path = "story_voice_clean.mp3"
     final_audio.export(output_path, format="mp3", bitrate="192k")
     # Remove the temporary reference-voice WAV written above, if any.
     if temp_prompt and os.path.exists(temp_prompt):
         os.remove(temp_prompt)
-    print(f"✅ Success! Audio saved to {output_path}")
     return output_path
 # ===============================
 # GRADIO UI
 # ===============================
 # Build the Gradio interface: inputs in the left column, merged audio on the right.
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🎙️ Chatterbox TTS (Stable Official Backend)")
-    with gr.Row():
-        with gr.Column():
-            text = gr.Textbox(
-                label="Story Text",
-                lines=10,
-                placeholder="Paste your script here. To slow down the pacing, use ellipses (...) and extra commas."
-            )
-            ref = gr.Audio(
-                sources=["upload", "microphone"],
-                type="filepath",
-                label="Reference Voice (Golden 3-5s clip is best)"
-            )
-            with gr.Accordion("⚙️ Engine Settings (Synced to HF Defaults)", open=True):
-                exaggeration = gr.Slider(0.25, 2.0, value=0.5, step=0.05, label="Exaggeration (Neutral = 0.5)")
-                cfg = gr.Slider(0.2, 1.0, value=0.5, step=0.05, label="CFG / Pace Weight")
-                temperature = gr.Slider(0.05, 5.0, value=0.8, step=0.05, label="Temperature")
-                with gr.Row():
-                    vad_toggle = gr.Checkbox(value=True, label="Ref VAD Trimming (Kills artifacts)")
-                    seed = gr.Number(value=0, label="Seed (0 = random)")
-            btn = gr.Button("⚡ Generate Unlimited Voice", variant="primary")
-        with gr.Column():
-            out = gr.Audio(label="Final Merged Audio")
     # The inputs list is positional and must follow generate_tts's parameter order.
     btn.click(
         fn=generate_tts,
-        inputs=[text, ref, exaggeration, temperature, seed, cfg, vad_toggle],
         outputs=out
     )
-print("\n" + "=" * 60)
-print("🔗 Launching Chatterbox Stable...")
-print("=" * 60 + "\n")
 demo.launch(share=True)

 import nltk
+nltk.download("punkt")
 import random
 import numpy as np
 import torch
 import io
 import os
 import soundfile as sf
 from nltk.tokenize import sent_tokenize
+from pydub import AudioSegment, silence  # Added silence module
 import gradio as gr
 from chatterbox.src.chatterbox.tts import ChatterboxTTS
 # ===============================
 # DEVICE
 # ===============================
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Running on: {DEVICE}")
 # ===============================
 # LOAD MODEL ONCE
 # ===============================
 MODEL = None
 def get_model():
     """Return the module-wide ChatterboxTTS instance, loading it on the first call."""
     global MODEL
     # MODEL starts as None; later calls skip the load and reuse the cached object.
     if MODEL is None:
         print("Loading Chatterbox model...")
         MODEL = ChatterboxTTS.from_pretrained(DEVICE)
         # Move to DEVICE whenever the object exposes .to(); no device comparison is made.
+        if hasattr(MODEL, "to"):
             MODEL.to(DEVICE)
+        print("Model ready.")
     return MODEL
 get_model()
 # ===============================
 # SEED
 # ===============================
 def set_seed(seed):
     """Seed the torch, CUDA (when DEVICE is "cuda"), `random` and NumPy generators with `seed`."""
     torch.manual_seed(seed)
     if DEVICE == "cuda":
         torch.cuda.manual_seed_all(seed)
     random.seed(seed)
     np.random.seed(seed)
+# ===============================
+# PODCAST SAFE SETTINGS
+# ===============================
+MAX_CHARS = 220
+SILENCE_MS = 250   # Reduced slightly since we are cleaning audio
+FADE_IN = 10       # Reduced fade to avoid eating words
+FADE_OUT = 10      # Reduced fade to avoid weird half-breath sounds
 # ===============================
+# HELPER: TRIM SILENCE/BREATHS
 # ===============================
+def trim_audio_segment(audio_segment, silence_thresh=-40):
+    """
+    Trim silence or quiet breath sounds from the start and end of a chunk.
+
+    Args:
+        audio_segment: pydub AudioSegment to trim.
+        silence_thresh: Level in dBFS below which audio is treated as silent.
+            Adjust it if the trim cuts off actual words.
+
+    Returns:
+        The slice running from the start of the first non-silent range to the
+        end of the last one (pauses in between are kept), or an empty
+        AudioSegment when no non-silent range is detected.
+    """
+    # Detect non-silent chunks
+    non_silent_ranges = silence.detect_nonsilent(
+        audio_segment,
+        min_silence_len=100,
+        silence_thresh=silence_thresh
+    )
+    # If audio is completely silent or empty, return empty
+    if not non_silent_ranges:
+        return AudioSegment.empty()
+    # Get start of first sound and end of last sound
+    start_trim = non_silent_ranges[0][0]
+    end_trim = non_silent_ranges[-1][1]
+    return audio_segment[start_trim:end_trim]
 # ===============================
 # MAIN TTS FUNCTION
 # ===============================
 def generate_tts(
     text,
     ref_audio=None,
+    exaggeration=0.4,
+    temperature=0.7,
     seed=0,
+    cfg_weight=0.6,
 ):
     model = get_model()
     if seed != 0:
         set_seed(int(seed))
+    kwargs = {
         "exaggeration": exaggeration,
         "temperature": temperature,
         "cfg_weight": cfg_weight,
     }
+    # --------------------------------
+    # Handle reference voice
+    # --------------------------------
     temp_prompt = None
     if ref_audio:
         try:
             audio = AudioSegment.from_file(ref_audio)
             temp_prompt = "voice_prompt.wav"
             audio.export(temp_prompt, format="wav")
+            kwargs["audio_prompt_path"] = temp_prompt
+        except:
+            print("Reference audio failed — using default voice.")
     # --------------------------------
     # Sentence chunking
     # --------------------------------
     sentences = sent_tokenize(text)
     chunks = []
     current = ""
     for s in sentences:
         if len(current) + len(s) < MAX_CHARS:
             current += " " + s
         else:
+            chunks.append(current.strip())
             current = s
     if current.strip():
         chunks.append(current.strip())
+    print(f"Total chunks: {len(chunks)}")
     # --------------------------------
     # Generate audio per chunk
     # --------------------------------
     final_audio = AudioSegment.empty()
     clean_pause = AudioSegment.silent(duration=SILENCE_MS)
     for i, chunk in enumerate(chunks):
+        print(f"Generating chunk {i+1}/{len(chunks)}")
+        # 1. Generate Raw Audio
+        wav = model.generate(chunk, **kwargs)
         wav_np = wav.squeeze(0).cpu().numpy()
         buffer = io.BytesIO()
         sf.write(buffer, wav_np, model.sr, format="WAV")
         buffer.seek(0)
         segment = AudioSegment.from_wav(buffer)
+        # 2. TRIM ARTIFACTS (The Fix)
+        # We strip the "trailing breath" or silence from the model output
+        # BEFORE we add our own clean silence.
+        segment = trim_audio_segment(segment, silence_thresh=-45)
+        # 3. Apply light fade only after trimming
+        if len(segment) > 0:
+            segment = segment.fade_in(FADE_IN).fade_out(FADE_OUT)
+            final_audio += segment + clean_pause
     # --------------------------------
     # Export
     # --------------------------------
+    output_path = "story_voice.mp3"
     final_audio.export(output_path, format="mp3", bitrate="192k")
     if temp_prompt and os.path.exists(temp_prompt):
         os.remove(temp_prompt)
     return output_path
 # ===============================
 # GRADIO UI
 # ===============================
 # Build the Gradio interface as a single vertical stack of inputs and one audio output.
+with gr.Blocks() as demo:
+    gr.Markdown("## 🎙️ Storyteller / Podcast Chatterbox TTS (Cleaned)")
+    text = gr.Textbox(
+        label="Story Text",
+        lines=12,
+        placeholder="Paste your full story here..."
+    )
+    ref = gr.Audio(
+        sources=["upload", "microphone"],
+        type="filepath",
+        label="Reference Voice (optional)"
+    )
     # Slider starting values mirror generate_tts's keyword defaults (0.4 / 0.7 / 0.6).
+    exaggeration = gr.Slider(0.25, 1.0, value=0.4, step=0.05, label="Emotion")
+    temperature = gr.Slider(0.3, 1.2, value=0.7, step=0.05, label="Variation")
+    cfg = gr.Slider(0.3, 1.0, value=0.6, step=0.05, label="Voice Stability")
+    seed = gr.Number(value=0, label="Seed (0 = random)")
+    btn = gr.Button("Generate Voice")
+    out = gr.Audio(label="Final Audio")
     # The inputs list is positional and must follow generate_tts's parameter order.
     btn.click(
         fn=generate_tts,
+        inputs=[text, ref, exaggeration, temperature, seed, cfg],
         outputs=out
     )
 demo.launch(share=True)