"""Storyteller / Podcast TTS app built on Chatterbox, with per-chunk
silence trimming and a Gradio front end."""

import io
import os
import random

import gradio as gr
import nltk
import numpy as np
import soundfile as sf
import torch
from nltk.tokenize import sent_tokenize
from pydub import AudioSegment, silence  # silence module used for trimming

from chatterbox.src.chatterbox.tts import ChatterboxTTS

# Fetch the sentence tokenizer data only if it is missing, so repeated
# runs do not hit the network every time.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

# ===============================
# DEVICE
# ===============================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on: {DEVICE}")

# ===============================
# LOAD MODEL ONCE
# ===============================
MODEL = None


def get_model():
    """Lazily load the Chatterbox TTS model as a process-wide singleton.

    Returns:
        The loaded ChatterboxTTS model, moved to DEVICE when supported.
    """
    global MODEL
    if MODEL is None:
        print("Loading Chatterbox model...")
        MODEL = ChatterboxTTS.from_pretrained(DEVICE)
        # Some model versions expose .to(); guard so older builds still work.
        if hasattr(MODEL, "to"):
            MODEL.to(DEVICE)
        print("Model ready.")
    return MODEL


# Warm the model at import time so the first request is fast.
get_model()


# ===============================
# SEED
# ===============================
def set_seed(seed):
    """Seed torch, CUDA, random and numpy for reproducible generation."""
    torch.manual_seed(seed)
    if DEVICE == "cuda":
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)


# ===============================
# PODCAST SAFE SETTINGS
# ===============================
MAX_CHARS = 220   # Soft cap on characters per generated chunk
SILENCE_MS = 250  # Reduced slightly since we are cleaning audio
FADE_IN = 10      # Reduced fade to avoid eating words
FADE_OUT = 10     # Reduced fade to avoid weird half-breath sounds


# ===============================
# HELPER: TRIM SILENCE/BREATHS
# ===============================
def trim_audio_segment(audio_segment, silence_thresh=-40):
    """Trim silence or quiet breath sounds from both ends of a chunk.

    Args:
        audio_segment: pydub AudioSegment to trim.
        silence_thresh: dBFS threshold below which audio counts as silence.
            Adjust upward (toward 0) if it cuts off actual words.

    Returns:
        The trimmed AudioSegment, or an empty segment if the input is
        entirely silent.
    """
    non_silent_ranges = silence.detect_nonsilent(
        audio_segment,
        min_silence_len=100,
        silence_thresh=silence_thresh,
    )
    # Completely silent (or empty) input: nothing to keep.
    if not non_silent_ranges:
        return AudioSegment.empty()
    # Keep everything between the first and last detected sound.
    start_trim = non_silent_ranges[0][0]
    end_trim = non_silent_ranges[-1][1]
    return audio_segment[start_trim:end_trim]


# ===============================
# MAIN TTS FUNCTION
# ===============================
def generate_tts(
    text,
    ref_audio=None,
    exaggeration=0.4,
    temperature=0.7,
    seed=0,
    cfg_weight=0.6,
):
    """Synthesize `text` chunk-by-chunk and export a single MP3.

    Args:
        text: Full story text; split into sentence-based chunks <= MAX_CHARS.
        ref_audio: Optional path to a reference voice clip for cloning.
        exaggeration: Emotion intensity passed to the model.
        temperature: Sampling temperature (variation).
        seed: 0 means random; any other value seeds all RNGs.
        cfg_weight: Classifier-free-guidance weight (voice stability).

    Returns:
        Path to the exported MP3 file ("story_voice.mp3").
    """
    model = get_model()

    if seed != 0:
        set_seed(int(seed))

    kwargs = {
        "exaggeration": exaggeration,
        "temperature": temperature,
        "cfg_weight": cfg_weight,
    }

    # --------------------------------
    # Handle reference voice
    # --------------------------------
    temp_prompt = None
    if ref_audio:
        try:
            audio = AudioSegment.from_file(ref_audio)
            temp_prompt = "voice_prompt.wav"
            audio.export(temp_prompt, format="wav")
            kwargs["audio_prompt_path"] = temp_prompt
        except Exception:
            # Best-effort: fall back to the default voice rather than fail.
            print("Reference audio failed — using default voice.")

    # --------------------------------
    # Sentence chunking
    # --------------------------------
    sentences = sent_tokenize(text)
    chunks = []
    current = ""
    for s in sentences:
        if len(current) + len(s) < MAX_CHARS:
            current += " " + s
        else:
            # Guard against emitting an empty chunk when the very first
            # sentence already exceeds MAX_CHARS.
            if current.strip():
                chunks.append(current.strip())
            current = s
    if current.strip():
        chunks.append(current.strip())

    print(f"Total chunks: {len(chunks)}")

    # --------------------------------
    # Generate audio per chunk
    # --------------------------------
    final_audio = AudioSegment.empty()
    clean_pause = AudioSegment.silent(duration=SILENCE_MS)

    for i, chunk in enumerate(chunks):
        print(f"Generating chunk {i+1}/{len(chunks)}")

        # 1. Generate raw audio and round-trip it through an in-memory WAV
        #    so pydub can operate on it.
        wav = model.generate(chunk, **kwargs)
        wav_np = wav.squeeze(0).cpu().numpy()
        buffer = io.BytesIO()
        sf.write(buffer, wav_np, model.sr, format="WAV")
        buffer.seek(0)
        segment = AudioSegment.from_wav(buffer)

        # 2. TRIM ARTIFACTS (The Fix)
        # We strip the "trailing breath" or silence from the model output
        # BEFORE we add our own clean silence.
        segment = trim_audio_segment(segment, silence_thresh=-45)

        # 3. Apply light fade only after trimming
        if len(segment) > 0:
            segment = segment.fade_in(FADE_IN).fade_out(FADE_OUT)

        final_audio += segment + clean_pause

    # --------------------------------
    # Export
    # --------------------------------
    output_path = "story_voice.mp3"
    final_audio.export(output_path, format="mp3", bitrate="192k")

    # Clean up the temporary voice-prompt file, if one was written.
    if temp_prompt and os.path.exists(temp_prompt):
        os.remove(temp_prompt)

    return output_path


# ===============================
# GRADIO UI
# ===============================
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Storyteller / Podcast Chatterbox TTS (Cleaned)")

    text = gr.Textbox(
        label="Story Text",
        lines=12,
        placeholder="Paste your full story here..."
    )

    ref = gr.Audio(
        sources=["upload", "microphone"],
        type="filepath",
        label="Reference Voice (optional)"
    )

    exaggeration = gr.Slider(0.25, 1.0, value=0.4, step=0.05, label="Emotion")
    temperature = gr.Slider(0.3, 1.2, value=0.7, step=0.05, label="Variation")
    cfg = gr.Slider(0.3, 1.0, value=0.6, step=0.05, label="Voice Stability")
    seed = gr.Number(value=0, label="Seed (0 = random)")

    btn = gr.Button("Generate Voice")
    out = gr.Audio(label="Final Audio")

    btn.click(
        fn=generate_tts,
        inputs=[text, ref, exaggeration, temperature, seed, cfg],
        outputs=out
    )

if __name__ == "__main__":
    demo.launch(share=True)