Spaces:
Runtime error
Runtime error
File size: 5,807 Bytes
fb861cb 8196470 fb861cb fe83d6b 9d593b2 fb861cb 8196470 fb861cb 7f27076 8196470 9d593b2 8196470 ecc440e 8196470 0b3e025 8196470 0b3e025 8196470 0b3e025 8196470 0b3e025 8196470 9d593b2 8196470 9d593b2 ecc440e 9d593b2 8196470 7f27076 8196470 af25078 fb861cb 8196470 c2c369a 8196470 c2c369a 8196470 c2c369a 8196470 7f27076 8196470 7f27076 8196470 fb861cb 8196470 fb861cb c2c369a 8196470 fb861cb 7f27076 9d593b2 8196470 c2c369a 8196470 c2c369a 8196470 c2c369a 8196470 c2c369a 7f27076 8196470 c2c369a 8196470 9d593b2 fb861cb 7f27076 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
import nltk
nltk.download("punkt")
import random
import numpy as np
import torch
import io
import os
import soundfile as sf
from nltk.tokenize import sent_tokenize
from pydub import AudioSegment, silence # Added silence module
import gradio as gr
from chatterbox.src.chatterbox.tts import ChatterboxTTS
# ===============================
# DEVICE
# ===============================
# Prefer GPU when one is visible to torch; everything downstream uses this.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on: {DEVICE}")
# ===============================
# LOAD MODEL ONCE
# ===============================
# Process-wide singleton; populated by the first get_model() call.
MODEL = None


def get_model():
    """Return the shared ChatterboxTTS instance, loading it on first use."""
    global MODEL
    if MODEL is not None:
        return MODEL
    print("Loading Chatterbox model...")
    model = ChatterboxTTS.from_pretrained(DEVICE)
    # Some model versions expose .to(); move to the target device when possible.
    if hasattr(model, "to"):
        model.to(DEVICE)
    print("Model ready.")
    MODEL = model
    return MODEL


# Warm the model at import time so the first request doesn't pay the load cost.
get_model()
# ===============================
# SEED
# ===============================
def set_seed(seed):
    """Seed all RNGs (torch, CUDA, random, numpy) for reproducible generation.

    Args:
        seed: Integer seed applied to every random source the pipeline uses.
    """
    torch.manual_seed(seed)
    # Query torch directly instead of the module-global DEVICE string; they are
    # equivalent, and this keeps the function self-contained.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
# ===============================
# PODCAST SAFE SETTINGS
# ===============================
MAX_CHARS = 220   # max characters per TTS chunk; short chunks keep output stable
SILENCE_MS = 250  # pause (ms) inserted between chunks; chunks are pre-trimmed
FADE_IN = 10      # fade-in (ms); kept short so word onsets aren't eaten
FADE_OUT = 10     # fade-out (ms); kept short to avoid half-breath tails
# ===============================
# HELPER: TRIM SILENCE/BREATHS
# ===============================
def trim_audio_segment(audio_segment, silence_thresh=-40):
    """Strip leading/trailing silence (breaths, pauses) from an AudioSegment.

    Args:
        audio_segment: pydub AudioSegment to trim.
        silence_thresh: Loudness floor in dBFS; raise toward 0 if it clips words.

    Returns:
        The trimmed AudioSegment, or an empty segment if nothing is audible.
    """
    spans = silence.detect_nonsilent(
        audio_segment,
        min_silence_len=100,
        silence_thresh=silence_thresh,
    )
    # No span above the threshold means the whole chunk is effectively silent.
    if not spans:
        return AudioSegment.empty()
    # Keep everything from the first audible onset to the last audible offset.
    first_start = spans[0][0]
    last_end = spans[-1][1]
    return audio_segment[first_start:last_end]
# ===============================
# MAIN TTS FUNCTION
# ===============================
def generate_tts(
    text,
    ref_audio=None,
    exaggeration=0.4,
    temperature=0.7,
    seed=0,
    cfg_weight=0.6,
):
    """Synthesize `text` into a single MP3, chunking by sentence.

    Args:
        text: Full story text; split into <= MAX_CHARS sentence chunks.
        ref_audio: Optional path to a reference voice clip (any ffmpeg format).
        exaggeration: Emotion intensity passed to the model.
        temperature: Sampling temperature passed to the model.
        seed: 0 means random; any other value seeds all RNGs for reproducibility.
        cfg_weight: Classifier-free-guidance weight (voice stability).

    Returns:
        Path to the exported MP3 ("story_voice.mp3").
    """
    model = get_model()
    if seed != 0:
        set_seed(int(seed))

    kwargs = {
        "exaggeration": exaggeration,
        "temperature": temperature,
        "cfg_weight": cfg_weight,
    }

    # --------------------------------
    # Handle reference voice (best-effort: fall back to default on failure)
    # --------------------------------
    temp_prompt = None
    if ref_audio:
        try:
            audio = AudioSegment.from_file(ref_audio)
            temp_prompt = "voice_prompt.wav"
            audio.export(temp_prompt, format="wav")
            kwargs["audio_prompt_path"] = temp_prompt
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit. Keep the deliberate best-effort fallback, but narrow it.
        except Exception:
            print("Reference audio failed — using default voice.")
            # FIX: forget the half-created temp file so we don't pass a bad
            # prompt path or try to clean up something that may not exist.
            temp_prompt = None
            kwargs.pop("audio_prompt_path", None)

    # --------------------------------
    # Sentence chunking: greedily pack sentences up to MAX_CHARS per chunk.
    # --------------------------------
    sentences = sent_tokenize(text)
    chunks = []
    current = ""
    for s in sentences:
        if len(current) + len(s) < MAX_CHARS:
            current += " " + s
        else:
            # FIX: when the very first sentence is >= MAX_CHARS, `current` is
            # still empty here; the original appended an empty chunk and later
            # fed "" to model.generate. Only keep non-empty chunks.
            if current.strip():
                chunks.append(current.strip())
            current = s
    if current.strip():
        chunks.append(current.strip())
    print(f"Total chunks: {len(chunks)}")

    # --------------------------------
    # Generate audio per chunk
    # --------------------------------
    final_audio = AudioSegment.empty()
    clean_pause = AudioSegment.silent(duration=SILENCE_MS)
    for i, chunk in enumerate(chunks):
        print(f"Generating chunk {i+1}/{len(chunks)}")
        # 1. Generate raw audio and round-trip through an in-memory WAV so
        #    pydub can operate on it.
        wav = model.generate(chunk, **kwargs)
        wav_np = wav.squeeze(0).cpu().numpy()
        buffer = io.BytesIO()
        sf.write(buffer, wav_np, model.sr, format="WAV")
        buffer.seek(0)
        segment = AudioSegment.from_wav(buffer)
        # 2. Trim the model's trailing breath/silence BEFORE adding our own
        #    clean pause (slightly stricter threshold than the helper default).
        segment = trim_audio_segment(segment, silence_thresh=-45)
        # 3. Apply a light fade only after trimming, and only if audio remains.
        if len(segment) > 0:
            segment = segment.fade_in(FADE_IN).fade_out(FADE_OUT)
        final_audio += segment + clean_pause

    # --------------------------------
    # Export
    # --------------------------------
    output_path = "story_voice.mp3"
    final_audio.export(output_path, format="mp3", bitrate="192k")

    # Clean up the temporary reference-voice file if we created one.
    if temp_prompt and os.path.exists(temp_prompt):
        os.remove(temp_prompt)
    return output_path
# ===============================
# GRADIO UI
# ===============================
# ===============================
# GRADIO UI
# ===============================
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Storyteller / Podcast Chatterbox TTS (Cleaned)")

    # Inputs (creation order defines the on-page layout).
    story_text = gr.Textbox(
        label="Story Text",
        lines=12,
        placeholder="Paste your full story here...",
    )
    reference_voice = gr.Audio(
        sources=["upload", "microphone"],
        type="filepath",
        label="Reference Voice (optional)",
    )
    emotion_slider = gr.Slider(0.25, 1.0, value=0.4, step=0.05, label="Emotion")
    variation_slider = gr.Slider(0.3, 1.2, value=0.7, step=0.05, label="Variation")
    stability_slider = gr.Slider(0.3, 1.0, value=0.6, step=0.05, label="Voice Stability")
    seed_input = gr.Number(value=0, label="Seed (0 = random)")

    # Action + output.
    generate_btn = gr.Button("Generate Voice")
    audio_output = gr.Audio(label="Final Audio")

    # Input order must match generate_tts(text, ref_audio, exaggeration,
    # temperature, seed, cfg_weight).
    generate_btn.click(
        fn=generate_tts,
        inputs=[
            story_text,
            reference_voice,
            emotion_slider,
            variation_slider,
            seed_input,
            stability_slider,
        ],
        outputs=audio_output,
    )

demo.launch(share=True)