# Chatterbox / app.py
# Author: peterlllmm — "Update app.py" (commit 7f27076, verified)
import nltk

# NLTK >= 3.8.2 loads the "punkt_tab" resource for sent_tokenize, while older
# releases use "punkt". Fetch both (quietly, best-effort) so the app works
# across NLTK versions and with a pre-populated cache / no network.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.download(_resource, quiet=True)
    except Exception as exc:
        print(f"NLTK download of {_resource!r} failed: {exc}")

# Standard library
import io
import os
import random

# Third-party
import numpy as np
import torch
import soundfile as sf
from nltk.tokenize import sent_tokenize
from pydub import AudioSegment, silence  # silence: used to trim breaths/gaps
import gradio as gr

# Project
from chatterbox.src.chatterbox.tts import ChatterboxTTS
# ===============================
# DEVICE
# ===============================
# Pick the compute device once at import time; everything below reuses it.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Running on: {DEVICE}")
# ===============================
# LOAD MODEL ONCE
# ===============================
# Process-wide singleton; populated lazily by get_model().
MODEL = None


def get_model():
    """Return the shared ChatterboxTTS instance, loading it on first use."""
    global MODEL
    if MODEL is not None:
        return MODEL
    print("Loading Chatterbox model...")
    MODEL = ChatterboxTTS.from_pretrained(DEVICE)
    # Some model builds expose .to(); move weights to the target device if so.
    if hasattr(MODEL, "to"):
        MODEL.to(DEVICE)
    print("Model ready.")
    return MODEL


# Warm the model up at import time so the first request is fast.
get_model()
# ===============================
# SEED
# ===============================
def set_seed(seed):
    """Seed every RNG source (random, numpy, torch, CUDA) for reproducible output.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Query CUDA availability directly instead of relying on the module-level
    # DEVICE global — identical behavior, no hidden coupling.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
# ===============================
# PODCAST SAFE SETTINGS
# ===============================
# Tunables for sentence chunking and for stitching the per-chunk audio.
MAX_CHARS = 220  # max characters per TTS chunk fed to the model
SILENCE_MS = 250 # Reduced slightly since we are cleaning audio
FADE_IN = 10 # Reduced fade to avoid eating words
FADE_OUT = 10 # Reduced fade to avoid weird half-breath sounds
# ===============================
# HELPER: TRIM SILENCE/BREATHS
# ===============================
def trim_audio_segment(audio_segment, silence_thresh=-40):
    """Strip leading/trailing silence (or quiet breath sounds) from a chunk.

    Raise silence_thresh (dBFS) toward 0 if real words are being clipped.
    """
    voiced_ranges = silence.detect_nonsilent(
        audio_segment,
        min_silence_len=100,
        silence_thresh=silence_thresh,
    )

    # Entirely silent (or empty) input: return an empty segment.
    if not voiced_ranges:
        return AudioSegment.empty()

    # Keep everything from the first sound's start to the last sound's end.
    first_start = voiced_ranges[0][0]
    last_end = voiced_ranges[-1][1]
    return audio_segment[first_start:last_end]
# ===============================
# MAIN TTS FUNCTION
# ===============================
def _chunk_text(text, max_chars=MAX_CHARS):
    """Split *text* into sentence-aligned chunks of roughly max_chars each.

    Sentences are packed greedily; a chunk is flushed when adding the next
    sentence would reach max_chars. Empty accumulators are never emitted
    (fixes a bug where a long first sentence produced a leading "" chunk).
    """
    chunks = []
    current = ""
    for sentence in sent_tokenize(text):
        if len(current) + len(sentence) < max_chars:
            current += " " + sentence
        else:
            if current.strip():
                chunks.append(current.strip())
            current = sentence
    if current.strip():
        chunks.append(current.strip())
    return chunks


def generate_tts(
    text,
    ref_audio=None,
    exaggeration=0.4,
    temperature=0.7,
    seed=0,
    cfg_weight=0.6,
):
    """Synthesize *text* to an MP3 file and return its path.

    Args:
        text: Story text; split into sentence chunks before generation.
        ref_audio: Optional path to a reference voice clip for cloning.
        exaggeration: Emotion intensity passed to the model.
        temperature: Sampling temperature passed to the model.
        seed: RNG seed; 0 means "leave the generators unseeded" (random).
        cfg_weight: Classifier-free-guidance weight (voice stability).

    Returns:
        Path to the exported "story_voice.mp3".

    Raises:
        ValueError: If the text yields no speakable chunks.
    """
    model = get_model()
    if seed != 0:
        set_seed(int(seed))

    kwargs = {
        "exaggeration": exaggeration,
        "temperature": temperature,
        "cfg_weight": cfg_weight,
    }

    # --------------------------------
    # Handle reference voice
    # --------------------------------
    temp_prompt = None
    if ref_audio:
        try:
            audio = AudioSegment.from_file(ref_audio)
            temp_prompt = "voice_prompt.wav"
            audio.export(temp_prompt, format="wav")
            kwargs["audio_prompt_path"] = temp_prompt
        except Exception:
            # Deliberate best-effort: a bad reference clip falls back to the
            # default voice instead of failing the whole request.
            print("Reference audio failed — using default voice.")

    try:
        # --------------------------------
        # Sentence chunking
        # --------------------------------
        chunks = _chunk_text(text)
        print(f"Total chunks: {len(chunks)}")
        if not chunks:
            raise ValueError("No text to synthesize — please enter a story.")

        # --------------------------------
        # Generate audio per chunk
        # --------------------------------
        final_audio = AudioSegment.empty()
        clean_pause = AudioSegment.silent(duration=SILENCE_MS)

        for i, chunk in enumerate(chunks):
            print(f"Generating chunk {i+1}/{len(chunks)}")

            # 1. Generate raw audio and round-trip it through an in-memory
            #    WAV buffer to get a pydub segment.
            wav = model.generate(chunk, **kwargs)
            wav_np = wav.squeeze(0).cpu().numpy()
            buffer = io.BytesIO()
            sf.write(buffer, wav_np, model.sr, format="WAV")
            buffer.seek(0)
            segment = AudioSegment.from_wav(buffer)

            # 2. Strip the model's trailing breath/silence BEFORE adding our
            #    own clean pause.
            segment = trim_audio_segment(segment, silence_thresh=-45)

            # 3. Light fades only after trimming, and only if audio remains.
            if len(segment) > 0:
                segment = segment.fade_in(FADE_IN).fade_out(FADE_OUT)
            final_audio += segment + clean_pause

        # --------------------------------
        # Export
        # --------------------------------
        output_path = "story_voice.mp3"
        final_audio.export(output_path, format="mp3", bitrate="192k")
        return output_path
    finally:
        # Always remove the temporary voice prompt, even if generation failed
        # (previously it leaked on any exception).
        if temp_prompt and os.path.exists(temp_prompt):
            os.remove(temp_prompt)
# ===============================
# GRADIO UI
# ===============================
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Storyteller / Podcast Chatterbox TTS (Cleaned)")

    # Inputs
    text = gr.Textbox(
        label="Story Text",
        lines=12,
        placeholder="Paste your full story here...",
    )
    ref = gr.Audio(
        sources=["upload", "microphone"],
        type="filepath",
        label="Reference Voice (optional)",
    )
    exaggeration = gr.Slider(0.25, 1.0, value=0.4, step=0.05, label="Emotion")
    temperature = gr.Slider(0.3, 1.2, value=0.7, step=0.05, label="Variation")
    cfg = gr.Slider(0.3, 1.0, value=0.6, step=0.05, label="Voice Stability")
    seed = gr.Number(value=0, label="Seed (0 = random)")

    # Action + output
    btn = gr.Button("Generate Voice")
    out = gr.Audio(label="Final Audio")

    # NOTE: input order must match generate_tts's positional parameters
    # (text, ref_audio, exaggeration, temperature, seed, cfg_weight).
    btn.click(
        fn=generate_tts,
        inputs=[text, ref, exaggeration, temperature, seed, cfg],
        outputs=out,
    )

demo.launch(share=True)