Spaces:
Sleeping
Sleeping
File size: 6,223 Bytes
9d593b2 0b3e025 0a2ade0 6fc220a 9d593b2 0b3e025 6fc220a 0b3e025 9386371 0b3e025 6fc220a 0b3e025 6fc220a 0b3e025 9386371 0b3e025 5329297 6fc220a 9d593b2 0b3e025 3dab9c0 9d593b2 0a2ade0 5329297 0a2ade0 5329297 0a2ade0 5329297 0a2ade0 6fc220a 5329297 6fc220a 5329297 6fc220a 9386371 af25078 1afd111 6fc220a 0b3e025 6fc220a 0b3e025 9386371 0b3e025 6fc220a 5329297 6fc220a 5329297 6fc220a 1afd111 af25078 1afd111 af25078 1afd111 6fc220a 5329297 6fc220a 0a2ade0 6fc220a 5329297 6fc220a 5329297 3dab9c0 6fc220a 5329297 0a2ade0 6fc220a 5329297 6fc220a 9d593b2 9386371 6fc220a 0a2ade0 9386371 6fc220a 9d593b2 6fc220a 9386371 6fc220a 9386371 6fc220a 9386371 bf4bbc3 9386371 58ffee2 6fc220a 9d593b2 6fc220a 9386371 1afd111 9d593b2 6fc220a 0a2ade0 6fc220a 0a2ade0 6fc220a 0a2ade0 6fc220a 9d593b2 5329297 9d593b2 9386371 9d593b2 58ffee2 1afd111 6fc220a 0a2ade0 9d593b2 3b42160 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
import random
import numpy as np
import torch
import gradio as gr
import spaces
import re
from chatterbox.src.chatterbox.tts import ChatterboxTTS
# Choose the compute device once at import time: the GPU when torch reports
# CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"🚀 Running on device: {DEVICE}")
# ---------------------------------------
# GLOBAL MODEL LOAD
# ---------------------------------------
# Process-wide cache for the TTS model; populated by get_or_load_model().
MODEL = None


def get_or_load_model():
    """Return the shared ChatterboxTTS instance, creating it on first use.

    The instance is cached in the module-level ``MODEL`` so that later calls
    return the same object without reloading.

    Returns:
        The cached model object.

    Raises:
        Exception: Whatever the load raised; it is printed and re-raised
            unchanged.
    """
    global MODEL

    # Fast path: a previous call already loaded the model.
    if MODEL is not None:
        return MODEL

    print("Model not loaded, initializing...")
    try:
        MODEL = ChatterboxTTS.from_pretrained(DEVICE)
        # Only move the model when it exposes ``to`` and reports a different
        # device string than the one selected at import time.
        needs_move = hasattr(MODEL, "to") and str(MODEL.device) != DEVICE
        if needs_move:
            MODEL.to(DEVICE)
        print("Model loaded successfully.")
    except Exception as load_error:
        print(f"Error loading model: {load_error}")
        raise
    return MODEL
# Try to load the model eagerly at import time. This is best-effort: a failure
# is only reported here, and the app keeps starting; request handlers call
# get_or_load_model() again themselves.
try:
    get_or_load_model()
except Exception as startup_error:
    print(f"CRITICAL startup load failed: {startup_error}")
# ---------------------------------------
# UTILITIES
# ---------------------------------------
def set_seed(seed: int):
    """Seed every random number generator this app uses.

    Seeds torch (and the CUDA generators when CUDA is available), Python's
    ``random`` module and NumPy's global legacy generator.

    Args:
        seed: Any integer. NumPy's ``np.random.seed`` only accepts values in
            ``[0, 2**32 - 1]`` and raises ``ValueError`` otherwise, so the
            value is reduced modulo ``2**32`` for the NumPy call only; torch
            and ``random`` receive the seed unchanged.
    """
    seed = int(seed)
    torch.manual_seed(seed)
    # Same condition the module-level DEVICE constant is derived from.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    # Keep user-supplied seeds (negative or >= 2**32) from crashing NumPy.
    np.random.seed(seed % (2**32))
# --- SMART CHUNKING ---
def smart_chunk_text(text: str, chunk_size: int):
    """Group the sentences of *text* into chunks of at most ``chunk_size`` chars.

    The text is split after sentence-ending punctuation (``. ! ? … ;``)
    followed by whitespace. Consecutive sentences are joined with a single
    space for as long as the joined length stays within ``chunk_size``.
    Sentences are never split, so a single sentence longer than
    ``chunk_size`` becomes a chunk of its own.

    Args:
        text: The text to split.
        chunk_size: Maximum length, in characters, of a multi-sentence chunk.

    Returns:
        A list of chunk strings without leading/trailing whitespace. Empty
        sentence pieces (e.g. from trailing whitespace) are dropped, so no
        empty chunk is produced alongside real content. Blank input yields a
        single empty chunk, so callers always receive at least one chunk.
    """
    stripped = text.strip()
    # Drop empty pieces so they can never be flushed as a chunk of their own.
    sentences = [
        piece for piece in re.split(r"(?<=[\.\!\?…;])\s+", stripped) if piece
    ]
    if not sentences:
        return [stripped]

    chunks = []
    pending = []      # sentences collected for the chunk being built
    pending_len = 0   # length of " ".join(pending)
    for sentence in sentences:
        if pending:
            # +1 for the space that would join this sentence to the chunk.
            joined_len = pending_len + 1 + len(sentence)
        else:
            joined_len = len(sentence)

        if pending and joined_len > chunk_size:
            chunks.append(" ".join(pending))
            pending = [sentence]
            pending_len = len(sentence)
        else:
            pending.append(sentence)
            pending_len = joined_len

    chunks.append(" ".join(pending))
    return chunks
def concat_audio(chunks):
    """Join audio arrays end to end along their last axis.

    Args:
        chunks: Sequence of NumPy arrays to concatenate.

    Returns:
        The concatenated array, or ``None`` when ``chunks`` is empty or falsy.
    """
    return np.concatenate(chunks, axis=-1) if chunks else None
# ---------------------------------------
# MAIN TTS FUNCTION
# ---------------------------------------
@spaces.GPU
def generate_tts_audio(
    text_input: str,
    audio_prompt_path_input: str = None,
    exaggeration_input: float = 0.5,
    temperature_input: float = 0.8,
    seed_num_input: int = 0,
    cfgw_input: float = 0.5,
    vad_trim_input: bool = False,
    enable_chunking: bool = False,
    chunk_size_value: int = 250,
):
    """Synthesize speech for ``text_input`` and report the seed that was used.

    Args:
        text_input: Text to synthesize. ``None`` is treated as empty text.
        audio_prompt_path_input: Optional path to a reference audio file;
            forwarded as ``audio_prompt_path`` only when truthy.
        exaggeration_input: Forwarded to the model as ``exaggeration``.
        temperature_input: Forwarded to the model as ``temperature``.
        seed_num_input: Seed to use. ``0``, ``None`` (e.g. a cleared number
            field) or any value that truncates to ``0`` selects a random seed.
        cfgw_input: Forwarded to the model as ``cfg_weight``.
        vad_trim_input: Forwarded to the model as ``vad_trim``.
        enable_chunking: When true, the text is split with
            ``smart_chunk_text`` and each chunk is rendered separately.
        chunk_size_value: Chunk size in characters used when chunking.

    Returns:
        ``((sample_rate, audio), used_seed)`` where ``audio`` is the NumPy
        array of all rendered chunks concatenated along the last axis.

    Raises:
        RuntimeError: If no model is available.
    """
    current_model = get_or_load_model()
    if current_model is None:
        raise RuntimeError("TTS model is not loaded.")

    text_input = text_input or ""

    # -------------------------
    # SEED HANDLING
    # -------------------------
    # UI number fields can deliver floats or None; normalise to an int first so
    # that "no seed" and values that truncate to 0 both take the random path.
    requested_seed = 0 if seed_num_input is None else int(seed_num_input)
    if requested_seed == 0:
        used_seed = random.randint(1, 2**31 - 1)
    else:
        used_seed = requested_seed

    print(f"Using seed: {used_seed}")
    set_seed(used_seed)

    print(f"Generating audio for text (preview): '{text_input[:50]}...'")

    generate_kwargs = {
        "exaggeration": exaggeration_input,
        "temperature": temperature_input,
        "cfg_weight": cfgw_input,
        "vad_trim": vad_trim_input,
    }
    if audio_prompt_path_input:
        generate_kwargs["audio_prompt_path"] = audio_prompt_path_input

    # -------------------------
    # SMART CHUNK PROCESSING
    # -------------------------
    if enable_chunking:
        print(f"Smart chunking enabled — chunk size = {chunk_size_value}")
        text_chunks = smart_chunk_text(text_input, int(chunk_size_value))
        # Never render blank chunks; if nothing is left, fall back to a single
        # call with the raw text so exactly one segment is always produced.
        text_chunks = [c for c in text_chunks if c.strip()] or [text_input]
    else:
        text_chunks = [text_input]

    audio_segments = []
    for i, chunk in enumerate(text_chunks):
        print(f"Rendering chunk {i+1}/{len(text_chunks)}...")
        wav = current_model.generate(chunk, **generate_kwargs)
        # generate() is assumed to return a torch tensor with a leading batch
        # axis (TODO confirm); detach()/cpu() keep the NumPy conversion valid
        # even if the tensor lives on the GPU or tracks gradients.
        audio_segments.append(wav.squeeze(0).detach().cpu().numpy())

    final_audio = concat_audio(audio_segments)
    print("Audio generation complete.")

    # FIXED OUTPUT FORMAT (Gradio-compatible)
    return (current_model.sr, final_audio), used_seed
# ---------------------------------------
# UI
# ---------------------------------------
# Build the Gradio interface: input controls in the left column, the generated
# audio in the right column, and a button that runs generate_tts_audio.
# NOTE(review): the nesting below (seed display and chunking controls inside
# the "More options" accordion, button below it in the same column) is an
# assumption about the intended layout — confirm.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# Chatterbox TTS Demo — Enhanced Version
Supports unlimited text, smart chunking & random seed viewer.
"""
    )
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                value="Now let's make my mum's favourite...",
                label="Text to synthesize",
                max_lines=10
            )
            # Optional reference audio, handed to the handler as a file path.
            ref_wav = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Reference Audio File (Optional)",
                value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
            )
            exaggeration = gr.Slider(0.25, 2, step=.05, label="Exaggeration", value=.5)
            cfg_weight = gr.Slider(0.2, 1, step=.05, label="CFG/Pace", value=0.5)
            with gr.Accordion("More options", open=False):
                seed_num = gr.Number(value=0, label="Random seed (0 = random)")
                # Read-only field; filled from the handler's second return value.
                seed_display = gr.Textbox(
                    value="",
                    label="Seed Used (auto-filled)",
                    interactive=False
                )
                temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
                vad_trim = gr.Checkbox(label="Ref VAD trimming", value=False)
                enable_chunking = gr.Checkbox(
                    label="Enable Smart Text Chunking",
                    value=False
                )
                chunk_size = gr.Slider(
                    minimum=100,
                    maximum=2000,
                    value=250,
                    step=10,
                    label="Chunk Size (characters)"
                )
            run_btn = gr.Button("Generate", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Output Audio")
    # CONNECT BUTTON
    # The inputs are passed positionally, so their order must match the
    # parameter order of generate_tts_audio; the two outputs receive its
    # (audio, seed) return values in that order.
    run_btn.click(
        fn=generate_tts_audio,
        inputs=[
            text,
            ref_wav,
            exaggeration,
            temp,
            seed_num,
            cfg_weight,
            vad_trim,
            enable_chunking,
            chunk_size,
        ],
        outputs=[
            audio_output,
            seed_display,
        ],
    )
# Start the app with the MCP server and a public share link requested.
demo.launch(mcp_server=True, share=True)
|