Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,339 +1,236 @@
|
|
| 1 |
-
import gradio as gr
|
| 2 |
import os
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import tempfile
|
| 5 |
-
from
|
| 6 |
-
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
|
| 10 |
-
engine = VoiceCloningEngine()
|
| 11 |
-
print("Engine ready!")
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
def
|
| 15 |
-
text: str,
|
| 16 |
-
reference_audio,
|
| 17 |
-
exaggeration: float,
|
| 18 |
-
cfg: float,
|
| 19 |
-
seed: int,
|
| 20 |
-
max_words_per_chunk: int,
|
| 21 |
-
use_seed: bool,
|
| 22 |
-
language: str
|
| 23 |
-
):
|
| 24 |
"""
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
Args:
|
| 28 |
-
text: Text to synthesize
|
| 29 |
-
reference_audio: Uploaded reference audio file
|
| 30 |
-
exaggeration: Emotion exaggeration (0.0-1.0+)
|
| 31 |
-
cfg: Classifier-Free Guidance weight (0.0-1.0)
|
| 32 |
-
seed: Random seed
|
| 33 |
-
max_words_per_chunk: Max words per chunk
|
| 34 |
-
use_seed: Whether to use the seed value
|
| 35 |
-
language: Language code for multilingual model
|
| 36 |
-
|
| 37 |
-
Returns:
|
| 38 |
-
Tuple of (audio_path, info_text)
|
| 39 |
"""
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
if isinstance(reference_audio, str):
|
| 50 |
-
ref_audio_path = reference_audio
|
| 51 |
else:
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
print(status)
|
| 74 |
-
|
| 75 |
-
# Generate audio
|
| 76 |
-
output_path = engine.generate_speech(
|
| 77 |
-
text=text,
|
| 78 |
-
reference_audio_path=ref_audio_path,
|
| 79 |
-
exaggeration=exaggeration,
|
| 80 |
-
cfg=cfg,
|
| 81 |
-
seed=actual_seed,
|
| 82 |
-
max_words_per_chunk=max_words_per_chunk,
|
| 83 |
-
language=language
|
| 84 |
-
)
|
| 85 |
-
|
| 86 |
-
# Get duration
|
| 87 |
-
duration = engine.get_audio_duration(output_path)
|
| 88 |
-
|
| 89 |
-
# Success message
|
| 90 |
-
success_msg = f"✅ **Generation Complete!**\n\n"
|
| 91 |
-
success_msg += f"📊 Audio Duration: {duration:.2f} seconds\n"
|
| 92 |
-
success_msg += f"📝 Words Synthesized: {word_count}\n"
|
| 93 |
-
success_msg += f"⚡ Speed: {word_count/duration:.1f} words/second\n"
|
| 94 |
-
success_msg += f"\n💧 *Audio includes Perth watermark for authentication*"
|
| 95 |
-
|
| 96 |
-
return output_path, success_msg
|
| 97 |
|
| 98 |
-
|
| 99 |
-
error_msg = f"❌ **Error during generation:**\n\n{str(e)}"
|
| 100 |
-
print(error_msg)
|
| 101 |
-
return None, error_msg
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
def update_seed_visibility(use_seed):
|
| 105 |
-
"""Toggle seed input visibility"""
|
| 106 |
-
return gr.update(visible=use_seed)
|
| 107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
-
def
|
| 110 |
-
"""
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
return f"📦 Chunks: 1 (Text is within limit)"
|
| 117 |
-
else:
|
| 118 |
-
num_chunks = (word_count // max_words) + 1
|
| 119 |
-
return f"📦 Estimated Chunks: {num_chunks}"
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
# Create Gradio interface
|
| 123 |
-
with gr.Blocks(
|
| 124 |
-
title="🎙️ Chatterbox TTS Voice Cloning",
|
| 125 |
-
theme=gr.themes.Soft(
|
| 126 |
-
primary_hue="blue",
|
| 127 |
-
secondary_hue="slate",
|
| 128 |
-
)
|
| 129 |
-
) as app:
|
| 130 |
|
| 131 |
-
|
| 132 |
-
""
|
| 133 |
-
# 🎙️ Resemble AI Chatterbox Voice Cloning
|
| 134 |
-
|
| 135 |
-
Clone any voice by providing a reference audio sample! Powered by **Chatterbox Turbo** - the state-of-the-art, open-source TTS model.
|
| 136 |
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
- ⚡ **Auto-chunking**: Long texts automatically split for better quality
|
| 147 |
-
- 💧 **Perth Watermark**: All outputs include imperceptible authentication watermark
|
| 148 |
|
| 149 |
-
|
| 150 |
-
"""
|
| 151 |
-
)
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
gr.Markdown("### 🎵 Reference Audio")
|
| 157 |
-
reference_audio = gr.Audio(
|
| 158 |
-
label="Upload Reference Audio",
|
| 159 |
-
type="filepath",
|
| 160 |
-
sources=["upload", "microphone"]
|
| 161 |
-
)
|
| 162 |
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
)
|
| 168 |
|
| 169 |
-
#
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
label="Enter Text",
|
| 173 |
-
placeholder="Type or paste the text you want to convert to speech...",
|
| 174 |
-
lines=8,
|
| 175 |
-
max_lines=20
|
| 176 |
-
)
|
| 177 |
|
| 178 |
-
|
| 179 |
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
)
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
value=0.5,
|
| 198 |
-
step=0.1,
|
| 199 |
-
label="CFG (Classifier-Free Guidance)",
|
| 200 |
-
info="Lower values for faster speech, higher for more deliberate pacing"
|
| 201 |
)
|
| 202 |
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
choices=[
|
| 215 |
-
"en", "es", "fr", "de", "it", "pt", "ru", "zh",
|
| 216 |
-
"ja", "ko", "ar", "hi", "nl", "pl", "tr", "sv",
|
| 217 |
-
"no", "da", "fi", "el", "he", "ms", "sw"
|
| 218 |
-
],
|
| 219 |
-
value="en",
|
| 220 |
-
label="Language",
|
| 221 |
-
info="For multilingual model (English by default)"
|
| 222 |
-
)
|
| 223 |
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
# Event handlers
|
| 268 |
-
use_seed.change(
|
| 269 |
-
fn=update_seed_visibility,
|
| 270 |
-
inputs=use_seed,
|
| 271 |
-
outputs=seed_input
|
| 272 |
-
)
|
| 273 |
-
|
| 274 |
-
text_input.change(
|
| 275 |
-
fn=estimate_chunks,
|
| 276 |
-
inputs=[text_input, max_words_per_chunk],
|
| 277 |
-
outputs=chunk_estimate
|
| 278 |
-
)
|
| 279 |
-
|
| 280 |
-
max_words_per_chunk.change(
|
| 281 |
-
fn=estimate_chunks,
|
| 282 |
-
inputs=[text_input, max_words_per_chunk],
|
| 283 |
-
outputs=chunk_estimate
|
| 284 |
-
)
|
| 285 |
-
|
| 286 |
-
generate_btn.click(
|
| 287 |
-
fn=generate_voice,
|
| 288 |
-
inputs=[
|
| 289 |
-
text_input,
|
| 290 |
-
reference_audio,
|
| 291 |
-
exaggeration,
|
| 292 |
-
cfg,
|
| 293 |
-
seed_input,
|
| 294 |
-
max_words_per_chunk,
|
| 295 |
-
use_seed,
|
| 296 |
-
language
|
| 297 |
-
],
|
| 298 |
-
outputs=[output_audio, output_info]
|
| 299 |
-
)
|
| 300 |
-
|
| 301 |
-
gr.Markdown(
|
| 302 |
-
"""
|
| 303 |
-
---
|
| 304 |
-
### ℹ️ About Chatterbox
|
| 305 |
-
|
| 306 |
-
This app uses **Resemble AI's Chatterbox Turbo** - the fastest open-source TTS model. It automatically handles:
|
| 307 |
-
- ✅ Voice cloning with just 5-30 seconds of audio
|
| 308 |
-
- ✅ Text chunking for long inputs (auto-concatenation)
|
| 309 |
-
- ✅ Emotion exaggeration control (unique to Chatterbox)
|
| 310 |
-
- ✅ Paralinguistic tags: [laugh], [chuckle], [cough], [sigh]
|
| 311 |
-
- ✅ Perth watermarking for audio authentication
|
| 312 |
-
|
| 313 |
-
**Models Available**:
|
| 314 |
-
- 🚀 **Turbo**: Fastest, supports paralinguistic tags
|
| 315 |
-
- 🎯 **Standard**: High quality with emotion control
|
| 316 |
-
- 🌍 **Multilingual**: 23 languages supported
|
| 317 |
-
|
| 318 |
-
**Source**: [GitHub - Resemble AI Chatterbox](https://github.com/resemble-ai/chatterbox)
|
| 319 |
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
- WAV format at 24kHz+ recommended
|
| 323 |
-
- Single speaker, minimal background noise
|
| 324 |
-
- Try exaggeration=0.7+ for more expressive output
|
| 325 |
-
- Lower CFG (~0.3) for faster speaking pace
|
| 326 |
-
- Use paralinguistic tags like [chuckle] for reactions
|
| 327 |
|
| 328 |
-
|
| 329 |
-
"""
|
| 330 |
-
)
|
| 331 |
-
|
| 332 |
|
| 333 |
-
# Launch the app
|
| 334 |
if __name__ == "__main__":
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
share=False
|
| 339 |
-
)
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
import random
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
import torchaudio
|
| 6 |
+
import gradio as gr
|
| 7 |
+
import re
|
| 8 |
import tempfile
|
| 9 |
+
from chatterbox.tts import ChatterboxTTS
|
|
|
|
| 10 |
|
| 11 |
+
# Set device
|
| 12 |
+
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
def set_seed(seed: int):
    """Seed every RNG the pipeline touches and return the seed used.

    A seed of 0 means "pick one at random"; the randomly chosen seed is
    returned so the UI can report it for reproducibility. The CUDA
    seeding calls are safe no-ops when CUDA is unavailable.
    """
    chosen = seed if seed != 0 else random.randint(1, 1000000)

    # Python, NumPy and Torch (CPU + all CUDA devices) all get the same seed.
    random.seed(chosen)
    np.random.seed(chosen)
    torch.manual_seed(chosen)
    torch.cuda.manual_seed(chosen)
    torch.cuda.manual_seed_all(chosen)

    return chosen
|
| 24 |
|
| 25 |
+
def split_text(text, max_chars=250):
    """
    Intelligent text chunking with sentence boundary detection.

    Splits *text* into chunks of roughly at most *max_chars* characters,
    preferring sentence boundaries (., !, ?). A sentence longer than
    max_chars is further split on commas/whitespace; a single token
    longer than max_chars becomes its own (oversized) chunk.

    Args:
        text: Input script to split.
        max_chars: Soft upper bound on chunk length.

    Returns:
        List of non-empty, stripped chunks. Blank/whitespace-only input
        yields [] (previously it yielded [""] — a single empty chunk
        that would have been fed to the TTS model; this also makes the
        caller's "no valid text" guard reachable).
    """
    text = text.strip()
    if not text:
        return []

    # Split on sentence-ending punctuation followed by whitespace,
    # keeping the punctuation attached to its sentence.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chars:
            current_chunk += (sentence + " ")
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            # If a single sentence is longer than max_chars, we have to split it
            if len(sentence) > max_chars:
                # Further split long sentences by commas or spaces as fallback
                sub_parts = re.split(r'(?<=,)\s+|\s+', sentence)
                temp_chunk = ""
                for part in sub_parts:
                    if len(temp_chunk) + len(part) <= max_chars:
                        temp_chunk += (part + " ")
                    else:
                        if temp_chunk:
                            chunks.append(temp_chunk.strip())
                        temp_chunk = part + " "
                # Remainder of the long sentence keeps accumulating with
                # the following sentences.
                current_chunk = temp_chunk
            else:
                current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    # Drop any whitespace-only leftovers so no chunk is ever empty.
    return [c for c in chunks if c]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
def load_model():
    """Instantiate the Chatterbox TTS model on DEVICE.

    Returns the model, or None when loading fails — failures are printed
    rather than raised so the UI can surface a friendly error message.
    """
    try:
        print(f"Loading Chatterbox TTS model on {DEVICE}...")
        return ChatterboxTTS.from_pretrained(DEVICE)
    except Exception as e:
        # Deliberate broad catch: any load failure is reported to the
        # caller via the None sentinel.
        print(f"Error loading model: {e}")
        return None
|
| 72 |
|
| 73 |
+
def generate_tts(model, text, ref_audio, exaggeration, cfg_weight, temperature, seed, progress=gr.Progress()):
    """
    Generate TTS audio from text, handling long scripts via chunking.

    Args:
        model: Loaded ChatterboxTTS instance, or None (a lazy load is attempted).
        text: Script to synthesize; chunked via split_text().
        ref_audio: Filepath of the reference audio used as the cloning prompt.
        exaggeration: Intensity of the cloned voice traits (passed to the model).
        cfg_weight: Classifier-free-guidance / pace weight (passed to the model).
        temperature: Sampling temperature (passed to the model).
        seed: RNG seed; 0 means "pick a random seed" (see set_seed()).
        progress: Gradio progress tracker, injected by Gradio at call time.

    Returns:
        Tuple of (path_to_generated_wav_or_None, status_message).
    """
    if model is None:
        # Try to load if not already loaded (for HF Spaces persistence)
        model = load_model()
        if model is None:
            return None, "Error: Model could not be loaded. Check your environment/GPU."

    # Input validation: empty script or missing reference clip are user errors,
    # reported via the status message rather than raised.
    if not text.strip():
        return None, "Error: Please enter some text."

    if ref_audio is None:
        return None, "Error: Please upload a reference audio file for voice cloning."

    # Set seed (actual_seed may differ from `seed` when seed == 0)
    actual_seed = set_seed(int(seed))

    # Chunk the text
    chunks = split_text(text)
    total_chunks = len(chunks)

    if total_chunks == 0:
        return None, "Error: No valid text to process."

    all_wavs = []

    try:
        for i, chunk in enumerate(chunks):
            progress((i / total_chunks), desc=f"Processing chunk {i+1}/{total_chunks}")

            # Generate audio for this chunk
            # Chatterbox.generate expects: text, audio_prompt_path, exaggeration, temperature, cfg_weight, etc.
            wav = model.generate(
                chunk,
                audio_prompt_path=ref_audio,
                exaggeration=exaggeration,
                temperature=temperature,
                cfg_weight=cfg_weight
            )

            # wav is usually a torch tensor [1, T] or [T]
            # NOTE(review): assumes the model returns a torch tensor — confirm
            # against the ChatterboxTTS API for the pinned version.
            if wav.dim() == 1:
                wav = wav.unsqueeze(0)

            # Move to CPU so concatenation/saving doesn't hold GPU memory.
            all_wavs.append(wav.cpu())

        # Concatenate all audio chunks along the time dimension (last dim)
        if not all_wavs:
            return None, "Error: No audio was generated."

        final_wav = torch.cat(all_wavs, dim=-1)

        # Save to a temporary file (delete=False so Gradio can serve it after
        # this function returns; model.sr is the model's output sample rate)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
            torchaudio.save(output_path, final_wav, model.sr)

        return output_path, f"Successfully generated audio with seed {actual_seed}. Total chunks: {total_chunks}."

    except Exception as e:
        # Broad catch at the UI boundary: print the traceback for the server
        # log and return a readable error for the status box.
        import traceback
        traceback.print_exc()
        return None, f"Error during generation: {str(e)}"
|
| 138 |
+
|
| 139 |
+
# Define the Gradio Interface
|
| 140 |
+
def create_ui():
    """Build and return the Gradio Blocks interface for the TTS app.

    Layout: left column holds the script, reference audio and generation
    parameters; right column holds the generated audio, status box and
    documentation. The model is loaded once on startup into a gr.State.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="Chatterbox Voice Clone TTS") as demo:
        # Model is loaded once and stored in state.
        # FIX: gr.State must be instantiated inside the Blocks context —
        # creating it before `with gr.Blocks(...)` detaches it from the app,
        # so event wiring / state tracking can misbehave.
        model_state = gr.State(None)

        gr.Markdown("# 🗣️ Voice Cloning TTS Chatterbox")
        gr.Markdown("""
        Clone any voice using a short reference audio clip. This application is optimized for long scripts
        through intelligent sentence-based chunking and sequential processing.
        """)

        with gr.Row():
            with gr.Column(scale=1):
                text_input = gr.Textbox(
                    label="Script",
                    placeholder="Enter your long script here. The app will automatically handle chunking...",
                    lines=10,
                    value="Welcome to the Chatterbox voice cloning application. This tool allows you to generate high-quality speech from long scripts by automatically splitting them into manageable segments. Simply upload a reference audio clip of the voice you want to clone, and adjust the parameters to your liking."
                )
                ref_audio = gr.Audio(
                    label="Reference Audio (Voice to Clone)",
                    type="filepath",
                    sources=["upload", "microphone"]
                )

                with gr.Row():
                    exaggeration = gr.Slider(
                        0.1, 1.0, value=0.5, step=0.05,
                        label="Exaggeration",
                        info="Default 0.5. Extreme values (>0.8) may be unstable."
                    )
                    cfg_weight = gr.Slider(
                        0.0, 1.0, value=0.5, step=0.05,
                        label="CFG/Pace",
                        info="Control the pace and guidance scale."
                    )

                with gr.Accordion("Advanced Options", open=False):
                    seed = gr.Number(
                        label="Seed",
                        value=0,
                        precision=0,
                        info="Set to 0 for random seed each time."
                    )
                    temperature = gr.Slider(
                        0.1, 2.0, value=1.0, step=0.05,
                        label="Temperature",
                        info="Higher values increase randomness and expressiveness."
                    )

                generate_btn = gr.Button("Generate Audio", variant="primary")

            with gr.Column(scale=1):
                audio_output = gr.Audio(label="Generated Speech", type="filepath")
                status_msg = gr.Textbox(label="Status", interactive=False)

                gr.Markdown("### 📖 Documentation")
                gr.Markdown("""
                ### Features
                - **Voice Cloning**: Provide a clear 5-10 second reference clip.
                - **Intelligent Chunking**: Scripts are split at sentence boundaries (approx. 250 chars) to ensure smooth transitions and avoid memory issues.
                - **Sequential Processing**: Audio chunks are generated one-by-one and concatenated for long-form content.
                - **Parameter Control**:
                    - **Exaggeration**: Intensity of cloned voice traits.
                    - **CFG/Pace**: Balance between text adherence and reference voice speed.
                    - **Temperature**: Randomness of the output.

                ### Tips
                - Use a high-quality, noise-free reference audio for best results.
                - For dramatic speech, try higher **Exaggeration** and lower **CFG**.
                - If the output sounds unnatural, try a different **Seed** or adjust **Temperature**.
                """)

        # Event handling
        generate_btn.click(
            fn=generate_tts,
            inputs=[
                model_state,
                text_input,
                ref_audio,
                exaggeration,
                cfg_weight,
                temperature,
                seed
            ],
            outputs=[audio_output, status_msg]
        )

        # Load model on startup
        demo.load(fn=load_model, outputs=model_state)

    return demo
|
|
|
|
|
|
|
|
|
|
| 232 |
|
|
|
|
| 233 |
if __name__ == "__main__":
    # Entry point: build the UI and serve it. Binding to 0.0.0.0 keeps the
    # app reachable inside containers / hosted deployments.
    app = create_ui()
    app.launch(server_name="0.0.0.0")
|
|
|
|
|
|