Update app.py
Browse files
app.py
CHANGED
|
@@ -143,6 +143,26 @@ def get_supported_languages_display() -> str:
|
|
| 143 |
{line2}
|
| 144 |
"""
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
DEVICE = "cpu"
|
| 148 |
MODEL = None
|
|
@@ -206,6 +226,8 @@ def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | N
|
|
| 206 |
|
| 207 |
def generate_tts_audio(
|
| 208 |
text_input: str,
|
|
|
|
|
|
|
| 209 |
language_id: str,
|
| 210 |
audio_prompt_path_input: str = None,
|
| 211 |
exaggeration_input: float = 0.5,
|
|
@@ -232,89 +254,109 @@ def generate_tts_audio(
|
|
| 232 |
if chosen_prompt:
|
| 233 |
generate_kwargs["audio_prompt_path"] = chosen_prompt
|
| 234 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
# 🔒 CPU-safe inference
|
| 236 |
with torch.no_grad():
|
| 237 |
wav = current_model.generate(
|
| 238 |
-
|
| 239 |
language_id=language_id,
|
| 240 |
**generate_kwargs
|
| 241 |
)
|
| 242 |
|
| 243 |
-
# Ensure CPU numpy conversion
|
| 244 |
wav = wav.squeeze(0).detach().cpu().numpy()
|
|
|
|
| 245 |
|
| 246 |
-
return (current_model.sr, wav)
|
| 247 |
|
| 248 |
with gr.Blocks() as demo:
|
| 249 |
gr.Markdown(
|
| 250 |
"""
|
| 251 |
# Chatterbox Multilingual Demo
|
| 252 |
-
Generate high-quality multilingual speech from text
|
| 253 |
-
|
| 254 |
-
For a hosted version of Chatterbox Multilingual and for finetuning, please visit [resemble.ai](https://app.resemble.ai)
|
| 255 |
"""
|
| 256 |
)
|
| 257 |
-
|
| 258 |
-
# Display supported languages
|
| 259 |
gr.Markdown(get_supported_languages_display())
|
|
|
|
| 260 |
with gr.Row():
|
| 261 |
with gr.Column():
|
| 262 |
initial_lang = "hi"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
text = gr.Textbox(
|
| 264 |
value=default_text_for_ui(initial_lang),
|
| 265 |
-
label="Text
|
| 266 |
-
max_lines=
|
| 267 |
)
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
language_id = gr.Dropdown(
|
| 270 |
choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
|
| 271 |
value=initial_lang,
|
| 272 |
-
label="Language"
|
| 273 |
-
info="Select the language for text-to-speech synthesis"
|
| 274 |
)
|
| 275 |
-
|
| 276 |
ref_wav = gr.Audio(
|
| 277 |
sources=["upload", "microphone"],
|
| 278 |
type="filepath",
|
| 279 |
-
label="Reference Audio
|
| 280 |
value=default_audio_for_ui(initial_lang)
|
| 281 |
)
|
| 282 |
-
|
| 283 |
-
gr.Markdown(
|
| 284 |
-
"💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
|
| 285 |
-
elem_classes=["audio-note"]
|
| 286 |
-
)
|
| 287 |
-
|
| 288 |
exaggeration = gr.Slider(
|
| 289 |
-
0.25, 2, step
|
|
|
|
|
|
|
| 290 |
)
|
|
|
|
| 291 |
cfg_weight = gr.Slider(
|
| 292 |
-
0.2, 1, step
|
|
|
|
|
|
|
| 293 |
)
|
| 294 |
|
| 295 |
with gr.Accordion("More options", open=False):
|
| 296 |
-
seed_num = gr.Number(value=0, label="Random seed (0
|
| 297 |
-
temp = gr.Slider(0.05, 5, step
|
| 298 |
|
| 299 |
run_btn = gr.Button("Generate", variant="primary")
|
| 300 |
|
| 301 |
with gr.Column():
|
| 302 |
audio_output = gr.Audio(label="Output Audio")
|
| 303 |
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
|
|
|
|
|
|
|
|
|
| 313 |
|
| 314 |
run_btn.click(
|
| 315 |
fn=generate_tts_audio,
|
| 316 |
inputs=[
|
| 317 |
text,
|
|
|
|
|
|
|
| 318 |
language_id,
|
| 319 |
ref_wav,
|
| 320 |
exaggeration,
|
|
@@ -326,3 +368,5 @@ with gr.Blocks() as demo:
|
|
| 326 |
)
|
| 327 |
|
| 328 |
demo.launch(mcp_server=True)
|
|
|
|
|
|
|
|
|
| 143 |
{line2}
|
| 144 |
"""
|
| 145 |
|
| 146 |
+
def format_for_singing(lyrics: str) -> str:
    """Wrap *lyrics* in a fixed "sing this" instruction preamble.

    The returned text starts with a newline, lists the performance rules and
    a humming intro, and ends with the lyrics followed by a newline. The
    lyrics are inserted verbatim (no stripping or escaping).

    Args:
        lyrics: Raw lyrics text, typically one line per verse.

    Returns:
        The instruction preamble with *lyrics* appended.
    """
    # NOTE(review): the caller passes ``final_text[:300]`` to the model, and
    # this preamble alone is longer than 300 characters, so the lyrics look
    # like they are truncated away before generation -- confirm and either
    # shorten the preamble or raise/apply the limit to the lyrics only.
    return f"""
You are a playful children's song singer.
Do NOT speak normally.
Perform this rhythmically and melodically like a song.

Rules:
- Stretch vowels
- Follow rhythm
- Pause between lines
- Raise pitch on questions
- Sound playful and musical

Start with a soft humming intro:
hmm-hmm-hmm ♪

Lyrics (sing line by line):

{lyrics}
"""
|
| 166 |
|
| 167 |
DEVICE = "cpu"
|
| 168 |
MODEL = None
|
|
|
|
| 226 |
|
| 227 |
def generate_tts_audio(
|
| 228 |
text_input: str,
|
| 229 |
+
lyrics_input: str,
|
| 230 |
+
mode: str,
|
| 231 |
language_id: str,
|
| 232 |
audio_prompt_path_input: str = None,
|
| 233 |
exaggeration_input: float = 0.5,
|
|
|
|
| 254 |
if chosen_prompt:
|
| 255 |
generate_kwargs["audio_prompt_path"] = chosen_prompt
|
| 256 |
|
| 257 |
+
# 🔀 Choose Speak vs Sing text
|
| 258 |
+
if mode == "Sing 🎵" and lyrics_input.strip():
|
| 259 |
+
final_text = format_for_singing(lyrics_input)
|
| 260 |
+
else:
|
| 261 |
+
final_text = text_input
|
| 262 |
+
|
| 263 |
# 🔒 CPU-safe inference
|
| 264 |
with torch.no_grad():
|
| 265 |
wav = current_model.generate(
|
| 266 |
+
final_text[:300],
|
| 267 |
language_id=language_id,
|
| 268 |
**generate_kwargs
|
| 269 |
)
|
| 270 |
|
|
|
|
| 271 |
wav = wav.squeeze(0).detach().cpu().numpy()
|
| 272 |
+
return current_model.sr, wav
|
| 273 |
|
|
|
|
| 274 |
|
| 275 |
with gr.Blocks() as demo:
|
| 276 |
gr.Markdown(
|
| 277 |
"""
|
| 278 |
# Chatterbox Multilingual Demo
|
| 279 |
+
Generate high-quality multilingual speech from text or lyrics (sing mode).
|
|
|
|
|
|
|
| 280 |
"""
|
| 281 |
)
|
| 282 |
+
|
|
|
|
| 283 |
gr.Markdown(get_supported_languages_display())
|
| 284 |
+
|
| 285 |
with gr.Row():
|
| 286 |
with gr.Column():
|
| 287 |
initial_lang = "hi"
|
| 288 |
+
|
| 289 |
+
mode = gr.Radio(
|
| 290 |
+
choices=["Speak 🗣️", "Sing 🎵"],
|
| 291 |
+
value="Speak 🗣️",
|
| 292 |
+
label="Output Mode"
|
| 293 |
+
)
|
| 294 |
+
|
| 295 |
text = gr.Textbox(
|
| 296 |
value=default_text_for_ui(initial_lang),
|
| 297 |
+
label="Text (Speak mode)",
|
| 298 |
+
max_lines=4
|
| 299 |
)
|
| 300 |
+
|
| 301 |
+
lyrics = gr.Textbox(
|
| 302 |
+
label="Lyrics (Sing mode)",
|
| 303 |
+
placeholder="Paste lyrics here (one line per verse)",
|
| 304 |
+
max_lines=10
|
| 305 |
+
)
|
| 306 |
+
|
| 307 |
language_id = gr.Dropdown(
|
| 308 |
choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
|
| 309 |
value=initial_lang,
|
| 310 |
+
label="Language"
|
|
|
|
| 311 |
)
|
| 312 |
+
|
| 313 |
ref_wav = gr.Audio(
|
| 314 |
sources=["upload", "microphone"],
|
| 315 |
type="filepath",
|
| 316 |
+
label="Reference Audio (Optional)",
|
| 317 |
value=default_audio_for_ui(initial_lang)
|
| 318 |
)
|
| 319 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
exaggeration = gr.Slider(
|
| 321 |
+
0.25, 2, step=0.05,
|
| 322 |
+
label="Exaggeration",
|
| 323 |
+
value=0.5
|
| 324 |
)
|
| 325 |
+
|
| 326 |
cfg_weight = gr.Slider(
|
| 327 |
+
0.2, 1, step=0.05,
|
| 328 |
+
label="CFG / Pace",
|
| 329 |
+
value=0.5
|
| 330 |
)
|
| 331 |
|
| 332 |
with gr.Accordion("More options", open=False):
|
| 333 |
+
seed_num = gr.Number(value=0, label="Random seed (0 = random)")
|
| 334 |
+
temp = gr.Slider(0.05, 5, step=0.05, label="Temperature", value=0.8)
|
| 335 |
|
| 336 |
run_btn = gr.Button("Generate", variant="primary")
|
| 337 |
|
| 338 |
with gr.Column():
|
| 339 |
audio_output = gr.Audio(label="Output Audio")
|
| 340 |
|
| 341 |
+
# 🎛️ Auto-tune sliders for Sing mode
|
| 342 |
+
def on_mode_change(mode: str) -> tuple[float, float, float]:
    """Return slider presets for the selected output mode.

    Args:
        mode: The current value of the "Output Mode" radio.

    Returns:
        Three values in the order the ``mode.change`` handler lists its
        outputs: (exaggeration, temperature, cfg_weight). Any value other
        than "Sing 🎵" yields the same numbers the sliders start with.
    """
    if mode == "Sing 🎵":
        return 1.25, 1.0, 0.45
    return 0.5, 0.8, 0.5
|
| 346 |
+
|
| 347 |
+
mode.change(
|
| 348 |
+
fn=on_mode_change,
|
| 349 |
+
inputs=mode,
|
| 350 |
+
outputs=[exaggeration, temp, cfg_weight],
|
| 351 |
+
show_progress=False
|
| 352 |
+
)
|
| 353 |
|
| 354 |
run_btn.click(
|
| 355 |
fn=generate_tts_audio,
|
| 356 |
inputs=[
|
| 357 |
text,
|
| 358 |
+
lyrics,
|
| 359 |
+
mode,
|
| 360 |
language_id,
|
| 361 |
ref_wav,
|
| 362 |
exaggeration,
|
|
|
|
| 368 |
)
|
| 369 |
|
| 370 |
demo.launch(mcp_server=True)
|
| 371 |
+
|
| 372 |
+
|