rahul7star committed on
Commit
1526f1e
·
verified ·
1 Parent(s): d651e33

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -34
app.py CHANGED
@@ -143,6 +143,26 @@ def get_supported_languages_display() -> str:
143
  {line2}
144
  """
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  DEVICE = "cpu"
148
  MODEL = None
@@ -206,6 +226,8 @@ def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | N
206
 
207
  def generate_tts_audio(
208
  text_input: str,
 
 
209
  language_id: str,
210
  audio_prompt_path_input: str = None,
211
  exaggeration_input: float = 0.5,
@@ -232,89 +254,109 @@ def generate_tts_audio(
232
  if chosen_prompt:
233
  generate_kwargs["audio_prompt_path"] = chosen_prompt
234
 
 
 
 
 
 
 
235
  # 🔒 CPU-safe inference
236
  with torch.no_grad():
237
  wav = current_model.generate(
238
- text_input[:300],
239
  language_id=language_id,
240
  **generate_kwargs
241
  )
242
 
243
- # Ensure CPU numpy conversion
244
  wav = wav.squeeze(0).detach().cpu().numpy()
 
245
 
246
- return (current_model.sr, wav)
247
 
248
  with gr.Blocks() as demo:
249
  gr.Markdown(
250
  """
251
  # Chatterbox Multilingual Demo
252
- Generate high-quality multilingual speech from text with reference audio styling, supporting 23 languages.
253
-
254
- For a hosted version of Chatterbox Multilingual and for finetuning, please visit [resemble.ai](https://app.resemble.ai)
255
  """
256
  )
257
-
258
- # Display supported languages
259
  gr.Markdown(get_supported_languages_display())
 
260
  with gr.Row():
261
  with gr.Column():
262
  initial_lang = "hi"
 
 
 
 
 
 
 
263
  text = gr.Textbox(
264
  value=default_text_for_ui(initial_lang),
265
- label="Text to synthesize (max chars 300)",
266
- max_lines=5
267
  )
268
-
 
 
 
 
 
 
269
  language_id = gr.Dropdown(
270
  choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
271
  value=initial_lang,
272
- label="Language",
273
- info="Select the language for text-to-speech synthesis"
274
  )
275
-
276
  ref_wav = gr.Audio(
277
  sources=["upload", "microphone"],
278
  type="filepath",
279
- label="Reference Audio File (Optional)",
280
  value=default_audio_for_ui(initial_lang)
281
  )
282
-
283
- gr.Markdown(
284
- "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
285
- elem_classes=["audio-note"]
286
- )
287
-
288
  exaggeration = gr.Slider(
289
- 0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5
 
 
290
  )
 
291
  cfg_weight = gr.Slider(
292
- 0.2, 1, step=.05, label="CFG/Pace", value=0.5
 
 
293
  )
294
 
295
  with gr.Accordion("More options", open=False):
296
- seed_num = gr.Number(value=0, label="Random seed (0 for random)")
297
- temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
298
 
299
  run_btn = gr.Button("Generate", variant="primary")
300
 
301
  with gr.Column():
302
  audio_output = gr.Audio(label="Output Audio")
303
 
304
- def on_language_change(lang, current_ref, current_text):
305
- return default_audio_for_ui(lang), default_text_for_ui(lang)
306
-
307
- language_id.change(
308
- fn=on_language_change,
309
- inputs=[language_id, ref_wav, text],
310
- outputs=[ref_wav, text],
311
- show_progress=False
312
- )
 
 
 
313
 
314
  run_btn.click(
315
  fn=generate_tts_audio,
316
  inputs=[
317
  text,
 
 
318
  language_id,
319
  ref_wav,
320
  exaggeration,
@@ -326,3 +368,5 @@ with gr.Blocks() as demo:
326
  )
327
 
328
  demo.launch(mcp_server=True)
 
 
 
143
  {line2}
144
  """
145
 
146
+ def format_for_singing(lyrics: str) -> str:
147
+ return f"""
148
+ You are a playful children's song singer.
149
+ Do NOT speak normally.
150
+ Perform this rhythmically and melodically like a song.
151
+
152
+ Rules:
153
+ - Stretch vowels
154
+ - Follow rhythm
155
+ - Pause between lines
156
+ - Raise pitch on questions
157
+ - Sound playful and musical
158
+
159
+ Start with a soft humming intro:
160
+ hmm-hmm-hmm ♪
161
+
162
+ Lyrics (sing line by line):
163
+
164
+ {lyrics}
165
+ """
166
 
167
  DEVICE = "cpu"
168
  MODEL = None
 
226
 
227
  def generate_tts_audio(
228
  text_input: str,
229
+ lyrics_input: str,
230
+ mode: str,
231
  language_id: str,
232
  audio_prompt_path_input: str = None,
233
  exaggeration_input: float = 0.5,
 
254
  if chosen_prompt:
255
  generate_kwargs["audio_prompt_path"] = chosen_prompt
256
 
257
+ # 🔀 Choose Speak vs Sing text
258
+ if mode == "Sing 🎵" and lyrics_input.strip():
259
+ final_text = format_for_singing(lyrics_input)
260
+ else:
261
+ final_text = text_input
262
+
263
  # 🔒 CPU-safe inference
264
  with torch.no_grad():
265
  wav = current_model.generate(
266
+ final_text[:300],
267
  language_id=language_id,
268
  **generate_kwargs
269
  )
270
 
 
271
  wav = wav.squeeze(0).detach().cpu().numpy()
272
+ return current_model.sr, wav
273
 
 
274
 
275
  with gr.Blocks() as demo:
276
  gr.Markdown(
277
  """
278
  # Chatterbox Multilingual Demo
279
+ Generate high-quality multilingual speech from text or lyrics (sing mode).
 
 
280
  """
281
  )
282
+
 
283
  gr.Markdown(get_supported_languages_display())
284
+
285
  with gr.Row():
286
  with gr.Column():
287
  initial_lang = "hi"
288
+
289
+ mode = gr.Radio(
290
+ choices=["Speak 🗣️", "Sing 🎵"],
291
+ value="Speak 🗣️",
292
+ label="Output Mode"
293
+ )
294
+
295
  text = gr.Textbox(
296
  value=default_text_for_ui(initial_lang),
297
+ label="Text (Speak mode)",
298
+ max_lines=4
299
  )
300
+
301
+ lyrics = gr.Textbox(
302
+ label="Lyrics (Sing mode)",
303
+ placeholder="Paste lyrics here (one line per verse)",
304
+ max_lines=10
305
+ )
306
+
307
  language_id = gr.Dropdown(
308
  choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
309
  value=initial_lang,
310
+ label="Language"
 
311
  )
312
+
313
  ref_wav = gr.Audio(
314
  sources=["upload", "microphone"],
315
  type="filepath",
316
+ label="Reference Audio (Optional)",
317
  value=default_audio_for_ui(initial_lang)
318
  )
319
+
 
 
 
 
 
320
  exaggeration = gr.Slider(
321
+ 0.25, 2, step=0.05,
322
+ label="Exaggeration",
323
+ value=0.5
324
  )
325
+
326
  cfg_weight = gr.Slider(
327
+ 0.2, 1, step=0.05,
328
+ label="CFG / Pace",
329
+ value=0.5
330
  )
331
 
332
  with gr.Accordion("More options", open=False):
333
+ seed_num = gr.Number(value=0, label="Random seed (0 = random)")
334
+ temp = gr.Slider(0.05, 5, step=0.05, label="Temperature", value=0.8)
335
 
336
  run_btn = gr.Button("Generate", variant="primary")
337
 
338
  with gr.Column():
339
  audio_output = gr.Audio(label="Output Audio")
340
 
341
+ # 🎛️ Auto-tune sliders for Sing mode
342
+ def on_mode_change(mode):
343
+ if mode == "Sing 🎵":
344
+ return 1.25, 1.0, 0.45
345
+ return 0.5, 0.8, 0.5
346
+
347
+ mode.change(
348
+ fn=on_mode_change,
349
+ inputs=mode,
350
+ outputs=[exaggeration, temp, cfg_weight],
351
+ show_progress=False
352
+ )
353
 
354
  run_btn.click(
355
  fn=generate_tts_audio,
356
  inputs=[
357
  text,
358
+ lyrics,
359
+ mode,
360
  language_id,
361
  ref_wav,
362
  exaggeration,
 
368
  )
369
 
370
  demo.launch(mcp_server=True)
371
+
372
+