Chatterbox-Multilingual-TTS-API

Sleeping

App Files Files Community

rahul7star committed on Jan 7

Commit

1526f1e

verified ·

1 Parent(s): d651e33

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -34

app.py CHANGED Viewed

@@ -143,6 +143,26 @@ def get_supported_languages_display() -> str:
 {line2}
 """
 DEVICE = "cpu"
 MODEL = None
@@ -206,6 +226,8 @@ def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | N
 def generate_tts_audio(
     text_input: str,
     language_id: str,
     audio_prompt_path_input: str = None,
     exaggeration_input: float = 0.5,
@@ -232,89 +254,109 @@ def generate_tts_audio(
     if chosen_prompt:
         generate_kwargs["audio_prompt_path"] = chosen_prompt
     # 🔒 CPU-safe inference
     with torch.no_grad():
         wav = current_model.generate(
-            text_input[:300],
             language_id=language_id,
             **generate_kwargs
         )
-    # Ensure CPU numpy conversion
     wav = wav.squeeze(0).detach().cpu().numpy()
-    return (current_model.sr, wav)
 with gr.Blocks() as demo:
     gr.Markdown(
         """
         # Chatterbox Multilingual Demo
-        Generate high-quality multilingual speech from text with reference audio styling, supporting 23 languages.
-        For a hosted version of Chatterbox Multilingual and for finetuning, please visit [resemble.ai](https://app.resemble.ai)
         """
     )
-    # Display supported languages
     gr.Markdown(get_supported_languages_display())
     with gr.Row():
         with gr.Column():
             initial_lang = "hi"
             text = gr.Textbox(
                 value=default_text_for_ui(initial_lang),
-                label="Text to synthesize (max chars 300)",
-                max_lines=5
             )
             language_id = gr.Dropdown(
                 choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
                 value=initial_lang,
-                label="Language",
-                info="Select the language for text-to-speech synthesis"
             )
             ref_wav = gr.Audio(
                 sources=["upload", "microphone"],
                 type="filepath",
-                label="Reference Audio File (Optional)",
                 value=default_audio_for_ui(initial_lang)
             )
-            gr.Markdown(
-                "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
-                elem_classes=["audio-note"]
-            )
             exaggeration = gr.Slider(
-                0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5
             )
             cfg_weight = gr.Slider(
-                0.2, 1, step=.05, label="CFG/Pace", value=0.5
             )
             with gr.Accordion("More options", open=False):
-                seed_num = gr.Number(value=0, label="Random seed (0 for random)")
-                temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
             run_btn = gr.Button("Generate", variant="primary")
         with gr.Column():
             audio_output = gr.Audio(label="Output Audio")
-        def on_language_change(lang, current_ref, current_text):
-            return default_audio_for_ui(lang), default_text_for_ui(lang)
-        language_id.change(
-            fn=on_language_change,
-            inputs=[language_id, ref_wav, text],
-            outputs=[ref_wav, text],
-            show_progress=False
-        )
     run_btn.click(
         fn=generate_tts_audio,
         inputs=[
             text,
             language_id,
             ref_wav,
             exaggeration,
@@ -326,3 +368,5 @@ with gr.Blocks() as demo:
     )
 demo.launch(mcp_server=True)

 {line2}
 """
+def format_for_singing(lyrics: str) -> str:  # Wraps `lyrics` in a fixed "perform this as a song" instruction preamble and returns the combined prompt text.
+    return f"""
+You are a playful children's song singer.
+Do NOT speak normally.
+Perform this rhythmically and melodically like a song.
+Rules:
+- Stretch vowels
+- Follow rhythm
+- Pause between lines
+- Raise pitch on questions
+- Sound playful and musical
+Start with a soft humming intro:
+hmm-hmm-hmm ♪
+Lyrics (sing line by line):
+{lyrics}
+"""  # NOTE(review): the preamble before {lyrics} is roughly 300+ characters, and generate_tts_audio passes final_text[:300] to the model, so most or all of the lyrics are likely cut off — verify.
 DEVICE = "cpu"
 MODEL = None
 def generate_tts_audio(
     text_input: str,
+    lyrics_input: str,
+    mode: str,
     language_id: str,
     audio_prompt_path_input: str = None,
     exaggeration_input: float = 0.5,
     if chosen_prompt:
         generate_kwargs["audio_prompt_path"] = chosen_prompt
+    # 🔀 Choose Speak vs Sing text
+    if mode == "Sing 🎵" and lyrics_input.strip():
+        final_text = format_for_singing(lyrics_input)
+    else:
+        final_text = text_input
     # 🔒 CPU-safe inference
     with torch.no_grad():
         wav = current_model.generate(
+            final_text[:300],
             language_id=language_id,
             **generate_kwargs
         )
     wav = wav.squeeze(0).detach().cpu().numpy()
+    return current_model.sr, wav
 with gr.Blocks() as demo:
     gr.Markdown(
         """
         # Chatterbox Multilingual Demo
+        Generate high-quality multilingual speech from text or lyrics (sing mode).
         """
     )
     gr.Markdown(get_supported_languages_display())
     with gr.Row():
         with gr.Column():
             initial_lang = "hi"
+            mode = gr.Radio(
+                choices=["Speak 🗣️", "Sing 🎵"],
+                value="Speak 🗣️",
+                label="Output Mode"
+            )
             text = gr.Textbox(
                 value=default_text_for_ui(initial_lang),
+                label="Text (Speak mode)",
+                max_lines=4
             )
+            lyrics = gr.Textbox(
+                label="Lyrics (Sing mode)",
+                placeholder="Paste lyrics here (one line per verse)",
+                max_lines=10
+            )
             language_id = gr.Dropdown(
                 choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
                 value=initial_lang,
+                label="Language"
             )
             ref_wav = gr.Audio(
                 sources=["upload", "microphone"],
                 type="filepath",
+                label="Reference Audio (Optional)",
                 value=default_audio_for_ui(initial_lang)
             )
             exaggeration = gr.Slider(
+                0.25, 2, step=0.05,
+                label="Exaggeration",
+                value=0.5
             )
             cfg_weight = gr.Slider(
+                0.2, 1, step=0.05,
+                label="CFG / Pace",
+                value=0.5
             )
             with gr.Accordion("More options", open=False):
+                seed_num = gr.Number(value=0, label="Random seed (0 = random)")
+                temp = gr.Slider(0.05, 5, step=0.05, label="Temperature", value=0.8)
             run_btn = gr.Button("Generate", variant="primary")
         with gr.Column():
             audio_output = gr.Audio(label="Output Audio")
+    # 🎛️ Slider presets per output mode; values are returned in the order (exaggeration, temp, cfg_weight) to match the outputs list of mode.change.
+    def on_mode_change(mode):  # `mode` is the current value of the "Output Mode" radio.
+        if mode == "Sing 🎵":  # Must match the radio choice string exactly, emoji included.
+            return 1.25, 1.0, 0.45  # Sing preset: exaggeration=1.25, temp=1.0, cfg_weight=0.45.
+        return 0.5, 0.8, 0.5  # Any other mode: the same values the sliders are created with (0.5, 0.8, 0.5).
+    mode.change(
+        fn=on_mode_change,
+        inputs=mode,
+        outputs=[exaggeration, temp, cfg_weight],
+        show_progress=False
+    )
     run_btn.click(
         fn=generate_tts_audio,
         inputs=[
             text,
+            lyrics,
+            mode,
             language_id,
             ref_wav,
             exaggeration,
     )
 demo.launch(mcp_server=True)