Spaces:

mrrtmob
/

khmer-tts

Running on Zero

App Files Community

mrrtmob committed on Oct 2, 2025

Commit

ff2ee9d

1 Parent(s): b193542

Update app.py with new features and optimizations

Browse files

Files changed (1) hide show

app.py +197 -97

app.py CHANGED Viewed

@@ -613,115 +613,215 @@ with gr.Blocks(title="Khmer Text-to-Speech", css=css, theme=gr.themes.Soft()) as
     </div>
     """)
-    with gr.Row():
-        with gr.Column(scale=2):
-            text_input = gr.Textbox(
-                label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ) - Max 300 characters",
-                placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ... (អតិបរមា ៣០០ តួអក្សរ)",
-                lines=4,
-                max_lines=6,
-                interactive=True,
-                max_length=300
             )
-            # Simple character counter
-            char_info = gr.Textbox(
-                value="Characters: 0/300",
-                interactive=False,
-                show_label=False,
-                container=False,
-                elem_classes=["char-counter"]
             )
-            voice_input = gr.Dropdown(
-                ["kore", "puck"],
-                value="kore",
-                label="Voice (សំឡេង)",
-                info="Select a voice for the speech synthesis.",
-                interactive=True
             )
-            # Advanced Settings
-            with gr.Accordion("🔧 Advanced Settings", open=False):
-                with gr.Row():
-                    temperature = gr.Slider(
-                        minimum=0.1, maximum=1.5, value=0.6, step=0.05,
-                        label="Temperature",
-                        info="Higher values create more expressive speech"
                     )
-                    top_p = gr.Slider(
-                        minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                        label="Top P",
-                        info="Nucleus sampling threshold"
                     )
-                with gr.Row():
-                    repetition_penalty = gr.Slider(
-                        minimum=1.0, maximum=2.0, value=1.1, step=0.05,
-                        label="Repetition Penalty",
-                        info="Higher values discourage repetitive patterns"
                     )
-                    max_new_tokens = gr.Slider(
-                        minimum=100, maximum=8192, value=2048, step=10,
-                        label="Max Length",
-                        info="Maximum length of generated audio"
                     )
-            with gr.Row():
-                submit_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg", elem_classes=["generate-btn"])
-                clear_btn = gr.Button("🗑️ Clear", size="lg", elem_classes=["clear-btn"])
-        with gr.Column(scale=1):
-            audio_output = gr.Audio(
-                label="Generated Speech (សំលេងដែលបង្កើតឡើង)",
-                type="numpy",
-                show_label=True,
-                interactive=False
             )
-    # Set up examples
-    gr.Examples(
-        examples=examples,
-        inputs=[text_input],
-        cache_examples=False,
-        label="📝 Example Texts (អត្ថបទគំរូ) - Click example then press Generate"
-    )
-    # Event handlers
-    # Character counter
-    text_input.blur(
-        fn=update_char_count,
-        inputs=[text_input],
-        outputs=[char_info]
-    )
-    # Generate speech
-    submit_btn.click(
-        fn=lambda text, voice, temp, top_p, rep_pen, max_tok: [
-            generate_speech(text, temp, top_p, rep_pen, max_tok, voice),
-            update_char_count(text)
-        ],
-        inputs=[text_input, voice_input, temperature, top_p, repetition_penalty, max_new_tokens],
-        outputs=[audio_output, char_info],
-        show_progress=True
-    )
-    # Clear function
-    clear_btn.click(
-        fn=lambda: ("", None, "Characters: 0/300"),
-        inputs=[],
-        outputs=[text_input, audio_output, char_info]
-    )
-    # Keyboard shortcut
-    text_input.submit(
-        fn=lambda text, voice, temp, top_p, rep_pen, max_tok: [
-            generate_speech(text, temp, top_p, rep_pen, max_tok, voice),
-            update_char_count(text)
-        ],
-        inputs=[text_input, voice_input, temperature, top_p, repetition_penalty, max_new_tokens],
-        outputs=[audio_output, char_info],
-        show_progress=True
-    )
 # Launch with embed-friendly optimizations
 if __name__ == "__main__":

     </div>
     """)
+    with gr.Tabs():
+        with gr.TabItem("🎤 Standard TTS"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    text_input = gr.Textbox(
+                        label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ) - Max 300 characters",
+                        placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ... (អតិបរមា ៣០០ តួអក្សរ)",
+                        lines=4,
+                        max_lines=6,
+                        interactive=True,
+                        max_length=300
+                    )
+                    # Simple character counter
+                    char_info = gr.Textbox(
+                        value="Characters: 0/300",
+                        interactive=False,
+                        show_label=False,
+                        container=False,
+                        elem_classes=["char-counter"]
+                    )
+                    voice_input = gr.Dropdown(
+                        ["kore", "puck"],
+                        value="kore",
+                        label="Voice (សំឡេង)",
+                        info="Select a voice for the speech synthesis.",
+                        interactive=True
+                    )
+                    # Advanced Settings
+                    with gr.Accordion("🔧 Advanced Settings", open=False):
+                        with gr.Row():
+                            temperature = gr.Slider(
+                                minimum=0.1, maximum=1.5, value=0.6, step=0.05,
+                                label="Temperature",
+                                info="Higher values create more expressive speech"
+                            )
+                            top_p = gr.Slider(
+                                minimum=0.1, maximum=1.0, value=0.95, step=0.05,
+                                label="Top P",
+                                info="Nucleus sampling threshold"
+                            )
+                        with gr.Row():
+                            repetition_penalty = gr.Slider(
+                                minimum=1.0, maximum=2.0, value=1.1, step=0.05,
+                                label="Repetition Penalty",
+                                info="Higher values discourage repetitive patterns"
+                            )
+                            max_new_tokens = gr.Slider(
+                                minimum=100, maximum=8192, value=2048, step=10,
+                                label="Max Length",
+                                info="Maximum length of generated audio"
+                            )
+                    with gr.Row():
+                        submit_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg", elem_classes=["generate-btn"])
+                        clear_btn = gr.Button("🗑️ Clear", size="lg", elem_classes=["clear-btn"])
+                with gr.Column(scale=1):
+                    audio_output = gr.Audio(
+                        label="Generated Speech (សំលេងដែលបង្កើតឡើង)",
+                        type="numpy",
+                        show_label=True,
+                        interactive=False
+                    )
+            # Set up examples
+            gr.Examples(
+                examples=examples,
+                inputs=[text_input],
+                cache_examples=False,
+                label="📝 Example Texts (អត្ថបទគំរូ) - Click example then press Generate"
             )
+            # Event handlers
+            # Character counter
+            text_input.blur(
+                fn=update_char_count,
+                inputs=[text_input],
+                outputs=[char_info]
             )
+            # Generate speech
+            submit_btn.click(
+                fn=lambda text, voice, temp, top_p, rep_pen, max_tok: [
+                    generate_speech(text, temp, top_p, rep_pen, max_tok, voice),
+                    update_char_count(text)
+                ],
+                inputs=[text_input, voice_input, temperature, top_p, repetition_penalty, max_new_tokens],
+                outputs=[audio_output, char_info],
+                show_progress=True
             )
+            # Clear function
+            clear_btn.click(
+                fn=lambda: ("", None, "Characters: 0/300"),
+                inputs=[],
+                outputs=[text_input, audio_output, char_info]
+            )
+            # Keyboard shortcut
+            text_input.submit(
+                fn=lambda text, voice, temp, top_p, rep_pen, max_tok: [
+                    generate_speech(text, temp, top_p, rep_pen, max_tok, voice),
+                    update_char_count(text)
+                ],
+                inputs=[text_input, voice_input, temperature, top_p, repetition_penalty, max_new_tokens],
+                outputs=[audio_output, char_info],
+                show_progress=True
+            )
+        with gr.TabItem("🎭 Zero-Shot Voice Cloning"):
+            gr.Markdown("""
+            ### 🎭 Zero-Shot Voice Cloning
+            Upload a reference audio file and its transcript, then generate speech in that voice with new text!
+            **Instructions:**
+            1. Upload a clear audio file (5-30 seconds recommended)
+            2. Enter the exact transcript of what's said in the audio
+            3. Enter the new text you want to generate
+            4. Click Generate to create speech in the reference voice
+            """)
+            with gr.Row():
+                with gr.Column(scale=2):
+                    # Reference audio upload
+                    ref_audio = gr.Audio(
+                        label="Reference Audio File",
+                        type="filepath",
+                        info="Upload a clear audio file (WAV, MP3, FLAC, M4A)"
+                    )
+                    # Transcript input
+                    ref_transcript = gr.Textbox(
+                        label="Reference Audio Transcript",
+                        placeholder="Enter exactly what is said in the reference audio...",
+                        lines=2,
+                        info="Type the exact words spoken in the reference audio"
                     )
+                    # Target text input
+                    target_text_input = gr.Textbox(
+                        label="Text to Generate - Max 300 characters",
+                        placeholder="Enter the text you want to generate in the reference voice...",
+                        lines=3,
+                        max_length=300,
+                        info="This text will be spoken in the reference voice"
                     )
+                    # Character counter for target text
+                    target_char_info = gr.Textbox(
+                        value="Characters: 0/300",
+                        interactive=False,
+                        show_label=False,
+                        container=False,
+                        elem_classes=["char-counter"]
                     )
+                    with gr.Row():
+                        zs_submit_btn = gr.Button("🎭 Generate Zero-Shot Speech", variant="primary", size="lg", elem_classes=["generate-btn"])
+                        zs_clear_btn = gr.Button("🗑️ Clear All", size="lg", elem_classes=["clear-btn"])
+                with gr.Column(scale=1):
+                    zs_audio_output = gr.Audio(
+                        label="Generated Zero-Shot Speech",
+                        type="numpy",
+                        show_label=True,
+                        interactive=False
                     )
+            # Zero-shot examples
+            zs_examples = [
+                ["ជំរាបសួរ ខ្ញុំឈ្មោះ សុខា។", "សួស្តី អ្នកសុខសប្បាយទេ?"],
+                ["ថ្ងៃនេះអាកាសធាតុល្អ។", "ខ្ញុំចង់ទៅលេងសួនច្បារ។"],
+                ["ខ្ញុំចូលចិត្តញាំបាយ។", "តើអ្នកចូលចិត្តម្ហូបអ្វី?"]
+            ]
+            gr.Examples(
+                examples=zs_examples,
+                inputs=[ref_transcript, target_text_input],
+                label="📝 Example Transcript & Target Text Pairs"
+            )
+            # Zero-shot event handlers
+            # Character counter for target text
+            target_text_input.blur(
+                fn=update_char_count,
+                inputs=[target_text_input],
+                outputs=[target_char_info]
+            )
+            # Generate zero-shot speech
+            zs_submit_btn.click(
+                fn=lambda audio, transcript, target: [
+                    generate_zero_shot_speech(audio, transcript, target),
+                    update_char_count(target)
+                ],
+                inputs=[ref_audio, ref_transcript, target_text_input],
+                outputs=[zs_audio_output, target_char_info],
+                show_progress=True
+            )
+            # Clear zero-shot function
+            zs_clear_btn.click(
+                fn=lambda: (None, "", "", None, "Characters: 0/300"),
+                inputs=[],
+                outputs=[ref_audio, ref_transcript, target_text_input, zs_audio_output, target_char_info]
             )
 # Launch with embed-friendly optimizations
 if __name__ == "__main__":