Spaces:
Running
on
Zero
Running
on
Zero
Update app.py with new features and optimizations
Browse files
app.py
CHANGED
|
@@ -613,115 +613,215 @@ with gr.Blocks(title="Khmer Text-to-Speech", css=css, theme=gr.themes.Soft()) as
|
|
| 613 |
</div>
|
| 614 |
""")
|
| 615 |
|
| 616 |
-
with gr.
|
| 617 |
-
with gr.
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 625 |
)
|
| 626 |
|
| 627 |
-
#
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
elem_classes=["char-counter"]
|
| 634 |
)
|
| 635 |
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
|
|
|
|
|
|
|
|
|
| 642 |
)
|
| 643 |
|
| 644 |
-
#
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 651 |
)
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 656 |
)
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
|
|
|
|
|
|
|
|
|
| 662 |
)
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 667 |
)
|
| 668 |
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 679 |
)
|
| 680 |
-
|
| 681 |
-
# Set up examples
|
| 682 |
-
gr.Examples(
|
| 683 |
-
examples=examples,
|
| 684 |
-
inputs=[text_input],
|
| 685 |
-
cache_examples=False,
|
| 686 |
-
label="π Example Texts (α’αααααααααΌ) - Click example then press Generate"
|
| 687 |
-
)
|
| 688 |
-
|
| 689 |
-
# Event handlers
|
| 690 |
-
# Character counter
|
| 691 |
-
text_input.blur(
|
| 692 |
-
fn=update_char_count,
|
| 693 |
-
inputs=[text_input],
|
| 694 |
-
outputs=[char_info]
|
| 695 |
-
)
|
| 696 |
-
|
| 697 |
-
# Generate speech
|
| 698 |
-
submit_btn.click(
|
| 699 |
-
fn=lambda text, voice, temp, top_p, rep_pen, max_tok: [
|
| 700 |
-
generate_speech(text, temp, top_p, rep_pen, max_tok, voice),
|
| 701 |
-
update_char_count(text)
|
| 702 |
-
],
|
| 703 |
-
inputs=[text_input, voice_input, temperature, top_p, repetition_penalty, max_new_tokens],
|
| 704 |
-
outputs=[audio_output, char_info],
|
| 705 |
-
show_progress=True
|
| 706 |
-
)
|
| 707 |
-
|
| 708 |
-
# Clear function
|
| 709 |
-
clear_btn.click(
|
| 710 |
-
fn=lambda: ("", None, "Characters: 0/300"),
|
| 711 |
-
inputs=[],
|
| 712 |
-
outputs=[text_input, audio_output, char_info]
|
| 713 |
-
)
|
| 714 |
-
|
| 715 |
-
# Keyboard shortcut
|
| 716 |
-
text_input.submit(
|
| 717 |
-
fn=lambda text, voice, temp, top_p, rep_pen, max_tok: [
|
| 718 |
-
generate_speech(text, temp, top_p, rep_pen, max_tok, voice),
|
| 719 |
-
update_char_count(text)
|
| 720 |
-
],
|
| 721 |
-
inputs=[text_input, voice_input, temperature, top_p, repetition_penalty, max_new_tokens],
|
| 722 |
-
outputs=[audio_output, char_info],
|
| 723 |
-
show_progress=True
|
| 724 |
-
)
|
| 725 |
|
| 726 |
# Launch with embed-friendly optimizations
|
| 727 |
if __name__ == "__main__":
|
|
|
|
| 613 |
</div>
|
| 614 |
""")
|
| 615 |
|
| 616 |
+
with gr.Tabs():
|
| 617 |
+
with gr.TabItem("π€ Standard TTS"):
|
| 618 |
+
with gr.Row():
|
| 619 |
+
with gr.Column(scale=2):
|
| 620 |
+
text_input = gr.Textbox(
|
| 621 |
+
label="Enter Khmer text (αααα
αΌαα’αααααααααα) - Max 300 characters",
|
| 622 |
+
placeholder="αααα
αΌαα’ααααααααααααααα’ααααα
ααΈααα... (α’αα·ααααΆ α£α α αα½α’αααα)",
|
| 623 |
+
lines=4,
|
| 624 |
+
max_lines=6,
|
| 625 |
+
interactive=True,
|
| 626 |
+
max_length=300
|
| 627 |
+
)
|
| 628 |
+
|
| 629 |
+
# Simple character counter
|
| 630 |
+
char_info = gr.Textbox(
|
| 631 |
+
value="Characters: 0/300",
|
| 632 |
+
interactive=False,
|
| 633 |
+
show_label=False,
|
| 634 |
+
container=False,
|
| 635 |
+
elem_classes=["char-counter"]
|
| 636 |
+
)
|
| 637 |
+
|
| 638 |
+
voice_input = gr.Dropdown(
|
| 639 |
+
["kore", "puck"],
|
| 640 |
+
value="kore",
|
| 641 |
+
label="Voice (ααα‘αα)",
|
| 642 |
+
info="Select a voice for the speech synthesis.",
|
| 643 |
+
interactive=True
|
| 644 |
+
)
|
| 645 |
+
|
| 646 |
+
# Advanced Settings
|
| 647 |
+
with gr.Accordion("π§ Advanced Settings", open=False):
|
| 648 |
+
with gr.Row():
|
| 649 |
+
temperature = gr.Slider(
|
| 650 |
+
minimum=0.1, maximum=1.5, value=0.6, step=0.05,
|
| 651 |
+
label="Temperature",
|
| 652 |
+
info="Higher values create more expressive speech"
|
| 653 |
+
)
|
| 654 |
+
top_p = gr.Slider(
|
| 655 |
+
minimum=0.1, maximum=1.0, value=0.95, step=0.05,
|
| 656 |
+
label="Top P",
|
| 657 |
+
info="Nucleus sampling threshold"
|
| 658 |
+
)
|
| 659 |
+
with gr.Row():
|
| 660 |
+
repetition_penalty = gr.Slider(
|
| 661 |
+
minimum=1.0, maximum=2.0, value=1.1, step=0.05,
|
| 662 |
+
label="Repetition Penalty",
|
| 663 |
+
info="Higher values discourage repetitive patterns"
|
| 664 |
+
)
|
| 665 |
+
max_new_tokens = gr.Slider(
|
| 666 |
+
minimum=100, maximum=8192, value=2048, step=10,
|
| 667 |
+
label="Max Length",
|
| 668 |
+
info="Maximum length of generated audio"
|
| 669 |
+
)
|
| 670 |
+
|
| 671 |
+
with gr.Row():
|
| 672 |
+
submit_btn = gr.Button("π€ Generate Speech", variant="primary", size="lg", elem_classes=["generate-btn"])
|
| 673 |
+
clear_btn = gr.Button("ποΈ Clear", size="lg", elem_classes=["clear-btn"])
|
| 674 |
+
|
| 675 |
+
with gr.Column(scale=1):
|
| 676 |
+
audio_output = gr.Audio(
|
| 677 |
+
label="Generated Speech (αααααααααααααΎαα‘αΎα)",
|
| 678 |
+
type="numpy",
|
| 679 |
+
show_label=True,
|
| 680 |
+
interactive=False
|
| 681 |
+
)
|
| 682 |
+
|
| 683 |
+
# Set up examples
|
| 684 |
+
gr.Examples(
|
| 685 |
+
examples=examples,
|
| 686 |
+
inputs=[text_input],
|
| 687 |
+
cache_examples=False,
|
| 688 |
+
label="π Example Texts (α’αααααααααΌ) - Click example then press Generate"
|
| 689 |
)
|
| 690 |
|
| 691 |
+
# Event handlers
|
| 692 |
+
# Character counter
|
| 693 |
+
text_input.blur(
|
| 694 |
+
fn=update_char_count,
|
| 695 |
+
inputs=[text_input],
|
| 696 |
+
outputs=[char_info]
|
|
|
|
| 697 |
)
|
| 698 |
|
| 699 |
+
# Generate speech
|
| 700 |
+
submit_btn.click(
|
| 701 |
+
fn=lambda text, voice, temp, top_p, rep_pen, max_tok: [
|
| 702 |
+
generate_speech(text, temp, top_p, rep_pen, max_tok, voice),
|
| 703 |
+
update_char_count(text)
|
| 704 |
+
],
|
| 705 |
+
inputs=[text_input, voice_input, temperature, top_p, repetition_penalty, max_new_tokens],
|
| 706 |
+
outputs=[audio_output, char_info],
|
| 707 |
+
show_progress=True
|
| 708 |
)
|
| 709 |
|
| 710 |
+
# Clear function
|
| 711 |
+
clear_btn.click(
|
| 712 |
+
fn=lambda: ("", None, "Characters: 0/300"),
|
| 713 |
+
inputs=[],
|
| 714 |
+
outputs=[text_input, audio_output, char_info]
|
| 715 |
+
)
|
| 716 |
+
|
| 717 |
+
# Keyboard shortcut
|
| 718 |
+
text_input.submit(
|
| 719 |
+
fn=lambda text, voice, temp, top_p, rep_pen, max_tok: [
|
| 720 |
+
generate_speech(text, temp, top_p, rep_pen, max_tok, voice),
|
| 721 |
+
update_char_count(text)
|
| 722 |
+
],
|
| 723 |
+
inputs=[text_input, voice_input, temperature, top_p, repetition_penalty, max_new_tokens],
|
| 724 |
+
outputs=[audio_output, char_info],
|
| 725 |
+
show_progress=True
|
| 726 |
+
)
|
| 727 |
+
|
| 728 |
+
with gr.TabItem("π Zero-Shot Voice Cloning"):
|
| 729 |
+
gr.Markdown("""
|
| 730 |
+
### π Zero-Shot Voice Cloning
|
| 731 |
+
Upload a reference audio file and its transcript, then generate speech in that voice with new text!
|
| 732 |
+
|
| 733 |
+
**Instructions:**
|
| 734 |
+
1. Upload a clear audio file (5-30 seconds recommended)
|
| 735 |
+
2. Enter the exact transcript of what's said in the audio
|
| 736 |
+
3. Enter the new text you want to generate
|
| 737 |
+
4. Click Generate to create speech in the reference voice
|
| 738 |
+
""")
|
| 739 |
+
|
| 740 |
+
with gr.Row():
|
| 741 |
+
with gr.Column(scale=2):
|
| 742 |
+
# Reference audio upload
|
| 743 |
+
ref_audio = gr.Audio(
|
| 744 |
+
label="Reference Audio File",
|
| 745 |
+
type="filepath",
|
| 746 |
+
info="Upload a clear audio file (WAV, MP3, FLAC, M4A)"
|
| 747 |
+
)
|
| 748 |
+
|
| 749 |
+
# Transcript input
|
| 750 |
+
ref_transcript = gr.Textbox(
|
| 751 |
+
label="Reference Audio Transcript",
|
| 752 |
+
placeholder="Enter exactly what is said in the reference audio...",
|
| 753 |
+
lines=2,
|
| 754 |
+
info="Type the exact words spoken in the reference audio"
|
| 755 |
)
|
| 756 |
+
|
| 757 |
+
# Target text input
|
| 758 |
+
target_text_input = gr.Textbox(
|
| 759 |
+
label="Text to Generate - Max 300 characters",
|
| 760 |
+
placeholder="Enter the text you want to generate in the reference voice...",
|
| 761 |
+
lines=3,
|
| 762 |
+
max_length=300,
|
| 763 |
+
info="This text will be spoken in the reference voice"
|
| 764 |
)
|
| 765 |
+
|
| 766 |
+
# Character counter for target text
|
| 767 |
+
target_char_info = gr.Textbox(
|
| 768 |
+
value="Characters: 0/300",
|
| 769 |
+
interactive=False,
|
| 770 |
+
show_label=False,
|
| 771 |
+
container=False,
|
| 772 |
+
elem_classes=["char-counter"]
|
| 773 |
)
|
| 774 |
+
|
| 775 |
+
with gr.Row():
|
| 776 |
+
zs_submit_btn = gr.Button("π Generate Zero-Shot Speech", variant="primary", size="lg", elem_classes=["generate-btn"])
|
| 777 |
+
zs_clear_btn = gr.Button("ποΈ Clear All", size="lg", elem_classes=["clear-btn"])
|
| 778 |
+
|
| 779 |
+
with gr.Column(scale=1):
|
| 780 |
+
zs_audio_output = gr.Audio(
|
| 781 |
+
label="Generated Zero-Shot Speech",
|
| 782 |
+
type="numpy",
|
| 783 |
+
show_label=True,
|
| 784 |
+
interactive=False
|
| 785 |
)
|
| 786 |
|
| 787 |
+
# Zero-shot examples
|
| 788 |
+
zs_examples = [
|
| 789 |
+
["ααααΆααα½α αααα»αααααα αα»ααΆα", "αα½ααααΈ α’ααααα»ααααααΆααα?"],
|
| 790 |
+
["αααααααα’αΆααΆαααΆαα»ααα’α", "αααα»αα
αααα
ααααα½αα
αααΆαα"],
|
| 791 |
+
["αααα»αα
αΌαα
α·αααααΆαααΆαα", "ααΎα’αααα
αΌαα
α·αααααα αΌαα’αααΈ?"]
|
| 792 |
+
]
|
| 793 |
+
|
| 794 |
+
gr.Examples(
|
| 795 |
+
examples=zs_examples,
|
| 796 |
+
inputs=[ref_transcript, target_text_input],
|
| 797 |
+
label="π Example Transcript & Target Text Pairs"
|
| 798 |
+
)
|
| 799 |
+
|
| 800 |
+
# Zero-shot event handlers
|
| 801 |
+
# Character counter for target text
|
| 802 |
+
target_text_input.blur(
|
| 803 |
+
fn=update_char_count,
|
| 804 |
+
inputs=[target_text_input],
|
| 805 |
+
outputs=[target_char_info]
|
| 806 |
+
)
|
| 807 |
+
|
| 808 |
+
# Generate zero-shot speech
|
| 809 |
+
zs_submit_btn.click(
|
| 810 |
+
fn=lambda audio, transcript, target: [
|
| 811 |
+
generate_zero_shot_speech(audio, transcript, target),
|
| 812 |
+
update_char_count(target)
|
| 813 |
+
],
|
| 814 |
+
inputs=[ref_audio, ref_transcript, target_text_input],
|
| 815 |
+
outputs=[zs_audio_output, target_char_info],
|
| 816 |
+
show_progress=True
|
| 817 |
+
)
|
| 818 |
+
|
| 819 |
+
# Clear zero-shot function
|
| 820 |
+
zs_clear_btn.click(
|
| 821 |
+
fn=lambda: (None, "", "", None, "Characters: 0/300"),
|
| 822 |
+
inputs=[],
|
| 823 |
+
outputs=[ref_audio, ref_transcript, target_text_input, zs_audio_output, target_char_info]
|
| 824 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 825 |
|
| 826 |
# Launch with embed-friendly optimizations
|
| 827 |
if __name__ == "__main__":
|