mrrtmob committed on
Commit
ff2ee9d
·
1 Parent(s): b193542

Update app.py with new features and optimizations

Browse files
Files changed (1) hide show
  1. app.py +197 -97
app.py CHANGED
@@ -613,115 +613,215 @@ with gr.Blocks(title="Khmer Text-to-Speech", css=css, theme=gr.themes.Soft()) as
613
  </div>
614
  """)
615
 
616
- with gr.Row():
617
- with gr.Column(scale=2):
618
- text_input = gr.Textbox(
619
- label="Enter Khmer text (αž”αž‰αŸ’αž…αžΌαž›αž’αžαŸ’αžαž”αž‘αžαŸ’αž˜αŸ‚αžš) - Max 300 characters",
620
- placeholder="αž”αž‰αŸ’αž…αžΌαž›αž’αžαŸ’αžαž”αž‘αžαŸ’αž˜αŸ‚αžšαžšαž”αžŸαŸ‹αž’αŸ’αž“αž€αž“αŸ…αž‘αžΈαž“αŸαŸ‡... (αž’αžαž·αž”αžšαž˜αžΆ ៣០០ αžαž½αž’αž€αŸ’αžŸαžš)",
621
- lines=4,
622
- max_lines=6,
623
- interactive=True,
624
- max_length=300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625
  )
626
 
627
- # Simple character counter
628
- char_info = gr.Textbox(
629
- value="Characters: 0/300",
630
- interactive=False,
631
- show_label=False,
632
- container=False,
633
- elem_classes=["char-counter"]
634
  )
635
 
636
- voice_input = gr.Dropdown(
637
- ["kore", "puck"],
638
- value="kore",
639
- label="Voice (αžŸαŸ†αž‘αŸαž„)",
640
- info="Select a voice for the speech synthesis.",
641
- interactive=True
 
 
 
642
  )
643
 
644
- # Advanced Settings
645
- with gr.Accordion("πŸ”§ Advanced Settings", open=False):
646
- with gr.Row():
647
- temperature = gr.Slider(
648
- minimum=0.1, maximum=1.5, value=0.6, step=0.05,
649
- label="Temperature",
650
- info="Higher values create more expressive speech"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
  )
652
- top_p = gr.Slider(
653
- minimum=0.1, maximum=1.0, value=0.95, step=0.05,
654
- label="Top P",
655
- info="Nucleus sampling threshold"
 
 
 
 
656
  )
657
- with gr.Row():
658
- repetition_penalty = gr.Slider(
659
- minimum=1.0, maximum=2.0, value=1.1, step=0.05,
660
- label="Repetition Penalty",
661
- info="Higher values discourage repetitive patterns"
 
 
 
662
  )
663
- max_new_tokens = gr.Slider(
664
- minimum=100, maximum=8192, value=2048, step=10,
665
- label="Max Length",
666
- info="Maximum length of generated audio"
 
 
 
 
 
 
 
667
  )
668
 
669
- with gr.Row():
670
- submit_btn = gr.Button("🎀 Generate Speech", variant="primary", size="lg", elem_classes=["generate-btn"])
671
- clear_btn = gr.Button("πŸ—‘οΈ Clear", size="lg", elem_classes=["clear-btn"])
672
-
673
- with gr.Column(scale=1):
674
- audio_output = gr.Audio(
675
- label="Generated Speech (αžŸαŸ†αž›αŸαž„αžŠαŸ‚αž›αž”αž„αŸ’αž€αžΎαžαž‘αžΎαž„)",
676
- type="numpy",
677
- show_label=True,
678
- interactive=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
679
  )
680
-
681
- # Set up examples
682
- gr.Examples(
683
- examples=examples,
684
- inputs=[text_input],
685
- cache_examples=False,
686
- label="πŸ“ Example Texts (αž’αžαŸ’αžαž”αž‘αž‚αŸ†αžšαžΌ) - Click example then press Generate"
687
- )
688
-
689
- # Event handlers
690
- # Character counter
691
- text_input.blur(
692
- fn=update_char_count,
693
- inputs=[text_input],
694
- outputs=[char_info]
695
- )
696
-
697
- # Generate speech
698
- submit_btn.click(
699
- fn=lambda text, voice, temp, top_p, rep_pen, max_tok: [
700
- generate_speech(text, temp, top_p, rep_pen, max_tok, voice),
701
- update_char_count(text)
702
- ],
703
- inputs=[text_input, voice_input, temperature, top_p, repetition_penalty, max_new_tokens],
704
- outputs=[audio_output, char_info],
705
- show_progress=True
706
- )
707
-
708
- # Clear function
709
- clear_btn.click(
710
- fn=lambda: ("", None, "Characters: 0/300"),
711
- inputs=[],
712
- outputs=[text_input, audio_output, char_info]
713
- )
714
-
715
- # Keyboard shortcut
716
- text_input.submit(
717
- fn=lambda text, voice, temp, top_p, rep_pen, max_tok: [
718
- generate_speech(text, temp, top_p, rep_pen, max_tok, voice),
719
- update_char_count(text)
720
- ],
721
- inputs=[text_input, voice_input, temperature, top_p, repetition_penalty, max_new_tokens],
722
- outputs=[audio_output, char_info],
723
- show_progress=True
724
- )
725
 
726
  # Launch with embed-friendly optimizations
727
  if __name__ == "__main__":
 
613
  </div>
614
  """)
615
 
616
+ with gr.Tabs():
617
+ with gr.TabItem("🎀 Standard TTS"):
618
+ with gr.Row():
619
+ with gr.Column(scale=2):
620
+ text_input = gr.Textbox(
621
+ label="Enter Khmer text (αž”αž‰αŸ’αž…αžΌαž›αž’αžαŸ’αžαž”αž‘αžαŸ’αž˜αŸ‚αžš) - Max 300 characters",
622
+ placeholder="αž”αž‰αŸ’αž…αžΌαž›αž’αžαŸ’αžαž”αž‘αžαŸ’αž˜αŸ‚αžšαžšαž”αžŸαŸ‹αž’αŸ’αž“αž€αž“αŸ…αž‘αžΈαž“αŸαŸ‡... (αž’αžαž·αž”αžšαž˜αžΆ ៣០០ αžαž½αž’αž€αŸ’αžŸαžš)",
623
+ lines=4,
624
+ max_lines=6,
625
+ interactive=True,
626
+ max_length=300
627
+ )
628
+
629
+ # Simple character counter
630
+ char_info = gr.Textbox(
631
+ value="Characters: 0/300",
632
+ interactive=False,
633
+ show_label=False,
634
+ container=False,
635
+ elem_classes=["char-counter"]
636
+ )
637
+
638
+ voice_input = gr.Dropdown(
639
+ ["kore", "puck"],
640
+ value="kore",
641
+ label="Voice (αžŸαŸ†αž‘αŸαž„)",
642
+ info="Select a voice for the speech synthesis.",
643
+ interactive=True
644
+ )
645
+
646
+ # Advanced Settings
647
+ with gr.Accordion("πŸ”§ Advanced Settings", open=False):
648
+ with gr.Row():
649
+ temperature = gr.Slider(
650
+ minimum=0.1, maximum=1.5, value=0.6, step=0.05,
651
+ label="Temperature",
652
+ info="Higher values create more expressive speech"
653
+ )
654
+ top_p = gr.Slider(
655
+ minimum=0.1, maximum=1.0, value=0.95, step=0.05,
656
+ label="Top P",
657
+ info="Nucleus sampling threshold"
658
+ )
659
+ with gr.Row():
660
+ repetition_penalty = gr.Slider(
661
+ minimum=1.0, maximum=2.0, value=1.1, step=0.05,
662
+ label="Repetition Penalty",
663
+ info="Higher values discourage repetitive patterns"
664
+ )
665
+ max_new_tokens = gr.Slider(
666
+ minimum=100, maximum=8192, value=2048, step=10,
667
+ label="Max Length",
668
+ info="Maximum length of generated audio"
669
+ )
670
+
671
+ with gr.Row():
672
+ submit_btn = gr.Button("🎀 Generate Speech", variant="primary", size="lg", elem_classes=["generate-btn"])
673
+ clear_btn = gr.Button("πŸ—‘οΈ Clear", size="lg", elem_classes=["clear-btn"])
674
+
675
+ with gr.Column(scale=1):
676
+ audio_output = gr.Audio(
677
+ label="Generated Speech (αžŸαŸ†αž›αŸαž„αžŠαŸ‚αž›αž”αž„αŸ’αž€αžΎαžαž‘αžΎαž„)",
678
+ type="numpy",
679
+ show_label=True,
680
+ interactive=False
681
+ )
682
+
683
+ # Set up examples
684
+ gr.Examples(
685
+ examples=examples,
686
+ inputs=[text_input],
687
+ cache_examples=False,
688
+ label="πŸ“ Example Texts (αž’αžαŸ’αžαž”αž‘αž‚αŸ†αžšαžΌ) - Click example then press Generate"
689
  )
690
 
691
+ # Event handlers
692
+ # Character counter
693
+ text_input.blur(
694
+ fn=update_char_count,
695
+ inputs=[text_input],
696
+ outputs=[char_info]
 
697
  )
698
 
699
+ # Generate speech
700
+ submit_btn.click(
701
+ fn=lambda text, voice, temp, top_p, rep_pen, max_tok: [
702
+ generate_speech(text, temp, top_p, rep_pen, max_tok, voice),
703
+ update_char_count(text)
704
+ ],
705
+ inputs=[text_input, voice_input, temperature, top_p, repetition_penalty, max_new_tokens],
706
+ outputs=[audio_output, char_info],
707
+ show_progress=True
708
  )
709
 
710
+ # Clear function
711
+ clear_btn.click(
712
+ fn=lambda: ("", None, "Characters: 0/300"),
713
+ inputs=[],
714
+ outputs=[text_input, audio_output, char_info]
715
+ )
716
+
717
+ # Keyboard shortcut
718
+ text_input.submit(
719
+ fn=lambda text, voice, temp, top_p, rep_pen, max_tok: [
720
+ generate_speech(text, temp, top_p, rep_pen, max_tok, voice),
721
+ update_char_count(text)
722
+ ],
723
+ inputs=[text_input, voice_input, temperature, top_p, repetition_penalty, max_new_tokens],
724
+ outputs=[audio_output, char_info],
725
+ show_progress=True
726
+ )
727
+
728
+ with gr.TabItem("🎭 Zero-Shot Voice Cloning"):
729
+ gr.Markdown("""
730
+ ### 🎭 Zero-Shot Voice Cloning
731
+ Upload a reference audio file and its transcript, then generate speech in that voice with new text!
732
+
733
+ **Instructions:**
734
+ 1. Upload a clear audio file (5-30 seconds recommended)
735
+ 2. Enter the exact transcript of what's said in the audio
736
+ 3. Enter the new text you want to generate
737
+ 4. Click Generate to create speech in the reference voice
738
+ """)
739
+
740
+ with gr.Row():
741
+ with gr.Column(scale=2):
742
+ # Reference audio upload
743
+ ref_audio = gr.Audio(
744
+ label="Reference Audio File",
745
+ type="filepath",
746
+ info="Upload a clear audio file (WAV, MP3, FLAC, M4A)"
747
+ )
748
+
749
+ # Transcript input
750
+ ref_transcript = gr.Textbox(
751
+ label="Reference Audio Transcript",
752
+ placeholder="Enter exactly what is said in the reference audio...",
753
+ lines=2,
754
+ info="Type the exact words spoken in the reference audio"
755
  )
756
+
757
+ # Target text input
758
+ target_text_input = gr.Textbox(
759
+ label="Text to Generate - Max 300 characters",
760
+ placeholder="Enter the text you want to generate in the reference voice...",
761
+ lines=3,
762
+ max_length=300,
763
+ info="This text will be spoken in the reference voice"
764
  )
765
+
766
+ # Character counter for target text
767
+ target_char_info = gr.Textbox(
768
+ value="Characters: 0/300",
769
+ interactive=False,
770
+ show_label=False,
771
+ container=False,
772
+ elem_classes=["char-counter"]
773
  )
774
+
775
+ with gr.Row():
776
+ zs_submit_btn = gr.Button("🎭 Generate Zero-Shot Speech", variant="primary", size="lg", elem_classes=["generate-btn"])
777
+ zs_clear_btn = gr.Button("πŸ—‘οΈ Clear All", size="lg", elem_classes=["clear-btn"])
778
+
779
+ with gr.Column(scale=1):
780
+ zs_audio_output = gr.Audio(
781
+ label="Generated Zero-Shot Speech",
782
+ type="numpy",
783
+ show_label=True,
784
+ interactive=False
785
  )
786
 
787
+ # Zero-shot examples
788
+ zs_examples = [
789
+ ["αž‡αŸ†αžšαžΆαž”αžŸαž½αžš αžαŸ’αž‰αž»αŸ†αžˆαŸ’αž˜αŸ„αŸ‡ αžŸαž»αžαžΆαŸ”", "αžŸαž½αžŸαŸ’αžαžΈ αž’αŸ’αž“αž€αžŸαž»αžαžŸαž”αŸ’αž”αžΆαž™αž‘αŸ?"],
790
+ ["αžαŸ’αž„αŸƒαž“αŸαŸ‡αž’αžΆαž€αžΆαžŸαž’αžΆαžαž»αž›αŸ’αž’αŸ”", "αžαŸ’αž‰αž»αŸ†αž…αž„αŸ‹αž‘αŸ…αž›αŸαž„αžŸαž½αž“αž…αŸ’αž”αžΆαžšαŸ”"],
791
+ ["αžαŸ’αž‰αž»αŸ†αž…αžΌαž›αž…αž·αžαŸ’αžαž‰αžΆαŸ†αž”αžΆαž™αŸ”", "αžαžΎαž’αŸ’αž“αž€αž…αžΌαž›αž…αž·αžαŸ’αžαž˜αŸ’αž αžΌαž”αž’αŸ’αžœαžΈ?"]
792
+ ]
793
+
794
+ gr.Examples(
795
+ examples=zs_examples,
796
+ inputs=[ref_transcript, target_text_input],
797
+ label="πŸ“ Example Transcript & Target Text Pairs"
798
+ )
799
+
800
+ # Zero-shot event handlers
801
+ # Character counter for target text
802
+ target_text_input.blur(
803
+ fn=update_char_count,
804
+ inputs=[target_text_input],
805
+ outputs=[target_char_info]
806
+ )
807
+
808
+ # Generate zero-shot speech
809
+ zs_submit_btn.click(
810
+ fn=lambda audio, transcript, target: [
811
+ generate_zero_shot_speech(audio, transcript, target),
812
+ update_char_count(target)
813
+ ],
814
+ inputs=[ref_audio, ref_transcript, target_text_input],
815
+ outputs=[zs_audio_output, target_char_info],
816
+ show_progress=True
817
+ )
818
+
819
+ # Clear zero-shot function
820
+ zs_clear_btn.click(
821
+ fn=lambda: (None, "", "", None, "Characters: 0/300"),
822
+ inputs=[],
823
+ outputs=[ref_audio, ref_transcript, target_text_input, zs_audio_output, target_char_info]
824
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
825
 
826
  # Launch with embed-friendly optimizations
827
  if __name__ == "__main__":