Spaces:

pythonlearnreal
/

F5-TTS-THAI

Running

File size: 16,784 Bytes

106478e

"""
UI Components for F5-TTS Thai WebUI
จัดการ Gradio UI components และการสร้าง interface
"""

import gradio as gr
from f5_tts.config import (
    MODEL_CHOICES, 
    DEFAULT_TTS_SETTINGS, 
    EXAMPLES, 
    TIPS_TEXT,
    MULTISPEECH_EXAMPLE_TEXT,
    MULTISPEECH_PLACEHOLDER,
    WHISPER_MODELS,
    WHISPER_COMPUTE_TYPES,
    WHISPER_LANGUAGES,
    MAX_SPEECH_TYPES,
    MAX_SEGMENTS
)


class UIComponents:
    """จัดการ Gradio UI Components"""
    
    def __init__(self):
        self.speech_type_count = 1
        self.setup_speech_type_components()
    
    def setup_speech_type_components(self):
        """สร้าง speech type components"""
        self.speech_type_rows = []
        self.speech_type_names = []
        self.speech_type_audios = []
        self.speech_type_ref_texts = []
        self.speech_type_delete_btns = []
        self.speech_type_insert_btns = []
    
    def create_model_selection_section(self):
        """Build the model-selection row.

        Returns:
            tuple: ``(model_select, model_custom, model_status,
            load_custom_btn)`` — the model radio group, the custom model
            location textbox (created with ``visible=False``), the status
            textbox and the load button, in that order.
        """
        selection_row = gr.Row()
        with selection_row:
            model_radio = gr.Radio(
                choices=MODEL_CHOICES,
                value="Default",
                label="โมเดล",
                interactive=True,
            )
            # Created hidden; this method never makes it visible itself.
            custom_path_box = gr.Textbox(
                value="hf://VIZINTZOR/F5-TTS-THAI/model_650000.pt",
                label="ตำแหน่งโมเดลแบบกำหนดเอง",
                interactive=True,
                visible=False,
            )
            status_box = gr.Textbox(value="", label="สถานะโมเดล")
            load_button = gr.Button("โหลด", variant="primary")

        return model_radio, custom_path_box, status_box, load_button
    
    def create_tts_tab(self, infer_tts_fn=None):
        """Build the Text To Speech tab.

        Args:
            infer_tts_fn: Callable handed to ``gr.Examples`` as ``fn``.
                Defaults to ``None``.

        Returns:
            dict: Three sub-dicts of components — ``'inputs'``, ``'outputs'``
            and ``'controls'`` — keyed by the names used at the end of this
            method.
        """
        defaults = DEFAULT_TTS_SETTINGS

        with gr.Row():
            # Left column: reference material, target text and settings.
            with gr.Column():
                ref_text_box = gr.Textbox(
                    label="ข้อความต้นฉบับ",
                    lines=1,
                    info="แนะนำให้ใช้เสียงที่มีความยาวไม่เกิน 5-10 วินาที",
                )
                ref_audio_box = gr.Audio(type="filepath", label="เสียงต้นฉบับ")
                gen_text_box = gr.Textbox(lines=4, label="ข้อความที่จะสร้าง")
                run_button = gr.Button("สร้าง", variant="primary")

                with gr.Accordion(label="ตั้งค่า"):
                    silence_toggle = gr.Checkbox(
                        value=defaults["remove_silence"],
                        label="Remove Silence",
                    )
                    speed_slider = gr.Slider(
                        minimum=0.3,
                        maximum=1.5,
                        step=0.1,
                        value=defaults["speed"],
                        label="ความเร็ว",
                    )
                    cross_fade_slider = gr.Slider(
                        minimum=0,
                        maximum=1,
                        step=0.05,
                        value=defaults["cross_fade_duration"],
                        label="Cross Fade Duration",
                    )
                    nfe_slider = gr.Slider(
                        minimum=7,
                        maximum=64,
                        step=1,
                        value=defaults["nfe_step"],
                        label="NFE Step",
                        info="ยิ่งค่ามากยิ่งมีคุณภาพสูง แต่อาจจะช้าลง",
                    )
                    cfg_slider = gr.Slider(
                        minimum=1,
                        maximum=4,
                        step=0.1,
                        value=defaults["cfg_strength"],
                        label="CFG Strength",
                    )
                    max_chars_box = gr.Number(
                        minimum=50,
                        maximum=1000,
                        value=defaults["max_chars"],
                        label="ตัวอักษรสูงสุดต่อส่วน",
                        info="จำนวนตัวอักษรสูงสุดที่ใช้ในการแบ่งส่วน สำหรับข้อความยาวๆ",
                    )
                    seed_box = gr.Number(
                        value=defaults["seed"],
                        precision=0,
                        label="Seed",
                        info="-1 = สุ่ม Seed",
                    )
                    no_ref_toggle = gr.Checkbox(
                        value=defaults["no_ref_audio"],
                        label="เสียงดั้งเดิม",
                        info="ใช้เสียงที่ไม่ผ่านการโคลนเสียงจากโมเดล",
                    )

            # Right column: generated audio, the seed used and the spectrogram.
            with gr.Column():
                audio_out = gr.Audio(type="filepath", label="เสียงที่สร้าง")
                seed_out = gr.Textbox(interactive=False, label="Seed")
                spectrogram_out = gr.Image(label="Spectrogram")

        # Example rows fill the reference audio/text and target text inputs.
        example_block = gr.Examples(
            examples=EXAMPLES,
            fn=infer_tts_fn,
            inputs=[ref_audio_box, ref_text_box, gen_text_box],
            outputs=[audio_out, spectrogram_out, ref_text_box, seed_out],
            label="ตัวอย่าง",
            cache_examples=False,
        )

        # Tips heading and body; rendered only, so no reference is kept.
        gr.Markdown("# คำแนะนำ")
        gr.Markdown(TIPS_TEXT)

        input_components = {
            'ref_audio': ref_audio_box,
            'ref_text': ref_text_box,
            'gen_text': gen_text_box,
            'remove_silence': silence_toggle,
            'cross_fade_duration': cross_fade_slider,
            'nfe_step': nfe_slider,
            'speed': speed_slider,
            'cfg_strength': cfg_slider,
            'max_chars': max_chars_box,
            'seed': seed_box,
            'no_ref_audio': no_ref_toggle,
        }
        output_components = {
            'output_audio': audio_out,
            'seed_output': seed_out,
            'spectrogram': spectrogram_out,
        }
        control_components = {
            'generate_btn': run_button,
            'examples': example_block,
        }
        return {
            'inputs': input_components,
            'outputs': output_components,
            'controls': control_components,
        }
    
    def create_multispeech_tab(self):
        """Build the Multi Speech tab.

        Creates one mandatory speech-type row plus ``MAX_SPEECH_TYPES - 1``
        hidden rows, the text input, the settings accordion, the
        generate/output widgets and ``MAX_SEGMENTS`` hidden per-segment
        editing widgets. The speech-type component lists are also stored on
        ``self`` (``self.speech_type_*``).

        Returns:
            dict: Four sub-dicts of components — ``'inputs'``, ``'outputs'``,
            ``'controls'`` and ``'state'`` — keyed by the names used at the
            end of this method.
        """
        gr.Markdown(MULTISPEECH_EXAMPLE_TEXT)
        gr.Markdown("""อัปโหลดคลิปเสียงที่แตกต่างกันสำหรับแต่ละประเภทคำพูด โดยประเภทคำพูดแรกเป็นประเภทที่จำเป็นต้องมี คุณสามารถเพิ่มประเภทคำพูดเพิ่มเติมได้โดยคลิกปุ่ม "เพิ่มประเภทคำพูด".""")

        # Regular speech type (mandatory): always visible, has no delete button.
        with gr.Row() as regular_row:
            with gr.Column():
                regular_name = gr.Textbox(value="ปกติ", label="ลักษณะอารมณ์/ชื่อผู้พูด")
                regular_insert = gr.Button("เพิ่มตัวกำกับ", variant="secondary")
            regular_audio = gr.Audio(label="เสียงต้นแบบ", type="filepath")
            regular_ref_text = gr.Textbox(label="ข้อความต้นฉบับ", lines=2)

        # Parallel lists: index 0 is the mandatory row in every list.
        speech_type_rows = [regular_row]
        speech_type_names = [regular_name]
        speech_type_audios = [regular_audio]
        speech_type_ref_texts = [regular_ref_text]
        # None placeholder keeps the delete-button list aligned with the rest.
        speech_type_delete_btns = [None]
        speech_type_insert_btns = [regular_insert]

        # Additional speech types: created up front but hidden.
        for _ in range(MAX_SPEECH_TYPES - 1):
            with gr.Row(visible=False) as row:
                with gr.Column():
                    name_input = gr.Textbox(label="ลักษณะอารมณ์/ชื่อผู้พูด")
                    delete_btn = gr.Button("ลบ", variant="secondary")
                    insert_btn = gr.Button("เพิ่มตัวกำกับ", variant="secondary")
                audio_input = gr.Audio(label="เสียงตัวอย่าง", type="filepath")
                ref_text_input = gr.Textbox(label="ข้อความต้นฉบับ", lines=2)

            speech_type_rows.append(row)
            speech_type_names.append(name_input)
            speech_type_audios.append(audio_input)
            speech_type_ref_texts.append(ref_text_input)
            speech_type_delete_btns.append(delete_btn)
            speech_type_insert_btns.append(insert_btn)

        add_speech_type_btn = gr.Button("เพิ่มประเภทคำพูด", variant="secondary")

        # Text input
        gen_text_input_multistyle = gr.Textbox(
            label="ข้อความ",
            lines=10,
            placeholder=MULTISPEECH_PLACEHOLDER,
        )

        # Settings
        with gr.Accordion("ตั้งค่า", open=False):
            remove_silence_multistyle = gr.Checkbox(label="Remove Silences", value=True)
            # The default must be a number: a string default ("0.15") would be
            # passed on as a string and break numeric use of the slider value.
            ms_cross_fade_duration = gr.Slider(label="Cross Fade Duration", value=0.15, minimum=0, maximum=1, step=0.05)
            ms_nfe_step = gr.Slider(label="NFE Step", value=32, minimum=16, maximum=64, step=8, info="ยิ่งค่ามากยิ่งมีคุณภาพสูง แต่จะช้าลง")

        # Generate button
        generate_multistyle_btn = gr.Button("สร้าง", variant="primary")

        # Output
        audio_output_multistyle = gr.Audio(label="เสียงที่สร้าง")
        download_btn_multistyle = gr.DownloadButton(label="ดาวน์โหลด", value=None, variant="secondary")

        # State components: an initially empty segment list and the value 24000
        # (presumably the sample rate in Hz — verify against the handlers).
        segments_state = gr.State([])
        sr_state = gr.State(24000)

        # Segment editing components: MAX_SEGMENTS hidden widget groups.
        with gr.Accordion("ปรับแต่ง", open=False):
            segment_players = []
            segment_text_inputs = []
            segment_silence_inputs = []
            segment_regen_btns = []

            for i in range(MAX_SEGMENTS):
                player = gr.Audio(label=f"เสียง Segment {i+1}", visible=False)
                text_input = gr.Textbox(label=f"แก้ไขข้อความ Segment {i+1}", visible=False)
                silence_input = gr.Number(label=f"เพิ่มเสียงเงียบ (ms) Segment {i+1}", value=0, visible=False)
                regen_btn = gr.Button(f"Regenerate Segment {i+1}", visible=False, variant="secondary")

                segment_players.append(player)
                segment_text_inputs.append(text_input)
                segment_silence_inputs.append(silence_input)
                segment_regen_btns.append(regen_btn)

            update_silence_btn = gr.Button("อัปเดต Silence ทั้งหมด", variant="secondary")

        # Store the speech-type component lists on the instance as well.
        self.speech_type_rows = speech_type_rows
        self.speech_type_names = speech_type_names
        self.speech_type_audios = speech_type_audios
        self.speech_type_ref_texts = speech_type_ref_texts
        self.speech_type_delete_btns = speech_type_delete_btns
        self.speech_type_insert_btns = speech_type_insert_btns

        return {
            'inputs': {
                'gen_text': gen_text_input_multistyle,
                'cross_fade_duration': ms_cross_fade_duration,
                'nfe_step': ms_nfe_step,
                'remove_silence': remove_silence_multistyle,
                'speech_type_names': speech_type_names,
                'speech_type_audios': speech_type_audios,
                'speech_type_ref_texts': speech_type_ref_texts,
                'segment_silence_inputs': segment_silence_inputs,
                'segment_text_inputs': segment_text_inputs
            },
            'outputs': {
                'audio_output': audio_output_multistyle,
                'download_btn': download_btn_multistyle,
                'segment_players': segment_players,
                'segment_text_inputs': segment_text_inputs,
                'segment_silence_inputs': segment_silence_inputs,
                'segment_regen_btns': segment_regen_btns
            },
            'controls': {
                'add_speech_type_btn': add_speech_type_btn,
                'generate_btn': generate_multistyle_btn,
                'update_silence_btn': update_silence_btn,
                'speech_type_rows': speech_type_rows
            },
            'state': {
                'segments_state': segments_state,
                'sr_state': sr_state
            }
        }
    
    def create_stt_tab(self):
        """Build the Speech to Text tab.

        Returns:
            dict: Three sub-dicts of components — ``'inputs'``, ``'outputs'``
            and ``'controls'`` — keyed by the names used at the end of this
            method.
        """
        gr.Markdown("เปลี่ยนเสียงพูดเป็นข้อความด้วย โมเดล [Whisper](https://github.com/openai/whisper) โดยใช้ [faster-whisper](https://github.com/SYSTRAN/faster-whisper)")

        with gr.Row():
            # Left column: audio input, translate toggle, run button, settings.
            with gr.Column():
                audio_in = gr.Audio(type="filepath", label="เสียงต้นฉบับ")
                translate_toggle = gr.Checkbox(label="แปลภาษา")
                transcribe_button = gr.Button("ถอดข้อความ", variant="primary")

                with gr.Accordion(label="ตั้งค่า", open=False):
                    whisper_model = gr.Dropdown(
                        choices=WHISPER_MODELS,
                        value="large-v2",
                        label="Model",
                    )
                    whisper_compute = gr.Dropdown(
                        choices=WHISPER_COMPUTE_TYPES,
                        value="float16",
                        label="Compute Type",
                    )
                    source_language = gr.Dropdown(
                        choices=WHISPER_LANGUAGES["source"],
                        value="Auto",
                        label="ภาษาต้นฉบับ",
                    )
                    target_language = gr.Dropdown(
                        choices=WHISPER_LANGUAGES["target"],
                        value="th",
                        label="ภาษาที่แปล",
                    )

            # Right column: the resulting text, with a copy button.
            with gr.Column():
                transcript_box = gr.Textbox(
                    lines=3,
                    show_copy_button=True,
                    label="ข้อความต้นฉบับ",
                )

        input_components = {
            'ref_audio_input': audio_in,
            'is_translate': translate_toggle,
            'model_wp': whisper_model,
            'compute_type': whisper_compute,
            'source_lg': source_language,
            'target_lg': target_language,
        }
        output_components = {'output_ref_text': transcript_box}
        control_components = {'generate_btn_stt': transcribe_button}
        return {
            'inputs': input_components,
            'outputs': output_components,
            'controls': control_components,
        }
    
    def add_speech_type_fn(self):
        """Reveal the next hidden speech-type row, if any remain.

        Returns:
            list: ``MAX_SPEECH_TYPES`` update objects. All are no-op
            ``gr.update()`` values except the one at index
            ``self.speech_type_count``, which is set to ``visible=True``
            when the limit has not been reached.

        Note:
            ``self.speech_type_count`` is instance state and is only ever
            incremented in this class. Once it reaches ``MAX_SPEECH_TYPES``
            a warning is raised on every call.
        """
        updates = [gr.update() for _ in range(MAX_SPEECH_TYPES)]

        # Guard clause: every row is already in use.
        if not self.speech_type_count < MAX_SPEECH_TYPES:
            gr.Warning("ครบจำนวนสูงสุดของประเภทคำพูดแล้ว กรุณาเริ่มแอปใหม่")
            return updates

        next_index = self.speech_type_count
        updates[next_index] = gr.update(visible=True)
        self.speech_type_count = next_index + 1
        return updates
    
    @staticmethod
    def delete_speech_type_fn():
        """Return the updates that hide and clear one speech type.

        Returns:
            tuple: A ``gr.update(visible=False)`` followed by three ``None``
            values.
        """
        # NOTE(review): presumably wired to a speech-type row plus its name,
        # audio and reference-text fields — verify against the caller.
        hide_update = gr.update(visible=False)
        return (hide_update, None, None, None)
    
    @staticmethod
    def make_insert_speech_type_fn(index):
        """สร้างฟังก์ชันเพิ่มตัวกำกับ speech type"""
        def insert_speech_type_fn(current_text, speech_type_name):
            current_text = current_text or ""
            speech_type_name = speech_type_name or "None"
            updated_text = current_text + f"{{{speech_type_name}}} "
            return updated_text
        return insert_speech_type_fn