Spaces:

pythonlearnreal
/

F5-TTS-THAI

Sleeping

App Files Files Community

F5-TTS-THAI / src /f5_tts /ui_components.py

pythonlearnreal

Upload folder using huggingface_hub

106478e verified 5 months ago

raw

history blame contribute delete

16.8 kB

	"""
	UI Components for F5-TTS Thai WebUI
	จัดการ Gradio UI components และการสร้าง interface
	"""

	import gradio as gr
	from f5_tts.config import (
	MODEL_CHOICES,
	DEFAULT_TTS_SETTINGS,
	EXAMPLES,
	TIPS_TEXT,
	MULTISPEECH_EXAMPLE_TEXT,
	MULTISPEECH_PLACEHOLDER,
	WHISPER_MODELS,
	WHISPER_COMPUTE_TYPES,
	WHISPER_LANGUAGES,
	MAX_SPEECH_TYPES,
	MAX_SEGMENTS
	)


	class UIComponents:
	"""จัดการ Gradio UI Components"""

	def __init__(self):
	self.speech_type_count = 1
	self.setup_speech_type_components()

	def setup_speech_type_components(self):
	"""สร้าง speech type components"""
	self.speech_type_rows = []
	self.speech_type_names = []
	self.speech_type_audios = []
	self.speech_type_ref_texts = []
	self.speech_type_delete_btns = []
	self.speech_type_insert_btns = []

	def create_model_selection_section(self):
	    """Build the model-selection controls.

	    Creates, in order: a radio selector over ``MODEL_CHOICES``, an
	    initially hidden textbox for a custom model location, a status
	    textbox and a load button.

	    Returns:
	        tuple: ``(model_select, model_custom, model_status,
	        load_custom_btn)`` components, in that order.
	    """
	    # NOTE(review): all four components are assumed to share one row —
	    # confirm against the rendered layout.
	    with gr.Row():
	        selector = gr.Radio(
	            label="โมเดล",
	            choices=MODEL_CHOICES,
	            value="Default",
	            interactive=True,
	        )
	        custom_location = gr.Textbox(
	            label="ตำแหน่งโมเดลแบบกำหนดเอง",
	            value="hf://VIZINTZOR/F5-TTS-THAI/model_650000.pt",
	            visible=False,
	            interactive=True,
	        )
	        status_box = gr.Textbox(label="สถานะโมเดล", value="")
	        load_button = gr.Button("โหลด", variant="primary")

	    return selector, custom_location, status_box, load_button

	def create_tts_tab(self, infer_tts_fn=None):
	    """Build the components of the Text To Speech tab.

	    Args:
	        infer_tts_fn: Callable handed to ``gr.Examples`` as ``fn``.
	            Defaults to ``None``.

	    Returns:
	        dict: Mapping with ``'inputs'``, ``'outputs'`` and ``'controls'``
	        sections; each section maps a short name to the Gradio
	        component created here.
	    """
	    defaults = DEFAULT_TTS_SETTINGS

	    with gr.Row():
	        # Left column: reference material, target text and settings.
	        with gr.Column():
	            ref_text_box = gr.Textbox(
	                label="ข้อความต้นฉบับ",
	                lines=1,
	                info="แนะนำให้ใช้เสียงที่มีความยาวไม่เกิน 5-10 วินาที",
	            )
	            ref_audio_input = gr.Audio(label="เสียงต้นฉบับ", type="filepath")
	            gen_text_box = gr.Textbox(label="ข้อความที่จะสร้าง", lines=4)
	            run_button = gr.Button("สร้าง", variant="primary")

	            with gr.Accordion(label="ตั้งค่า"):
	                silence_toggle = gr.Checkbox(
	                    label="Remove Silence",
	                    value=defaults["remove_silence"],
	                )
	                speed_slider = gr.Slider(
	                    label="ความเร็ว",
	                    value=defaults["speed"],
	                    minimum=0.3,
	                    maximum=1.5,
	                    step=0.1,
	                )
	                cross_fade_slider = gr.Slider(
	                    label="Cross Fade Duration",
	                    value=defaults["cross_fade_duration"],
	                    minimum=0,
	                    maximum=1,
	                    step=0.05,
	                )
	                nfe_slider = gr.Slider(
	                    label="NFE Step",
	                    value=defaults["nfe_step"],
	                    minimum=7,
	                    maximum=64,
	                    step=1,
	                    info="ยิ่งค่ามากยิ่งมีคุณภาพสูง แต่อาจจะช้าลง",
	                )
	                cfg_slider = gr.Slider(
	                    label="CFG Strength",
	                    value=defaults["cfg_strength"],
	                    minimum=1,
	                    maximum=4,
	                    step=0.1,
	                )
	                max_chars_field = gr.Number(
	                    label="ตัวอักษรสูงสุดต่อส่วน",
	                    minimum=50,
	                    maximum=1000,
	                    value=defaults["max_chars"],
	                    info="จำนวนตัวอักษรสูงสุดที่ใช้ในการแบ่งส่วน สำหรับข้อความยาวๆ",
	                )
	                seed_field = gr.Number(
	                    label="Seed",
	                    value=defaults["seed"],
	                    precision=0,
	                    info="-1 = สุ่ม Seed",
	                )
	                no_ref_toggle = gr.Checkbox(
	                    label="เสียงดั้งเดิม",
	                    value=defaults["no_ref_audio"],
	                    info="ใช้เสียงที่ไม่ผ่านการโคลนเสียงจากโมเดล",
	                )

	        # Right column: generated results.
	        with gr.Column():
	            generated_audio = gr.Audio(label="เสียงที่สร้าง", type="filepath")
	            used_seed_box = gr.Textbox(label="Seed", interactive=False)
	            spectrogram_image = gr.Image(label="Spectrogram")

	    # Example rows; caching is disabled.
	    example_table = gr.Examples(
	        examples=EXAMPLES,
	        inputs=[ref_audio_input, ref_text_box, gen_text_box],
	        fn=infer_tts_fn,
	        outputs=[generated_audio, spectrogram_image, ref_text_box, used_seed_box],
	        cache_examples=False,
	        label="ตัวอย่าง",
	    )

	    # Usage tips: a heading followed by the configured text.
	    gr.Markdown("# คำแนะนำ")
	    gr.Markdown(TIPS_TEXT)

	    input_components = {
	        "ref_audio": ref_audio_input,
	        "ref_text": ref_text_box,
	        "gen_text": gen_text_box,
	        "remove_silence": silence_toggle,
	        "cross_fade_duration": cross_fade_slider,
	        "nfe_step": nfe_slider,
	        "speed": speed_slider,
	        "cfg_strength": cfg_slider,
	        "max_chars": max_chars_field,
	        "seed": seed_field,
	        "no_ref_audio": no_ref_toggle,
	    }
	    output_components = {
	        "output_audio": generated_audio,
	        "seed_output": used_seed_box,
	        "spectrogram": spectrogram_image,
	    }
	    control_components = {
	        "generate_btn": run_button,
	        "examples": example_table,
	    }
	    return {
	        "inputs": input_components,
	        "outputs": output_components,
	        "controls": control_components,
	    }

	def create_multispeech_tab(self):
	    """Build the components of the Multi Speech tab.

	    Creates one mandatory speech-type row plus ``MAX_SPEECH_TYPES - 1``
	    hidden rows, the multi-style text box, the settings accordion, the
	    generate button, the outputs, two ``gr.State`` holders and
	    ``MAX_SEGMENTS`` hidden per-segment editing widgets.

	    The speech-type component lists are also stored on ``self``
	    (``speech_type_rows``, ``speech_type_names``, ...).

	    Returns:
	        dict: Mapping with ``'inputs'``, ``'outputs'``, ``'controls'``
	        and ``'state'`` sections; values are single components or lists
	        of components.
	    """
	    gr.Markdown(MULTISPEECH_EXAMPLE_TEXT)
	    gr.Markdown("""อัปโหลดคลิปเสียงที่แตกต่างกันสำหรับแต่ละประเภทคำพูด โดยประเภทคำพูดแรกเป็นประเภทที่จำเป็นต้องมี คุณสามารถเพิ่มประเภทคำพูดเพิ่มเติมได้โดยคลิกปุ่ม "เพิ่มประเภทคำพูด".""")

	    # Regular speech type (mandatory): always visible, no delete button.
	    with gr.Row() as regular_row:
	        with gr.Column():
	            regular_name = gr.Textbox(value="ปกติ", label="ลักษณะอารมณ์/ชื่อผู้พูด")
	            regular_insert = gr.Button("เพิ่มตัวกำกับ", variant="secondary")
	        regular_audio = gr.Audio(label="เสียงต้นแบบ", type="filepath")
	        regular_ref_text = gr.Textbox(label="ข้อความต้นฉบับ", lines=2)

	    # Parallel lists: index 0 is the mandatory row.
	    speech_type_rows = [regular_row]
	    speech_type_names = [regular_name]
	    speech_type_audios = [regular_audio]
	    speech_type_ref_texts = [regular_ref_text]
	    # None keeps the delete-button list aligned with the other lists.
	    speech_type_delete_btns = [None]
	    speech_type_insert_btns = [regular_insert]

	    # Additional speech types are created up front but start hidden.
	    for _ in range(MAX_SPEECH_TYPES - 1):
	        with gr.Row(visible=False) as row:
	            with gr.Column():
	                name_input = gr.Textbox(label="ลักษณะอารมณ์/ชื่อผู้พูด")
	                delete_btn = gr.Button("ลบ", variant="secondary")
	                insert_btn = gr.Button("เพิ่มตัวกำกับ", variant="secondary")
	            audio_input = gr.Audio(label="เสียงตัวอย่าง", type="filepath")
	            ref_text_input = gr.Textbox(label="ข้อความต้นฉบับ", lines=2)

	        speech_type_rows.append(row)
	        speech_type_names.append(name_input)
	        speech_type_audios.append(audio_input)
	        speech_type_ref_texts.append(ref_text_input)
	        speech_type_delete_btns.append(delete_btn)
	        speech_type_insert_btns.append(insert_btn)

	    add_speech_type_btn = gr.Button("เพิ่มประเภทคำพูด", variant="secondary")

	    # Text input
	    gen_text_input_multistyle = gr.Textbox(
	        label="ข้อความ",
	        lines=10,
	        placeholder=MULTISPEECH_PLACEHOLDER,
	    )

	    # Settings
	    with gr.Accordion("ตั้งค่า", open=False):
	        remove_silence_multistyle = gr.Checkbox(label="Remove Silences", value=True)
	        # The default is numeric (it was the string "0.15") so it has the
	        # same type as the slider's minimum, maximum and step.
	        ms_cross_fade_duration = gr.Slider(
	            label="Cross Fade Duration",
	            value=0.15,
	            minimum=0,
	            maximum=1,
	            step=0.05,
	        )
	        ms_nfe_step = gr.Slider(
	            label="NFE Step",
	            value=32,
	            minimum=16,
	            maximum=64,
	            step=8,
	            info="ยิ่งค่ามากยิ่งมีคุณภาพสูง แต่จะช้าลง",
	        )

	    # Generate button
	    generate_multistyle_btn = gr.Button("สร้าง", variant="primary")

	    # Output
	    audio_output_multistyle = gr.Audio(label="เสียงที่สร้าง")
	    download_btn_multistyle = gr.DownloadButton(label="ดาวน์โหลด", value=None, variant="secondary")

	    # State components
	    segments_state = gr.State([])
	    sr_state = gr.State(24000)

	    # Segment editing components: all created hidden.
	    with gr.Accordion("ปรับแต่ง", open=False):
	        segment_players = []
	        segment_text_inputs = []
	        segment_silence_inputs = []
	        segment_regen_btns = []

	        for i in range(MAX_SEGMENTS):
	            player = gr.Audio(label=f"เสียง Segment {i+1}", visible=False)
	            text_input = gr.Textbox(label=f"แก้ไขข้อความ Segment {i+1}", visible=False)
	            silence_input = gr.Number(label=f"เพิ่มเสียงเงียบ (ms) Segment {i+1}", value=0, visible=False)
	            regen_btn = gr.Button(f"Regenerate Segment {i+1}", visible=False, variant="secondary")

	            segment_players.append(player)
	            segment_text_inputs.append(text_input)
	            segment_silence_inputs.append(silence_input)
	            segment_regen_btns.append(regen_btn)

	        # NOTE(review): assumed to sit inside the accordion with the
	        # segment widgets — confirm against the rendered layout.
	        update_silence_btn = gr.Button("อัปเดต Silence ทั้งหมด", variant="secondary")

	    # Store components for access from other methods.
	    self.speech_type_rows = speech_type_rows
	    self.speech_type_names = speech_type_names
	    self.speech_type_audios = speech_type_audios
	    self.speech_type_ref_texts = speech_type_ref_texts
	    self.speech_type_delete_btns = speech_type_delete_btns
	    self.speech_type_insert_btns = speech_type_insert_btns

	    return {
	        'inputs': {
	            'gen_text': gen_text_input_multistyle,
	            'cross_fade_duration': ms_cross_fade_duration,
	            'nfe_step': ms_nfe_step,
	            'remove_silence': remove_silence_multistyle,
	            'speech_type_names': speech_type_names,
	            'speech_type_audios': speech_type_audios,
	            'speech_type_ref_texts': speech_type_ref_texts,
	            'segment_silence_inputs': segment_silence_inputs,
	            'segment_text_inputs': segment_text_inputs
	        },
	        'outputs': {
	            'audio_output': audio_output_multistyle,
	            'download_btn': download_btn_multistyle,
	            'segment_players': segment_players,
	            'segment_text_inputs': segment_text_inputs,
	            'segment_silence_inputs': segment_silence_inputs,
	            'segment_regen_btns': segment_regen_btns
	        },
	        'controls': {
	            'add_speech_type_btn': add_speech_type_btn,
	            'generate_btn': generate_multistyle_btn,
	            'update_silence_btn': update_silence_btn,
	            'speech_type_rows': speech_type_rows
	        },
	        'state': {
	            'segments_state': segments_state,
	            'sr_state': sr_state
	        }
	    }

	def create_stt_tab(self):
	    """Build the components of the Speech to Text tab.

	    Returns:
	        dict: Mapping with ``'inputs'``, ``'outputs'`` and ``'controls'``
	        sections; each section maps a short name to the Gradio
	        component created here.
	    """
	    gr.Markdown("เปลี่ยนเสียงพูดเป็นข้อความด้วย โมเดล [Whisper](https://github.com/openai/whisper) โดยใช้ [faster-whisper](https://github.com/SYSTRAN/faster-whisper)")

	    with gr.Row():
	        # Left column: audio source, translate toggle and settings.
	        with gr.Column():
	            audio_source = gr.Audio(label="เสียงต้นฉบับ", type="filepath")
	            translate_toggle = gr.Checkbox(label="แปลภาษา")
	            transcribe_button = gr.Button("ถอดข้อความ", variant="primary")

	            with gr.Accordion(label="ตั้งค่า", open=False):
	                whisper_model = gr.Dropdown(
	                    label="Model",
	                    choices=WHISPER_MODELS,
	                    value="large-v2",
	                )
	                compute_choice = gr.Dropdown(
	                    label="Compute Type",
	                    choices=WHISPER_COMPUTE_TYPES,
	                    value="float16",
	                )
	                source_language = gr.Dropdown(
	                    label="ภาษาต้นฉบับ",
	                    choices=WHISPER_LANGUAGES["source"],
	                    value="Auto",
	                )
	                target_language = gr.Dropdown(
	                    label="ภาษาที่แปล",
	                    choices=WHISPER_LANGUAGES["target"],
	                    value="th",
	                )

	        # Right column: resulting text with a copy button.
	        with gr.Column():
	            transcript_box = gr.Textbox(
	                label="ข้อความต้นฉบับ",
	                lines=3,
	                show_copy_button=True,
	            )

	    input_components = {
	        "ref_audio_input": audio_source,
	        "is_translate": translate_toggle,
	        "model_wp": whisper_model,
	        "compute_type": compute_choice,
	        "source_lg": source_language,
	        "target_lg": target_language,
	    }
	    output_components = {"output_ref_text": transcript_box}
	    control_components = {"generate_btn_stt": transcribe_button}
	    return {
	        "inputs": input_components,
	        "outputs": output_components,
	        "controls": control_components,
	    }

	def add_speech_type_fn(self):
	    """Reveal the next speech-type row, if any slot is left.

	    Returns:
	        list: ``MAX_SPEECH_TYPES`` ``gr.update`` objects. All are no-op
	        updates except the one at index ``speech_type_count``, which is
	        set visible. When the counter has already reached
	        ``MAX_SPEECH_TYPES`` a ``gr.Warning`` is issued and every update
	        is a no-op.
	    """
	    # NOTE(review): the counter lives on the instance and nothing in this
	    # class decrements it when a row is deleted — confirm that is intended.
	    updates = [gr.update() for _ in range(MAX_SPEECH_TYPES)]
	    next_slot = self.speech_type_count

	    if not next_slot < MAX_SPEECH_TYPES:
	        gr.Warning("ครบจำนวนสูงสุดของประเภทคำพูดแล้ว กรุณาเริ่มแอปใหม่")
	        return updates

	    updates[next_slot] = gr.update(visible=True)
	    self.speech_type_count = next_slot + 1
	    return updates

	@staticmethod
	def delete_speech_type_fn():
	    """Produce the updates used when a speech-type row is deleted.

	    Returns:
	        tuple: ``gr.update(visible=False)`` followed by three ``None``
	        values (presumably clearing the row's name, audio and reference
	        text — confirm against the event wiring).
	    """
	    hide_row = gr.update(visible=False)
	    return (hide_row, None, None, None)

	@staticmethod
	def make_insert_speech_type_fn(index):
	"""สร้างฟังก์ชันเพิ่มตัวกำกับ speech type"""
	def insert_speech_type_fn(current_text, speech_type_name):
	current_text = current_text or ""
	speech_type_name = speech_type_name or "None"
	updated_text = current_text + f"{{{speech_type_name}}} "
	return updated_text
	return insert_speech_type_fn