""" TTS Processor for F5-TTS Thai จัดการการประมวลผล text-to-speech """ import random import sys import tempfile import numpy as np import gradio as gr import soundfile as sf import torchaudio from f5_tts.infer.utils_infer import ( infer_process, preprocess_ref_audio_text, remove_silence_for_generated_wav, save_spectrogram, ) from f5_tts.model.utils import seed_everything from f5_tts.cleantext.number_tha import replace_numbers_with_thai from f5_tts.cleantext.th_repeat import process_thai_repeat from f5_tts.utils.whisper_api import translate_inference, transribe_inference from f5_tts.config import DEFAULT_TTS_SETTINGS class TTSProcessor: """จัดการการประมวลผล Text-to-Speech""" def __init__(self, model_manager): self.model_manager = model_manager def infer_tts(self, ref_audio_orig, ref_text, gen_text, remove_silence=DEFAULT_TTS_SETTINGS["remove_silence"], cross_fade_duration=DEFAULT_TTS_SETTINGS["cross_fade_duration"], nfe_step=DEFAULT_TTS_SETTINGS["nfe_step"], speed=DEFAULT_TTS_SETTINGS["speed"], cfg_strength=DEFAULT_TTS_SETTINGS["cfg_strength"], max_chars=DEFAULT_TTS_SETTINGS["max_chars"], seed=DEFAULT_TTS_SETTINGS["seed"], no_ref_audio=DEFAULT_TTS_SETTINGS["no_ref_audio"]): """ประมวลผล TTS""" # ตั้งค่า seed if seed == -1: seed = random.randint(0, sys.maxsize) seed_everything(seed) output_seed = seed # ตรวจสอบ input if not ref_audio_orig: gr.Warning("กรุณาใส่เสียงต้นฉบับ") return gr.update(), gr.update(), ref_text, output_seed if not gen_text.strip(): gr.Warning("กรุณาใส่ข้อความที่จะสร้าง") return gr.update(), gr.update(), ref_text, output_seed # เตรียมข้อมูล ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text) cross_fade_duration = float(cross_fade_duration) # ประมวลผลข้อความ gen_text_cleaned = process_thai_repeat(replace_numbers_with_thai(gen_text)) # ดึงโมเดล f5tts_model = self.model_manager.get_model() vocoder = self.model_manager.get_vocoder() # สร้างเสียง final_wave, final_sample_rate, combined_spectrogram = infer_process( ref_audio, ref_text, gen_text_cleaned, f5tts_model, vocoder, cross_fade_duration=cross_fade_duration, nfe_step=nfe_step, speed=speed, progress=gr.Progress(), cfg_strength=cfg_strength, target_rms=0.1, sway_sampling_coef=-1, set_max_chars=max_chars, no_ref_audio=no_ref_audio ) # ลบ silence หากต้องการ if remove_silence: final_wave = self._remove_silence(final_wave, final_sample_rate) # บันทึก spectrogram spectrogram_path = self._save_spectrogram(combined_spectrogram) print("seed:", output_seed) return (final_sample_rate, final_wave), spectrogram_path, ref_text, output_seed def _remove_silence(self, wave, sample_rate): """ลบ silence จากเสียง""" with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f: sf.write(f.name, wave, sample_rate) remove_silence_for_generated_wav(f.name) final_wave, _ = torchaudio.load(f.name) return final_wave.squeeze().cpu().numpy() def _save_spectrogram(self, spectrogram): """บันทึก spectrogram""" with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_spectrogram: spectrogram_path = tmp_spectrogram.name save_spectrogram(spectrogram, spectrogram_path) return spectrogram_path class SpeechToTextProcessor: """จัดการการประมวลผล Speech-to-Text""" @staticmethod def transcribe_text(input_audio="", translate=False, model="large-v3-turbo", compute_type="float16", target_lg="th", source_lg='th'): """ถอดข้อความจากเสียง""" if translate: output_text = translate_inference( text=transribe_inference( input_audio=input_audio, model=model, compute_type=compute_type, language=source_lg ), target=target_lg ) else: output_text = transribe_inference( input_audio=input_audio, model=model, compute_type=compute_type, language=source_lg ) return output_text