Spaces:
Sleeping
Sleeping
| """ | |
| TTS Processor for F5-TTS Thai | |
| จัดการการประมวลผล text-to-speech | |
| """ | |
import os
import random
import sys
import tempfile

import numpy as np
import gradio as gr
import soundfile as sf
import torchaudio

from f5_tts.infer.utils_infer import (
    infer_process,
    preprocess_ref_audio_text,
    remove_silence_for_generated_wav,
    save_spectrogram,
)
from f5_tts.model.utils import seed_everything
from f5_tts.cleantext.number_tha import replace_numbers_with_thai
from f5_tts.cleantext.th_repeat import process_thai_repeat
from f5_tts.utils.whisper_api import translate_inference, transribe_inference
from f5_tts.config import DEFAULT_TTS_SETTINGS
class TTSProcessor:
    """Handles Text-to-Speech processing for F5-TTS Thai."""

    def __init__(self, model_manager):
        # model_manager supplies the loaded F5-TTS model and vocoder
        # via get_model() / get_vocoder().
        self.model_manager = model_manager

    def infer_tts(self,
                  ref_audio_orig,
                  ref_text,
                  gen_text,
                  remove_silence=DEFAULT_TTS_SETTINGS["remove_silence"],
                  cross_fade_duration=DEFAULT_TTS_SETTINGS["cross_fade_duration"],
                  nfe_step=DEFAULT_TTS_SETTINGS["nfe_step"],
                  speed=DEFAULT_TTS_SETTINGS["speed"],
                  cfg_strength=DEFAULT_TTS_SETTINGS["cfg_strength"],
                  max_chars=DEFAULT_TTS_SETTINGS["max_chars"],
                  seed=DEFAULT_TTS_SETTINGS["seed"],
                  no_ref_audio=DEFAULT_TTS_SETTINGS["no_ref_audio"]):
        """Run TTS inference.

        Args:
            ref_audio_orig: Reference audio clip (path/handle from the UI).
            ref_text: Transcript of the reference audio; may be refined by
                preprocessing and is returned to the caller.
            gen_text: Text to synthesize.
            remove_silence: If truthy, strip silence from the generated audio.
            cross_fade_duration: Cross-fade length between chunks (coerced to float).
            nfe_step, speed, cfg_strength, max_chars, no_ref_audio:
                Forwarded to ``infer_process``.
            seed: RNG seed; -1 means pick a random seed.

        Returns:
            ``((sample_rate, waveform), spectrogram_path, ref_text, seed)`` on
            success; on invalid input, ``(gr.update(), gr.update(), ref_text,
            seed)`` so the Gradio components are left unchanged.
        """
        # Resolve the seed first so it is reported even on early returns.
        if seed == -1:
            seed = random.randint(0, sys.maxsize)
        seed_everything(seed)
        output_seed = seed

        # Validate inputs; surface problems as UI warnings, not tracebacks.
        if not ref_audio_orig:
            gr.Warning("กรุณาใส่เสียงต้นฉบับ")
            return gr.update(), gr.update(), ref_text, output_seed
        # Bug fix: guard against gen_text being None as well as blank —
        # the original called .strip() unconditionally and could raise.
        if not gen_text or not gen_text.strip():
            gr.Warning("กรุณาใส่ข้อความที่จะสร้าง")
            return gr.update(), gr.update(), ref_text, output_seed

        # Prepare the reference audio/text pair.
        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text)
        cross_fade_duration = float(cross_fade_duration)

        # Normalize Thai text: spell out digits, expand repeated-word forms.
        gen_text_cleaned = process_thai_repeat(replace_numbers_with_thai(gen_text))

        # Fetch the shared model and vocoder instances.
        f5tts_model = self.model_manager.get_model()
        vocoder = self.model_manager.get_vocoder()

        # Synthesize audio.
        final_wave, final_sample_rate, combined_spectrogram = infer_process(
            ref_audio,
            ref_text,
            gen_text_cleaned,
            f5tts_model,
            vocoder,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            speed=speed,
            progress=gr.Progress(),
            cfg_strength=cfg_strength,
            target_rms=0.1,
            sway_sampling_coef=-1,
            set_max_chars=max_chars,
            no_ref_audio=no_ref_audio
        )

        # Optionally strip silence from the result.
        if remove_silence:
            final_wave = self._remove_silence(final_wave, final_sample_rate)

        # Persist the spectrogram image for the UI to display.
        spectrogram_path = self._save_spectrogram(combined_spectrogram)
        print("seed:", output_seed)
        return (final_sample_rate, final_wave), spectrogram_path, ref_text, output_seed

    def _remove_silence(self, wave, sample_rate):
        """Remove silence from *wave*; return a 1-D numpy array.

        Round-trips through a temporary WAV file because
        ``remove_silence_for_generated_wav`` operates on files in place.
        """
        # delete=False so the path can be re-opened by name (required on
        # Windows); we are responsible for removing it ourselves.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            tmp_path = f.name
        try:
            sf.write(tmp_path, wave, sample_rate)
            remove_silence_for_generated_wav(tmp_path)
            final_wave, _ = torchaudio.load(tmp_path)
        finally:
            # Bug fix: the temp WAV used to leak on every call; clean it up
            # best-effort once the trimmed audio is loaded into memory.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        return final_wave.squeeze().cpu().numpy()

    def _save_spectrogram(self, spectrogram):
        """Save *spectrogram* to a temporary JPEG and return its path.

        The file intentionally outlives this call (delete=False) because the
        caller/UI reads it back by path.
        """
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_spectrogram:
            spectrogram_path = tmp_spectrogram.name
        # Write after the handle is closed to avoid double-open issues.
        save_spectrogram(spectrogram, spectrogram_path)
        return spectrogram_path
class SpeechToTextProcessor:
    """Handles Speech-to-Text processing via the Whisper API helpers."""

    # Bug fix: the original method was declared without `self`, so calling it
    # on an instance bound the instance to `input_audio`. @staticmethod keeps
    # class-level calls identical while making instance calls correct.
    @staticmethod
    def transcribe_text(input_audio="",
                        translate=False,
                        model="large-v3-turbo",
                        compute_type="float16",
                        target_lg="th",
                        source_lg='th'):
        """Transcribe speech from audio, optionally translating the result.

        Args:
            input_audio: Audio input (path/handle) for the transcriber.
            translate: If True, translate the transcript to ``target_lg``.
            model: Whisper model name.
            compute_type: Whisper compute precision (e.g. "float16").
            target_lg: Target language code for translation.
            source_lg: Language code of the spoken audio.

        Returns:
            The transcribed (and possibly translated) text.
        """
        if translate:
            # Transcribe first, then feed the transcript to the translator.
            output_text = translate_inference(
                text=transribe_inference(
                    input_audio=input_audio,
                    model=model,
                    compute_type=compute_type,
                    language=source_lg
                ),
                target=target_lg
            )
        else:
            output_text = transribe_inference(
                input_audio=input_audio,
                model=model,
                compute_type=compute_type,
                language=source_lg
            )
        return output_text