# F5-TTS-THAI — src/f5_tts/tts_processor.py
# (uploaded via huggingface_hub, commit 106478e)
"""
TTS Processor for F5-TTS Thai
จัดการการประมวลผล text-to-speech
"""
import os
import random
import sys
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio

from f5_tts.cleantext.number_tha import replace_numbers_with_thai
from f5_tts.cleantext.th_repeat import process_thai_repeat
from f5_tts.config import DEFAULT_TTS_SETTINGS
from f5_tts.infer.utils_infer import (
    infer_process,
    preprocess_ref_audio_text,
    remove_silence_for_generated_wav,
    save_spectrogram,
)
from f5_tts.model.utils import seed_everything
from f5_tts.utils.whisper_api import translate_inference, transribe_inference
class TTSProcessor:
    """Handles Text-to-Speech processing for the F5-TTS Thai model.

    Thin orchestration layer: validates Gradio inputs, cleans Thai text,
    pulls the model/vocoder from ``model_manager`` and runs
    ``infer_process``. Designed to be wired directly to Gradio callbacks
    (it emits ``gr.Warning`` / ``gr.update`` instead of raising).
    """

    def __init__(self, model_manager):
        # model_manager must expose get_model() and get_vocoder();
        # models are fetched lazily per call in infer_tts.
        self.model_manager = model_manager

    def infer_tts(self,
                  ref_audio_orig,
                  ref_text,
                  gen_text,
                  remove_silence=DEFAULT_TTS_SETTINGS["remove_silence"],
                  cross_fade_duration=DEFAULT_TTS_SETTINGS["cross_fade_duration"],
                  nfe_step=DEFAULT_TTS_SETTINGS["nfe_step"],
                  speed=DEFAULT_TTS_SETTINGS["speed"],
                  cfg_strength=DEFAULT_TTS_SETTINGS["cfg_strength"],
                  max_chars=DEFAULT_TTS_SETTINGS["max_chars"],
                  seed=DEFAULT_TTS_SETTINGS["seed"],
                  no_ref_audio=DEFAULT_TTS_SETTINGS["no_ref_audio"]):
        """Run TTS inference and return Gradio-ready outputs.

        Args:
            ref_audio_orig: Path/handle of the reference audio clip.
            ref_text: Transcript of the reference audio (may be refined
                by ``preprocess_ref_audio_text``).
            gen_text: Text to synthesize (Thai numbers/repeats are
                normalized before inference).
            remove_silence: Strip silence from the generated waveform.
            cross_fade_duration: Cross-fade (seconds) between chunks.
            nfe_step / speed / cfg_strength / max_chars / no_ref_audio:
                Passed through to ``infer_process``.
            seed: RNG seed; -1 picks a random seed.

        Returns:
            ((sample_rate, waveform), spectrogram_path, ref_text, seed).
            On invalid input, returns ``gr.update()`` placeholders for
            the audio/spectrogram slots instead.
        """
        # Resolve the seed: -1 means "pick one at random"; the chosen
        # seed is returned so the user can reproduce the result.
        if seed == -1:
            seed = random.randint(0, sys.maxsize)
        seed_everything(seed)
        output_seed = seed
        # Validate inputs: warn via Gradio and leave outputs untouched.
        if not ref_audio_orig:
            gr.Warning("กรุณาใส่เสียงต้นฉบับ")
            return gr.update(), gr.update(), ref_text, output_seed
        # Guard against None as well as empty/whitespace-only text
        # (None.strip() would raise AttributeError).
        if not gen_text or not gen_text.strip():
            gr.Warning("กรุณาใส่ข้อความที่จะสร้าง")
            return gr.update(), gr.update(), ref_text, output_seed
        # Prepare reference audio/text (may auto-transcribe ref_text).
        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text)
        cross_fade_duration = float(cross_fade_duration)
        # Normalize Thai text: spell out digits, expand repeat marks (ๆ).
        gen_text_cleaned = process_thai_repeat(replace_numbers_with_thai(gen_text))
        # Fetch model + vocoder from the manager.
        f5tts_model = self.model_manager.get_model()
        vocoder = self.model_manager.get_vocoder()
        # Synthesize audio.
        final_wave, final_sample_rate, combined_spectrogram = infer_process(
            ref_audio,
            ref_text,
            gen_text_cleaned,
            f5tts_model,
            vocoder,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            speed=speed,
            progress=gr.Progress(),
            cfg_strength=cfg_strength,
            target_rms=0.1,
            sway_sampling_coef=-1,
            set_max_chars=max_chars,
            no_ref_audio=no_ref_audio
        )
        # Optionally strip silence from the result.
        if remove_silence:
            final_wave = self._remove_silence(final_wave, final_sample_rate)
        # Save the spectrogram image for the Gradio image output.
        spectrogram_path = self._save_spectrogram(combined_spectrogram)
        print("seed:", output_seed)
        return (final_sample_rate, final_wave), spectrogram_path, ref_text, output_seed

    def _remove_silence(self, wave, sample_rate):
        """Strip silence from *wave*, returning a 1-D numpy array.

        ``remove_silence_for_generated_wav`` works on files, so the wave
        is round-tripped through a temp wav. The file is closed before
        writing (required on Windows, where an open NamedTemporaryFile
        cannot be reopened) and always deleted afterwards so repeated
        calls do not leak temp files.
        """
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            tmp_path = f.name
        try:
            sf.write(tmp_path, wave, sample_rate)
            remove_silence_for_generated_wav(tmp_path)  # edits file in place
            final_wave, _ = torchaudio.load(tmp_path)
        finally:
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # best-effort cleanup; never mask the real error
        return final_wave.squeeze().cpu().numpy()

    def _save_spectrogram(self, spectrogram):
        """Save *spectrogram* to a temp .jpg and return its path.

        delete=False is intentional here: the path is handed to Gradio,
        which reads the file after this method returns.
        """
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_spectrogram:
            spectrogram_path = tmp_spectrogram.name
        save_spectrogram(spectrogram, spectrogram_path)
        return spectrogram_path
class SpeechToTextProcessor:
    """Handles Speech-to-Text processing via the Whisper API helpers."""

    @staticmethod
    def transcribe_text(input_audio="",
                        translate=False,
                        model="large-v3-turbo",
                        compute_type="float16",
                        target_lg="th",
                        source_lg='th'):
        """Transcribe audio to text, optionally translating the result.

        Args:
            input_audio: Audio to transcribe.
            translate: If True, translate the transcription to *target_lg*.
            model: Whisper model name.
            compute_type: Numeric precision for inference (e.g. "float16").
            target_lg: Translation target language code.
            source_lg: Spoken language of the audio.

        Returns:
            The transcription, translated when *translate* is set.
        """
        # Transcription happens in both branches, so do it once up front.
        transcription = transribe_inference(
            input_audio=input_audio,
            model=model,
            compute_type=compute_type,
            language=source_lg,
        )
        if not translate:
            return transcription
        return translate_inference(text=transcription, target=target_lg)