Spaces:
Running
Running
File size: 5,398 Bytes
106478e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
"""
TTS Processor for F5-TTS Thai
จัดการการประมวลผล text-to-speech
"""
import random
import sys
import tempfile
import numpy as np
import gradio as gr
import soundfile as sf
import torchaudio
from f5_tts.infer.utils_infer import (
infer_process,
preprocess_ref_audio_text,
remove_silence_for_generated_wav,
save_spectrogram,
)
from f5_tts.model.utils import seed_everything
from f5_tts.cleantext.number_tha import replace_numbers_with_thai
from f5_tts.cleantext.th_repeat import process_thai_repeat
from f5_tts.utils.whisper_api import translate_inference, transribe_inference
from f5_tts.config import DEFAULT_TTS_SETTINGS
class TTSProcessor:
    """Run F5-TTS Thai text-to-speech inference for the Gradio UI.

    The processor holds no models itself; on every call it asks the
    ``model_manager`` supplied at construction for the TTS model and vocoder.
    """

    def __init__(self, model_manager):
        """Remember the manager that supplies the model and the vocoder.

        Args:
            model_manager: Object exposing ``get_model()`` and
                ``get_vocoder()``.
        """
        self.model_manager = model_manager

    def infer_tts(self,
                  ref_audio_orig,
                  ref_text,
                  gen_text,
                  remove_silence=DEFAULT_TTS_SETTINGS["remove_silence"],
                  cross_fade_duration=DEFAULT_TTS_SETTINGS["cross_fade_duration"],
                  nfe_step=DEFAULT_TTS_SETTINGS["nfe_step"],
                  speed=DEFAULT_TTS_SETTINGS["speed"],
                  cfg_strength=DEFAULT_TTS_SETTINGS["cfg_strength"],
                  max_chars=DEFAULT_TTS_SETTINGS["max_chars"],
                  seed=DEFAULT_TTS_SETTINGS["seed"],
                  no_ref_audio=DEFAULT_TTS_SETTINGS["no_ref_audio"]):
        """Synthesize speech for ``gen_text`` using a reference recording.

        Args:
            ref_audio_orig: Reference audio; a falsy value aborts with a
                warning.
            ref_text: Text associated with the reference audio. It is passed
                through ``preprocess_ref_audio_text`` and the value that
                helper returns is what gets used and returned.
            gen_text: Text to synthesize. ``None``, empty, or
                whitespace-only text aborts with a warning.
            remove_silence: When truthy, the generated waveform is run
                through ``_remove_silence`` before being returned.
            cross_fade_duration: Converted to ``float`` and forwarded to
                ``infer_process``.
            nfe_step: Forwarded to ``infer_process``.
            speed: Forwarded to ``infer_process``.
            cfg_strength: Forwarded to ``infer_process``.
            max_chars: Forwarded to ``infer_process`` as ``set_max_chars``.
            seed: Seed handed to ``seed_everything``. ``-1`` means "draw a
                random seed from ``[0, sys.maxsize]``".
            no_ref_audio: Forwarded to ``infer_process``.

        Returns:
            A 4-tuple ``(audio, spectrogram_path, ref_text, seed)`` where
            ``audio`` is ``(sample_rate, waveform)`` and ``seed`` is the seed
            actually used. When validation fails, the first two items are
            ``gr.update()`` placeholders and ``ref_text`` is returned
            unchanged.
        """
        # Resolve the seed first so that even the early-return paths report
        # the seed that was applied.
        if seed == -1:
            seed = random.randint(0, sys.maxsize)
        seed_everything(seed)
        output_seed = seed

        # Validate inputs.
        if not ref_audio_orig:
            gr.Warning("กรุณาใส่เสียงต้นฉบับ")
            return gr.update(), gr.update(), ref_text, output_seed

        # ``gen_text`` may be None (e.g. a cleared text box); treat that the
        # same as empty text instead of crashing on ``None.strip()``.
        if not gen_text or not gen_text.strip():
            gr.Warning("กรุณาใส่ข้อความที่จะสร้าง")
            return gr.update(), gr.update(), ref_text, output_seed

        # Prepare the reference audio/text pair.
        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text)
        cross_fade_duration = float(cross_fade_duration)

        # Clean the text: numbers are rewritten first, then the Thai
        # repetition helper is applied to the result.
        gen_text_cleaned = process_thai_repeat(replace_numbers_with_thai(gen_text))

        # Fetch the model and vocoder.
        f5tts_model = self.model_manager.get_model()
        vocoder = self.model_manager.get_vocoder()

        # Generate the audio.
        final_wave, final_sample_rate, combined_spectrogram = infer_process(
            ref_audio,
            ref_text,
            gen_text_cleaned,
            f5tts_model,
            vocoder,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            speed=speed,
            progress=gr.Progress(),
            cfg_strength=cfg_strength,
            target_rms=0.1,
            sway_sampling_coef=-1,
            set_max_chars=max_chars,
            no_ref_audio=no_ref_audio,
        )

        # Optionally strip silence.
        if remove_silence:
            final_wave = self._remove_silence(final_wave, final_sample_rate)

        # Save the spectrogram image.
        spectrogram_path = self._save_spectrogram(combined_spectrogram)

        print("seed:", output_seed)
        return (final_sample_rate, final_wave), spectrogram_path, ref_text, output_seed

    def _remove_silence(self, wave, sample_rate):
        """Strip silence from ``wave`` by round-tripping through a WAV file.

        ``remove_silence_for_generated_wav`` works on a file path, so the
        waveform is written to a temporary file, processed, and read back.

        Args:
            wave: Waveform to write with ``soundfile``.
            sample_rate: Sample rate used when writing the file.

        Returns:
            The reloaded waveform, squeezed and converted to a NumPy array.
        """
        import os  # local import: only needed to clean up the temp file

        # Reserve a unique path, then close the handle before other libraries
        # open the same path (an open handle blocks that on Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
            wav_path = tmp_wav.name

        try:
            sf.write(wav_path, wave, sample_rate)
            remove_silence_for_generated_wav(wav_path)
            trimmed_wave, _ = torchaudio.load(wav_path)
        finally:
            # delete=False means nobody else removes this file; without the
            # cleanup every call would leave a WAV behind. Best effort only,
            # so a cleanup failure never masks an error raised above.
            try:
                os.remove(wav_path)
            except OSError:
                pass

        return trimmed_wave.squeeze().cpu().numpy()

    def _save_spectrogram(self, spectrogram):
        """Write ``spectrogram`` to a temporary ``.jpg`` and return its path.

        The file is deliberately not deleted here: the path is returned so
        the caller can use the image.
        """
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_spectrogram:
            spectrogram_path = tmp_spectrogram.name
        save_spectrogram(spectrogram, spectrogram_path)
        return spectrogram_path
class SpeechToTextProcessor:
    """Speech-to-text helpers built on the project's ``whisper_api`` wrappers."""

    @staticmethod
    def transcribe_text(input_audio="",
                        translate=False,
                        model="large-v3-turbo",
                        compute_type="float16",
                        target_lg="th",
                        source_lg="th"):
        """Transcribe audio to text and optionally translate the transcript.

        Args:
            input_audio: Audio input forwarded to ``transribe_inference``.
            translate: When truthy, the transcript is passed through
                ``translate_inference`` before being returned.
            model: Model identifier forwarded to ``transribe_inference``.
            compute_type: Compute type forwarded to ``transribe_inference``.
            target_lg: Target language handed to ``translate_inference``;
                only used when ``translate`` is truthy.
            source_lg: Language handed to ``transribe_inference``.

        Returns:
            The transcript, or its translation when ``translate`` is truthy.
        """
        # Transcription is identical in both modes, so do it exactly once.
        transcript = transribe_inference(
            input_audio=input_audio,
            model=model,
            compute_type=compute_type,
            language=source_lg,
        )

        if not translate:
            return transcript

        return translate_inference(text=transcript, target=target_lg)