from typing import Optional, Tuple

import gradio as gr
import numpy as np
import scipy.io.wavfile
import torch
from transformers import AutoProcessor, VitsModel
# Checkpoint identifier handed to both from_pretrained() calls below.
model_id = "facebook/mms-tts-bod"
# Loaded once at import time so every request reuses the same objects.
processor = AutoProcessor.from_pretrained(model_id)
model = VitsModel.from_pretrained(model_id)
# Evaluation mode: the model is only used for inference in this script.
model.eval()
def tts_fn(text: Optional[str]) -> Optional[Tuple[int, np.ndarray]]:
    """Synthesize speech for *text* with the module-level VITS model.

    Args:
        text: Text typed into the Gradio textbox.

    Returns:
        A ``(sample_rate, waveform)`` pair, the form handed to the Gradio
        audio output, or ``None`` when *text* is empty or whitespace-only so
        the audio output is simply left empty.
    """
    # Nothing to synthesize: skip the model rather than feeding it an empty
    # token sequence.
    if not text or not text.strip():
        return None

    inputs = processor(text=text, return_tensors="pt")
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        output = model(**inputs)
    # Drop size-1 dimensions and hand the samples over as a NumPy array.
    audio = output.waveform.squeeze().numpy()
    sample_rate = model.config.sampling_rate
    return (sample_rate, audio)
# Single-textbox UI: the typed text is passed to tts_fn and its result is
# rendered by an audio output component.
demo = gr.Interface(
    fn=tts_fn,
    inputs=gr.Textbox(label="Nhập văn bản tiếng Tây Tạng"),
    outputs="audio",
)
demo.launch()