"""Gradio demo: text-to-speech with the ``facebook/mms-tts-bod`` VITS checkpoint.

Loads the checkpoint once at import time and exposes a single text box
whose contents are synthesized to audio.
"""

import gradio as gr
import scipy.io.wavfile
import torch
from transformers import AutoProcessor, VitsModel

# Hugging Face Hub id of the checkpoint used for synthesis.
model_id = "facebook/mms-tts-bod"

# Loaded once at import time so each request only pays for inference.
processor = AutoProcessor.from_pretrained(model_id)
model = VitsModel.from_pretrained(model_id)
model.eval()  # inference only: switch off training-time behavior


def tts_fn(text: str):
    """Synthesize speech for *text* with the loaded VITS model.

    Args:
        text: The text to speak, as typed into the Gradio text box.

    Returns:
        A ``(sample_rate, audio)`` tuple in the form Gradio's audio output
        accepts, where ``audio`` is a NumPy array, or ``None`` when *text*
        is empty or only whitespace (Gradio then shows no audio).
    """
    # Nothing to synthesize: skip the model instead of feeding it an
    # empty sequence.
    if not text or not text.strip():
        return None

    inputs = processor(text=text, return_tensors="pt")

    # Gradients are never needed for inference; disabling them saves memory.
    with torch.no_grad():
        output = model(**inputs)

    # squeeze() drops size-1 dimensions so a single utterance becomes a
    # flat array (assumes waveform is (batch, samples) with batch == 1).
    audio = output.waveform.squeeze().numpy()
    sample_rate = model.config.sampling_rate
    return (sample_rate, audio)


demo = gr.Interface(
    fn=tts_fn,
    # The label is user-facing Vietnamese text ("Enter Tibetan text").
    inputs=gr.Textbox(label="Nhập văn bản tiếng Tây Tạng"),
    outputs="audio",
)

if __name__ == "__main__":
    # Only start the web server when run as a script, so importing this
    # module (for reuse or testing) does not block on a running server.
    demo.launch()