"""Gradio demo that synthesizes speech from text with a pretrained VITS model.

The tokenizer and model named by ``MODEL_NAME`` are loaded once at import
time; each request runs them on the submitted text and returns the path of a
freshly written WAV file.
"""

import tempfile

import gradio as gr
import soundfile as sf
import torch
from transformers import (
    AutoTokenizer,
    VitsModel,
)

MODEL_NAME = "facebook/mms-tts-nan"

# Loaded once at module import so every request reuses the same objects.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = VitsModel.from_pretrained(MODEL_NAME)


def tts(text: str) -> str:
    """Synthesize *text* to speech and return the path of the WAV file.

    Args:
        text: The text to convert to speech.

    Returns:
        Filesystem path of a ``.wav`` temporary file containing the audio,
        written at the model's configured sampling rate. The file is created
        with ``delete=False`` and is not removed by this function.
    """
    inputs = tokenizer(text, return_tensors="pt")

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output = model(**inputs)

    # Drop singleton dimensions and move to a NumPy array for soundfile.
    waveform = output.waveform.squeeze().cpu().numpy()

    # Reserve a unique path and close the handle right away. Keeping it open
    # would leak a file descriptor on every call and would prevent
    # ``sf.write`` from reopening the path on platforms that do not allow a
    # second open of a still-open temporary file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as file:
        wav_path = file.name

    sf.write(wav_path, waveform, model.config.sampling_rate)

    return wav_path


demo = gr.Interface(
    fn=tts,
    inputs=gr.Textbox(label="輸入文字"),
    outputs=gr.Audio(type="filepath"),
    title="Taiwanese TTS",
)

demo.launch()