"""Gradio demo that synthesizes Meadow Mari speech with a VITS checkpoint.

The tokenizer and model named by ``model_id`` are loaded once at import time
and reused for every request handled by the Gradio interface.
"""

import tempfile

import gradio as gr
import soundfile as sf
import torch
from transformers import VitsModel, VitsTokenizer

model_id = "facebook/mms-tts-mhr"

# Loaded once at module import so that each request only pays for inference.
tokenizer = VitsTokenizer.from_pretrained(model_id)
model = VitsModel.from_pretrained(model_id)


def tts_mari(text: str) -> str:
    """Synthesize speech for ``text`` and return the path of the WAV file.

    Args:
        text: The text to convert to speech.

    Returns:
        Path to a newly created ``.wav`` file containing the waveform.

    Raises:
        gr.Error: If ``text`` is ``None``, empty, or only whitespace.
    """
    # Reject blank input up front so the user gets a readable message in the
    # UI instead of whatever the tokenizer/model does with an empty sequence.
    if text is None or not text.strip():
        raise gr.Error("Vui lòng nhập văn bản.")

    inputs = tokenizer(text, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(**inputs)

    audio = output.waveform.squeeze().cpu().numpy()

    # Each call gets its own file: a single shared "output.wav" would be
    # overwritten when several requests are served at the same time.
    # delete=False keeps the file on disk after the handle is closed so the
    # returned path stays valid for the caller; files are not cleaned up here.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        output_path = handle.name

    # Use the rate stored in the checkpoint's config rather than a hard-coded
    # constant, so the audio plays at the speed the model generated it for.
    sf.write(output_path, audio, model.config.sampling_rate)
    return output_path


interface = gr.Interface(
    fn=tts_mari,
    inputs=gr.Textbox(label="Nhập văn bản Meadow Mari"),
    outputs=gr.Audio(type="filepath", label="Kết quả TTS"),
    title="Meadow Mari TTS - Powered by Facebook VITS",
)

if __name__ == "__main__":
    interface.launch()