"""Synthesize Vietnamese speech from text with a VITS model and save it as WAV.

Loads the "male-vietnamese-tts" checkpoint, runs text-to-speech inference on a
fixed Vietnamese sentence, and writes the waveform to ``vietnamese.wav``.
"""

from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile as wavfile
import numpy as np


def main() -> None:
    """Run TTS inference on a fixed sentence and write the result to disk.

    Side effects: downloads/loads the pretrained model and tokenizer, and
    writes ``vietnamese.wav`` to the current working directory.
    """
    model = VitsModel.from_pretrained("male-vietnamese-tts")
    tokenizer = AutoTokenizer.from_pretrained("male-vietnamese-tts")

    text = "Xin chào, đây là một cuộc chạy thử nghiệm."
    inputs = tokenizer(text, return_tensors="pt")

    # Inference only: disable autograd tracking to save memory and time.
    with torch.no_grad():
        output = model(**inputs).waveform

    # VITS returns a (batch, samples) tensor; squeeze to a 1-D numpy array
    # on the CPU, which is what scipy's WAV writer expects.
    wav = output.squeeze().cpu().numpy()
    # Use the model's native sampling rate so playback speed is correct.
    rate = model.config.sampling_rate

    output_file = "vietnamese.wav"
    wavfile.write(output_file, rate, wav)
    print(f"Audio saved to {output_file}")


if __name__ == "__main__":
    main()