"""Synthesize Vietnamese speech with a VITS text-to-speech model.

Loads the "male-vietnamese-tts" checkpoint and its tokenizer, runs a
single sentence through the model, and writes the resulting waveform to
``vietnamese.wav`` at the model's native sampling rate.
"""

from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile as wavfile
import numpy as np


def main() -> None:
    """Run one TTS inference and save the audio to a WAV file."""
    # Model and tokenizer must come from the same checkpoint so the
    # token ids match the embedding table.
    model = VitsModel.from_pretrained("male-vietnamese-tts")
    tokenizer = AutoTokenizer.from_pretrained("male-vietnamese-tts")

    text = "Xin chào, đây là một cuộc chạy thử nghiệm."
    inputs = tokenizer(text, return_tensors="pt")

    # Inference only — disable autograd to avoid building a graph.
    with torch.no_grad():
        output = model(**inputs).waveform

    # VITS returns a (batch, samples) float tensor; drop the batch dim
    # and move to a NumPy array for scipy. scipy writes float arrays as
    # 32-bit float WAV, which the model's float output is suited for.
    wav = output.squeeze().cpu().numpy().astype(np.float32)
    rate = model.config.sampling_rate

    output_file = "vietnamese.wav"
    wavfile.write(output_file, rate, wav)
    print(f"Audio saved to {output_file}")


if __name__ == "__main__":
    main()