"""Synthesize one Spanish sentence with a VITS model and save it as a WAV file."""
from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile as wavfile

# The model and its tokenizer are loaded from the same checkpoint identifier.
model = VitsModel.from_pretrained("spanish-tts")
tokenizer = AutoTokenizer.from_pretrained("spanish-tts")

text = "Estoy llamando desde la oficina de impuestos."
inputs = tokenizer(text, return_tensors="pt")

# Inference only: disable gradient tracking for the forward pass.
with torch.no_grad():
    # Output is a tensor of shape [batch_size, waveform_length].
    output = model(**inputs).waveform

# Remove the batch dimension and move to host memory before converting to
# NumPy (`.cpu()` is a no-op when the tensor already lives on the CPU).
output_audio = output.squeeze(0).cpu().numpy()

# The sample rate is read from the model config rather than hard-coded.
wavfile.write("spanish.wav", rate=model.config.sampling_rate, data=output_audio)