"""Synthesize one Spanish sentence with a VITS model and save it as a WAV file."""

from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile as wavfile

# Single source of truth for the checkpoint, so the model and its tokenizer
# can never be loaded from two different ids by accident.
MODEL_ID = "spanish-tts"
OUTPUT_PATH = "spanish.wav"

model = VitsModel.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

text = "Estoy llamando desde la oficina de impuestos."
inputs = tokenizer(text, return_tensors="pt")

# Inference only: no autograd graph is needed.
with torch.no_grad():
    output = model(**inputs).waveform

# Output is a tensor of shape [batch_size, waveform_length]; drop the batch
# dimension so the WAV writer receives a 1-D array of samples.
# `.cpu()` and `.float()` are no-ops for a float32 CPU tensor, but keep the
# NumPy conversion valid if the model is moved to another device or loaded
# in reduced precision.
output_audio = output.squeeze(0).cpu().float().numpy()

# The sample rate is read from the model config rather than hard-coded.
wavfile.write(OUTPUT_PATH, rate=model.config.sampling_rate, data=output_audio)