"""Synthesize a fixed sentence with the "indonesian-tts" VITS model.

The script loads the model and tokenizer, runs inference on one hard-coded
sentence, peak-normalizes the resulting waveform and writes it to
``indonesian.wav`` at the model's configured sampling rate.
"""

import numpy as np
# Import the submodule explicitly: a bare `import scipy` is not guaranteed
# to make `scipy.io.wavfile` available on every SciPy version.
import scipy.io.wavfile
import torch
from transformers import AutoTokenizer, VitsModel

model = VitsModel.from_pretrained("indonesian-tts")
tokenizer = AutoTokenizer.from_pretrained("indonesian-tts")

text = "Saya menelepon dari kantor pajak."
inputs = tokenizer(text, return_tensors="pt")

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    output = model(**inputs).waveform

output_np = output.squeeze().numpy()  # Remove extra dimensions

# Normalize to prevent clipping. Skip the division when the waveform is
# entirely zero (or the peak is NaN); dividing by a zero peak would fill
# the output with NaNs.
peak = np.max(np.abs(output_np))
if peak > 0:
    output_np = output_np / peak

scipy.io.wavfile.write(
    "indonesian.wav",
    rate=model.config.sampling_rate,
    data=output_np,
)