indonesian-tts / usage.py
datasetsANDmodels's picture
Upload usage.py
2f9d6f5 verified
raw
history blame contribute delete
714 Bytes
import numpy as np
import scipy
import scipy.io.wavfile  # "import scipy" alone does not load the scipy.io.wavfile submodule
import torch

from transformers import VitsModel, AutoTokenizer
# Synthesize Indonesian speech from text with a pretrained VITS model and
# save the waveform as a WAV file.
# NOTE(review): "indonesian-tts" must resolve as a local directory or a Hub
# repo id — confirm against the deployment environment.
model = VitsModel.from_pretrained("indonesian-tts")
tokenizer = AutoTokenizer.from_pretrained("indonesian-tts")

text = "Saya menelepon dari kantor pajak."
inputs = tokenizer(text, return_tensors="pt")

# Inference only — disable gradient tracking to save memory and time.
with torch.no_grad():
    output = model(**inputs).waveform

output_np = output.squeeze().numpy()  # drop batch/channel dims -> 1-D float array

# Peak-normalize into [-1, 1] to prevent clipping. Guard against an
# all-zero (silent) waveform, which would otherwise divide by zero
# and fill the output with NaNs.
peak = np.max(np.abs(output_np))
if peak > 0:
    output_np = output_np / peak

# Float data in [-1, 1] is written as a floating-point WAV by scipy.
scipy.io.wavfile.write(
    "indonesian.wav",
    rate=model.config.sampling_rate,
    data=output_np,
)