import numpy as np
import scipy
import scipy.io.wavfile
import torch
from transformers import AutoTokenizer, VitsModel

# Model identifier (hub name or local directory) handed to ``from_pretrained``.
MODEL_ID = "indonesian-tts"
# Sentence that gets synthesized when the script is run directly.
TEXT = "Saya menelepon dari kantor pajak."
# Destination of the generated audio.
OUTPUT_PATH = "indonesian.wav"


def normalize_peak(waveform: np.ndarray) -> np.ndarray:
    """Scale *waveform* so that its largest absolute sample equals 1.0.

    Args:
        waveform: Array of audio samples.

    Returns:
        The rescaled samples. An empty or all-zero (silent) input is
        returned unchanged, because dividing by a zero peak would fill the
        output with NaNs.
    """
    samples = np.asarray(waveform)
    if samples.size == 0:
        return samples
    peak = np.max(np.abs(samples))
    if peak == 0:
        return samples
    return samples / peak


def synthesize(text: str, model_id: str = MODEL_ID):
    """Run the VITS model on *text* and return the generated audio.

    Args:
        text: Sentence to convert to speech.
        model_id: Identifier passed to ``VitsModel.from_pretrained`` and
            ``AutoTokenizer.from_pretrained``.

    Returns:
        A ``(samples, sampling_rate)`` pair: the squeezed waveform as a
        NumPy array and the rate read from ``model.config.sampling_rate``.
    """
    model = VitsModel.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    inputs = tokenizer(text, return_tensors="pt")

    # Inference only: no autograd graph is needed, and ``.numpy()`` requires
    # a tensor that does not track gradients.
    with torch.no_grad():
        waveform = model(**inputs).waveform
        samples = waveform.squeeze().numpy()

    return samples, model.config.sampling_rate


def main() -> None:
    """Synthesize ``TEXT`` and write the peak-normalized audio to ``OUTPUT_PATH``."""
    samples, sampling_rate = synthesize(TEXT)
    scipy.io.wavfile.write(
        OUTPUT_PATH,
        rate=sampling_rate,
        data=normalize_peak(samples),
    )


if __name__ == "__main__":
    main()