File size: 714 Bytes
2f9d6f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
from transformers import VitsModel, AutoTokenizer
import torch
import scipy
import numpy as np

# The model and its tokenizer are both loaded from the same checkpoint
# identifier, so keep it in one place.
checkpoint = "indonesian-tts"
model = VitsModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Sentence to synthesize; return_tensors="pt" asks the tokenizer for PyTorch
# tensors so the result can be unpacked straight into the model call.
text = "Saya menelepon dari kantor pajak."
inputs = tokenizer(text, return_tensors="pt")

# Run the model with gradient tracking disabled; only the generated waveform
# is needed from the forward pass.
with torch.no_grad():
    output = model(**inputs).waveform

# Drop singleton dimensions and convert the tensor to a NumPy array.
output_np = output.squeeze().numpy()

# Peak-normalize so the loudest sample has magnitude 1 (prevents clipping).
# The division is skipped when the waveform is empty or entirely silent:
# np.max raises on an empty array, and dividing by a zero peak would fill the
# array with NaNs.
peak = np.max(np.abs(output_np)) if output_np.size else 0.0
if peak > 0:
    output_np = output_np / peak
            
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.io.wavfile` is loaded on every SciPy release.
import scipy.io.wavfile

# Write the waveform to a WAV file at the sampling rate stored in the model
# configuration.
scipy.io.wavfile.write(
    "indonesian.wav",
    rate=model.config.sampling_rate,
    data=output_np,
)