"""Synthesize a fixed Arabic sentence with a local VITS checkpoint and save it as WAV."""

import numpy as np
import torch
from scipy.io.wavfile import write

# Directory holding both the model weights and the tokenizer files.
MODEL_PATH = "../arabic-tts"
# Sentence that gets synthesized.
TEXT = "يوفر مجتمع البناء قروضا عقارية وقروض وعقارية"
# Destination of the generated audio.
OUTPUT_PATH = "arabic.wav"

# Largest magnitude used when scaling float samples to 16-bit PCM.
INT16_PEAK = 32767


def float_to_int16_pcm(samples):
    """Convert floating-point audio samples to 16-bit PCM.

    Samples are clipped to [-1.0, 1.0] before being scaled by 32767.
    Without the clip, any sample outside that range would exceed the int16
    range and the integer cast would produce wrapped/garbage values
    (audible as loud clicks).

    Args:
        samples: array-like of float audio samples, nominally in [-1, 1].

    Returns:
        A NumPy ``int16`` array with the same shape as ``samples``.
    """
    clipped = np.clip(np.asarray(samples), -1.0, 1.0)
    return (clipped * INT16_PEAK).astype(np.int16)


def main():
    """Load the model, synthesize ``TEXT`` and write it to ``OUTPUT_PATH``."""
    # Imported here so that the conversion helper above can be imported and
    # used without pulling in the (heavy) transformers package.
    from transformers import AutoTokenizer, VitsModel

    model = VitsModel.from_pretrained(MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    inputs = tokenizer(TEXT, return_tensors="pt")

    # Pure inference: gradients are not needed.
    with torch.no_grad():
        output = model(**inputs).waveform

    # Drop singleton dimensions and move the samples to a NumPy array.
    output_np = output.squeeze().cpu().numpy()
    output_int16 = float_to_int16_pcm(output_np)

    # The sampling rate is taken from the model's own configuration.
    write(OUTPUT_PATH, rate=model.config.sampling_rate, data=output_int16)


if __name__ == "__main__":
    main()