import numpy as np
import torch
from scipy.io.wavfile import write
from transformers import AutoTokenizer, VitsModel
# Directory holding the fine-tuned VITS checkpoint and its tokenizer files.
MODEL_DIR = "../urdu-tts"
# Destination of the synthesized audio.
OUTPUT_PATH = "urdu.wav"
# Sentence to synthesize.
TEXT = "ہیلو، آپ کیسے ہیں؟ میں ٹیکس آفس سے کال کر رہا ہوں۔"
# Scale factor mapping a float sample of 1.0 to the largest int16 value.
INT16_MAX = 32767


def float_to_int16(samples):
    """Convert float audio samples to 16-bit PCM.

    Samples are clipped to [-1.0, 1.0] before scaling. Without the clip, any
    value outside that range overflows when cast to int16 and wraps around,
    which is audible as loud clicks in the written file.

    Args:
        samples: Array-like of float samples, nominally in [-1.0, 1.0].

    Returns:
        A NumPy array of dtype int16 with the same shape as *samples*.
    """
    clipped = np.clip(np.asarray(samples), -1.0, 1.0)
    # astype truncates toward zero, matching the behavior for in-range input.
    return (clipped * INT16_MAX).astype(np.int16)


def synthesize(text, model, tokenizer):
    """Run *model* on *text* and return the generated waveform.

    Args:
        text: The sentence to synthesize.
        model: A loaded model whose forward output exposes ``.waveform``.
        tokenizer: The tokenizer matching *model*.

    Returns:
        The waveform as a float NumPy array with size-1 dimensions removed.
    """
    inputs = tokenizer(text, return_tensors="pt")
    # Force token ids to int64 before they reach the model.
    inputs["input_ids"] = inputs["input_ids"].long()
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        waveform = model(**inputs).waveform
    return waveform.squeeze().cpu().numpy()


def main():
    """Load the model, synthesize TEXT and write it to OUTPUT_PATH."""
    model = VitsModel.from_pretrained(MODEL_DIR)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    samples = synthesize(TEXT, model, tokenizer)
    write(
        OUTPUT_PATH,
        rate=model.config.sampling_rate,
        data=float_to_int16(samples),
    )


if __name__ == "__main__":
    main()