import time

import numpy as np
import torch
from transformers import set_seed
from transformers import VitsTokenizer, VitsModel
def synthesize_facebook(s: str, iso3: str) -> np.ndarray:
    """
    Synthesize speech for the given text with a Facebook MMS-TTS (VITS) model.

    The checkpoint ``facebook/mms-tts-{iso3}`` is loaded on every call, so the
    first call for a language may download it and every call pays the model
    loading cost. The elapsed time is printed to stdout.

    Parameters
    ----------
    s : str
        The written text to speak.
    iso3 : str
        The ISO-3 code of the text's language; it selects the checkpoint.

    Returns
    -------
    numpy.ndarray
        The synthesized waveform of the first (only) item in the batch.
        NOTE(review): the sampling rate is not returned; callers presumably
        need ``model.config.sampling_rate`` of the same checkpoint - confirm.
    """
    # Fix the RNG seed so repeated calls with the same input are reproducible.
    set_seed(555)
    # perf_counter is monotonic, unlike time.time(), so the measured duration
    # cannot be skewed by system clock adjustments.
    start = time.perf_counter()

    # Load the tokenizer and the model from the same language checkpoint.
    checkpoint = f"facebook/mms-tts-{iso3}"
    tokenizer = VitsTokenizer.from_pretrained(checkpoint)
    model = VitsModel.from_pretrained(checkpoint)
    inputs = tokenizer(text=s, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # A single text was passed in, so take the first waveform of the batch.
    synth = outputs.waveform[0]

    print("Time elapsed: ", int(time.perf_counter() - start), " seconds")
    return synth.numpy()