import functools

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
| |
|
| |
|
# Name of the pretrained checkpoint shared by the processor and the TTS model.
checkpoint = "microsoft/speecht5_tts"

# Text front end: predict() calls it to turn a string into "input_ids".
processor = SpeechT5Processor.from_pretrained(checkpoint)

# Text-to-speech model whose generate_speech() is invoked by predict().
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

# Vocoder handed to generate_speech(); loaded from its own checkpoint.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
| |
|
# Sample rate reported alongside every waveform returned by predict().
_SAMPLE_RATE = 16000

# File holding the speaker embedding used for every synthesis request.
_SPEAKER_EMBEDDING_PATH = "cmu_us_bdl_arctic-wav-arctic_a0009.npy"


@functools.lru_cache(maxsize=1)
def _load_speaker_embedding():
    """Load the speaker embedding once and return it with a leading axis added.

    The result is cached so the .npy file is read from disk only on the
    first request instead of on every call to predict().
    """
    embedding = np.load(_SPEAKER_EMBEDDING_PATH)
    return torch.tensor(embedding).unsqueeze(0)


def predict(text):
    """Synthesize speech for ``text`` and return it as 16-bit PCM.

    Args:
        text: The string to speak. ``None``, an empty string, or a string
            containing only whitespace produces an empty waveform.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000
        and ``samples`` is a NumPy ``int16`` array.
    """
    # Nothing to synthesize: return an empty clip without touching the model.
    if text is None or not text.strip():
        return (_SAMPLE_RATE, np.zeros(0, dtype=np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # Drop any tokens beyond the model's configured maximum text positions.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    speech = model.generate_speech(
        input_ids,
        _load_speaker_embedding(),
        vocoder=vocoder,
    )

    # Scale to the int16 range. The clip is defensive: it is a no-op for
    # samples already within [-1, 1] and prevents integer wraparound should
    # any sample ever fall outside that range.
    scaled = speech.numpy() * 32767
    pcm = np.clip(scaled, -32768, 32767).astype(np.int16)
    return (_SAMPLE_RATE, pcm)
| |
|
# Gradio interface: a text input fed to predict(), with the result shown by
# an Audio component configured with type="numpy".
demo = gr.Interface(fn=predict, inputs="text", outputs=gr.Audio(type="numpy"))

# Start serving the interface.
demo.launch()