Spaces:
Runtime error
Runtime error
| import librosa | |
| import numpy as np | |
| import torch | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
| import gradio as gr | |
| import librosa | |
| import numpy as np | |
| import torch | |
# Hub identifier shared by the text processor and the acoustic model.
checkpoint = "microsoft/speecht5_tts"

# The vocoder is loaded from its own repository and is handed to
# `model.generate_speech` at inference time.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Processor and model are both loaded from the same `checkpoint`.
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
processor = SpeechT5Processor.from_pretrained(checkpoint)
# Maps each label offered by the "Speaker" radio input to the .npy file that
# stores that speaker's embedding.
_SPEAKER_EMBEDDING_FILES = {
    "KSP (male)": "cmu_us_ksp_arctic-wav-arctic_b0087.npy",
}
_DEFAULT_SPEAKER = "KSP (male)"


def predict(text, speaker=None):
    """Synthesize speech for ``text`` and return it as 16-bit PCM samples.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only text
            yields an empty clip without touching the model.
        speaker: Label chosen in the "Speaker" radio input. The interface
            declares two inputs, so Gradio passes this as a second positional
            argument; without this parameter every request failed with a
            ``TypeError``. Unknown labels and ``None`` fall back to the
            default speaker, so existing one-argument callers keep working.

    Returns:
        A ``(16000, samples)`` tuple where ``samples`` is an ``int16`` NumPy
        array. The tuple is consumed by the ``gr.Audio(type="numpy")`` output
        (presumably as ``(sample_rate, data)`` - see the Gradio Audio docs).
    """
    if not text or not text.strip():
        return (16000, np.zeros(0).astype(np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # limit input length
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    embedding_file = _SPEAKER_EMBEDDING_FILES.get(
        speaker, _SPEAKER_EMBEDDING_FILES[_DEFAULT_SPEAKER]
    )
    # unsqueeze(0) adds the leading dimension expected alongside `input_ids`.
    speaker_embedding = torch.tensor(np.load(embedding_file)).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Clip before scaling so that any sample outside [-1, 1] saturates instead
    # of wrapping around when cast to int16.
    samples = np.clip(speech.numpy(), -1.0, 1.0)
    return (16000, (samples * 32767).astype(np.int16))
# Input components: free-form text plus a speaker selector.
text_input = gr.Text(label="Input Text")
speaker_input = gr.Radio(
    label="Speaker",
    choices=["KSP (male)"],
    value="KSP (male)",
)

# Output component: audio supplied by `predict` as a NumPy-based value.
speech_output = gr.Audio(label="Generated Speech", type="numpy")

# Wire the components to `predict` and start the app.
gr.Interface(
    fn=predict,
    inputs=[text_input, speaker_input],
    outputs=[speech_output],
).launch()