# text-to-speech / app.py
# T.Masuda
# update app.py
# b243a68
import gradio as gr
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
#import soundfile as sf
from datasets import load_dataset
# Pretrained checkpoints are loaded once at import time so that every
# request handled by the app reuses the same processor, model and vocoder.
_TTS_CHECKPOINT = 'microsoft/speecht5_tts'
_VOCODER_CHECKPOINT = 'microsoft/speecht5_hifigan'

processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)
# Row of the x-vector dataset whose embedding is used as the speaker voice.
_SPEAKER_INDEX = 7306
# Sample rate paired with the generated waveform in the output tuple.
_SAMPLE_RATE = 16000
# Filled in by _get_speaker_embeddings() on first use; None until then.
_speaker_embeddings = None


def _get_speaker_embeddings():
    """Return the speaker embedding tensor, loading the dataset only once."""
    global _speaker_embeddings
    if _speaker_embeddings is None:
        # Loading the dataset is expensive and its result never changes, so
        # do it on the first request only and reuse the tensor afterwards.
        # A concurrent first call may load it twice; both results are equal.
        embeddings_dataset = load_dataset(
            'Matthijs/cmu-arctic-xvectors', split='validation'
        )
        _speaker_embeddings = torch.tensor(
            embeddings_dataset[_SPEAKER_INDEX]['xvector']
        ).unsqueeze(0)
    return _speaker_embeddings


def text_to_speech(text):
    """Generate speech audio for *text* with the module-level SpeechT5 model.

    This is a generator that yields exactly one item.

    Args:
        text: The text to synthesize; may be ``None``.

    Yields:
        ``None`` when *text* is ``None`` or contains only whitespace;
        otherwise a ``(sample_rate, waveform)`` tuple where ``sample_rate``
        is 16000 and ``waveform`` is the generated speech converted with
        ``.numpy()``.
    """
    # Nothing to synthesize: emit an empty result without touching the model.
    if text is None or not text.strip():
        yield None
        return

    inputs = processor(text=text, return_tensors='pt')
    speech = model.generate_speech(
        inputs["input_ids"], _get_speaker_embeddings(), vocoder=vocoder
    )
    yield (_SAMPLE_RATE, speech.numpy())
# Build the UI: a single text box feeding text_to_speech, with the yielded
# audio shown in an audio component.
text_input = gr.Textbox(label='text(english)')
audio_output = gr.Audio()

app = gr.Interface(
    fn=text_to_speech,
    inputs=text_input,
    outputs=audio_output,
    title='Text To Speech',
    flagging_mode='never',
    concurrency_limit=20,
)

# Start the Gradio server as soon as the script is run.
app.launch()