|
|
import gradio as gr |
|
|
import torch |
|
|
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
def load_model():
    """Fetch the pretrained SpeechT5 components used for synthesis.

    Returns:
        A ``(processor, model, vocoder)`` tuple: the processor and the
        text-to-speech model, both loaded from ``microsoft/speecht5_tts``,
        and the vocoder loaded from ``microsoft/speecht5_hifigan``.
    """
    tts_checkpoint = "microsoft/speecht5_tts"
    vocoder_checkpoint = "microsoft/speecht5_hifigan"

    # Tuple elements are evaluated left to right, so the components are
    # loaded in the order processor -> model -> vocoder.
    return (
        SpeechT5Processor.from_pretrained(tts_checkpoint),
        SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint),
        SpeechT5HifiGan.from_pretrained(vocoder_checkpoint),
    )
|
|
|
|
|
|
|
|
def text_to_speech(text, processor, model, vocoder):
    """Convert text to a peak-normalized waveform using a SpeechT5 model.

    Args:
        text: The text to synthesize.
        processor: Callable invoked as ``processor(text=..., return_tensors="pt")``;
            its result must expose ``"input_ids"``.
        model: Object providing
            ``generate_speech(input_ids, speaker_embeddings=..., vocoder=...)``.
        vocoder: Vocoder forwarded unchanged to ``model.generate_speech``.

    Returns:
        A ``(speech, sample_rate)`` tuple. ``speech`` is a NumPy array scaled
        so that its peak absolute amplitude is 0.8; a waveform whose peak is
        zero (pure silence) is returned unscaled instead of being turned into
        NaNs. ``sample_rate`` is always 16000.

    Raises:
        gr.Error: If any step of the synthesis fails. The underlying
            exception is chained as the cause.
    """
    try:
        inputs = processor(text=text, return_tensors="pt")

        # All-zero placeholder speaker embedding of shape (1, 512).
        # NOTE(review): a real speaker x-vector would presumably give more
        # natural-sounding output than a zero vector -- confirm against the
        # SpeechT5 documentation before changing.
        speaker_embeddings = torch.zeros((1, 512))

        # Inference only: no gradients are needed.
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embeddings,
                vocoder=vocoder,
            )

        speech = speech.cpu().numpy().squeeze()
        if speech.size == 0:
            # np.max() on an empty array fails with an opaque reduction
            # error; report the actual problem instead.
            raise ValueError("model returned an empty waveform")

        # Normalize to a peak of 0.8. Skip the division when the peak is
        # zero, otherwise 0/0 would fill the waveform with NaN.
        peak = np.max(np.abs(speech))
        if peak > 0:
            speech = speech / peak * 0.8

        return speech, 16000
    except Exception as e:
        raise gr.Error(f"Error generating speech: {str(e)}") from e
|
|
|
|
|
|
|
|
def main():
    """Load the SpeechT5 components and build the Gradio text-to-speech UI.

    Returns:
        The ``gr.Blocks`` application. It is not launched here; the caller
        is expected to call ``launch()`` on it.
    """
    print("Loading Microsoft SpeechT5 model...")
    processor, model, vocoder = load_model()
    print("Model loaded successfully!")

    def generate_speech(text):
        """Generate speech from text for the UI callbacks.

        Returns:
            An ``(audio, status)`` pair. On success ``audio`` is a
            ``(sample_rate, waveform)`` tuple and ``status`` confirms the
            input text. On empty input or failure ``audio`` is ``None`` and
            ``status`` carries the message to display.
        """
        # Reject None as well as empty or whitespace-only input.
        if not text or not text.strip():
            return None, "Please enter some text to convert to speech."

        try:
            audio_data, sample_rate = text_to_speech(text, processor, model, vocoder)
            return (sample_rate, audio_data), f"Successfully generated speech for: '{text}'"
        except Exception as e:
            # Report failures in the status box rather than propagating them.
            return None, f"Error: {str(e)}"

    with gr.Blocks(title="Microsoft SpeechT5 Text-to-Speech") as demo:
        gr.Markdown("""
        # 🎤 Microsoft SpeechT5 Text-to-Speech

        Convert your text to natural-sounding speech using the Microsoft SpeechT5 model.
        """)

        with gr.Row():
            # Left column: text entry and the trigger button.
            with gr.Column():
                text_input = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter text you want to convert to speech...",
                    lines=3,
                    max_lines=10,
                )
                generate_btn = gr.Button("Generate Speech", variant="primary")

            # Right column: generated audio and a read-only status line.
            with gr.Column():
                audio_output = gr.Audio(label="Generated Speech", type="numpy")
                status_output = gr.Textbox(label="Status", interactive=False)

        gr.Examples(
            examples=[
                "Hello, welcome to the Microsoft SpeechT5 text-to-speech demo!",
                "The quick brown fox jumps over the lazy dog.",
                "Artificial intelligence is transforming the way we interact with technology.",
                # NOTE(review): this checkpoint may only handle English
                # text -- confirm this example yields usable audio.
                "今天天气真好，适合出去散步。",
            ],
            inputs=text_input,
        )

        # Both the button click and pressing Enter in the textbox run the
        # same handler and update the same outputs.
        generate_btn.click(
            fn=generate_speech,
            inputs=text_input,
            outputs=[audio_output, status_output],
        )
        text_input.submit(
            fn=generate_speech,
            inputs=text_input,
            outputs=[audio_output, status_output],
        )

    return demo
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it with
    # share=False.
    demo = main()
    demo.launch(share=False)