# VoiceDemo / app.py
# jerrybwang
# 11
# ba70a88
import gradio as gr
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import numpy as np
# Load Microsoft SpeechT5 model
def load_model():
    """Instantiate the pretrained SpeechT5 components needed for synthesis.

    Returns:
        A ``(processor, model, vocoder)`` tuple: the processor and the
        text-to-speech model both come from ``microsoft/speecht5_tts``, and
        the HiFi-GAN vocoder from ``microsoft/speecht5_hifigan``.
    """
    tts_checkpoint = "microsoft/speecht5_tts"
    vocoder_checkpoint = "microsoft/speecht5_hifigan"

    # Tuple elements are evaluated left to right, so the components are
    # loaded in processor -> model -> vocoder order.
    return (
        SpeechT5Processor.from_pretrained(tts_checkpoint),
        SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint),
        SpeechT5HifiGan.from_pretrained(vocoder_checkpoint),
    )
# Text-to-speech function
def text_to_speech(text, processor, model, vocoder, speaker_embeddings=None):
    """Synthesize speech for *text* and return peak-normalized samples.

    Args:
        text: The text to convert to speech.
        processor: Callable invoked as ``processor(text=..., return_tensors="pt")``;
            its result must provide an ``"input_ids"`` entry.
        model: Object exposing ``generate_speech(input_ids,
            speaker_embeddings=..., vocoder=...)`` that returns a tensor.
        vocoder: Vocoder passed straight through to ``model.generate_speech``.
        speaker_embeddings: Optional speaker-embedding tensor to condition the
            voice. When omitted, an all-zero ``(1, 512)`` tensor is used as a
            fallback.

    Returns:
        A ``(samples, sample_rate)`` tuple. ``samples`` is a NumPy array
        scaled so that its largest absolute value is 0.8 (left untouched when
        the waveform is entirely silent); ``sample_rate`` is always 16000.

    Raises:
        gr.Error: If any step of the synthesis fails; the original exception
            is attached as the cause.
    """
    try:
        inputs = processor(text=text, return_tensors="pt")

        if speaker_embeddings is None:
            # Fallback used when no specific speaker embedding is supplied.
            speaker_embeddings = torch.zeros((1, 512))

        # Inference only: no gradients are needed.
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embeddings,
                vocoder=vocoder,
            )

        speech = speech.cpu().numpy().squeeze()

        # Normalize to 0.8 of full scale to leave headroom against clipping.
        # A silent waveform has a peak of 0; dividing by it would fill the
        # output with NaNs, so scaling is skipped in that case.
        peak = np.max(np.abs(speech))
        if peak > 0:
            speech = speech / peak * 0.8

        return speech, 16000
    except Exception as e:
        raise gr.Error(f"Error generating speech: {str(e)}") from e
# Main function
def main():
    """Load the SpeechT5 components and build the Gradio demo UI.

    The model is loaded once, up front, and captured by the request handler
    so that individual requests do not reload it. Progress is reported with
    ``print``.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    # Load model once at startup
    print("Loading Microsoft SpeechT5 model...")
    processor, model, vocoder = load_model()
    print("Model loaded successfully!")

    def generate_speech(text):
        """Handle one UI request.

        Returns an ``(audio, status)`` pair: ``audio`` is ``(sample_rate,
        samples)`` on success and ``None`` otherwise; ``status`` is the
        message shown in the status box.
        """
        # Guard against a missing value as well as blank/whitespace-only text.
        if not text or not text.strip():
            return None, "Please enter some text to convert to speech."
        try:
            audio_data, sample_rate = text_to_speech(text, processor, model, vocoder)
        except Exception as e:
            # Report failures in the status box instead of raising to the UI.
            return None, f"Error: {str(e)}"
        return (sample_rate, audio_data), f"Successfully generated speech for: '{text}'"

    # Create Gradio interface
    with gr.Blocks(title="Microsoft SpeechT5 Text-to-Speech") as demo:
        gr.Markdown(
            "\n"
            "# 🎤 Microsoft SpeechT5 Text-to-Speech\n"
            "Convert your text to natural-sounding speech using the Microsoft SpeechT5 model.\n"
        )
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter text you want to convert to speech...",
                    lines=3,
                    max_lines=10,
                )
                generate_btn = gr.Button("Generate Speech", variant="primary")
            with gr.Column():
                audio_output = gr.Audio(label="Generated Speech", type="numpy")
                status_output = gr.Textbox(label="Status", interactive=False)

        # Clickable sample prompts that fill the input box.
        gr.Examples(
            examples=[
                "Hello, welcome to the Microsoft SpeechT5 text-to-speech demo!",
                "The quick brown fox jumps over the lazy dog.",
                "Artificial intelligence is transforming the way we interact with technology.",
                "今天天气真好,适合出去散步。",
            ],
            inputs=text_input,
        )

        # Both clicking the button and pressing Enter in the textbox run the
        # same handler with the same inputs and outputs.
        for register in (generate_btn.click, text_input.submit):
            register(
                fn=generate_speech,
                inputs=text_input,
                outputs=[audio_output, status_output],
            )

    return demo
if __name__ == "__main__":
    # Script entry point: build the app (which loads the model first), then
    # start the Gradio server with share=False.
    # The module-level name `demo` is kept as-is in case external tooling
    # looks the app up by that name.
    demo = main()
    demo.launch(share=False)