import gradio as gr
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import numpy as np
# Load Microsoft SpeechT5 model
def load_model():
    """Load the SpeechT5 processor, text-to-speech model and HiFi-GAN vocoder.

    Returns:
        A ``(processor, model, vocoder)`` tuple, each obtained through the
        corresponding class's ``from_pretrained``.
    """
    # The processor and the acoustic model come from the same checkpoint.
    tts_checkpoint = "microsoft/speecht5_tts"
    return (
        SpeechT5Processor.from_pretrained(tts_checkpoint),
        SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint),
        SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan"),
    )
# Text-to-speech function
def text_to_speech(text, processor, model, vocoder):
    """Convert text to speech using the SpeechT5 model.

    Args:
        text: The string to synthesize.
        processor: Tokenizer-like callable, invoked as
            ``processor(text=text, return_tensors="pt")``; its result must
            provide ``"input_ids"``.
        model: Object exposing
            ``generate_speech(input_ids, speaker_embeddings=..., vocoder=...)``.
        vocoder: Vocoder forwarded unchanged to ``model.generate_speech``.

    Returns:
        A ``(speech, sample_rate)`` tuple: the waveform as a NumPy array,
        peak-normalized to 0.8 (an entirely silent waveform is returned
        as-is), and the integer ``16000``.

    Raises:
        gr.Error: If any step fails, including the model producing no audio
            samples. The underlying exception is chained as the cause.
    """
    try:
        # Process the input text
        inputs = processor(text=text, return_tensors="pt")

        # Fallback speaker embedding: an all-zero vector, used because this
        # file loads no real speaker embeddings.
        # NOTE(review): a zero embedding may degrade voice quality -- confirm.
        speaker_embeddings = torch.zeros((1, 512))

        # Inference only, so gradient tracking is disabled.
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embeddings,
                vocoder=vocoder,
            )

        speech = speech.cpu().numpy().squeeze()
        if speech.size == 0:
            raise ValueError("the model produced no audio samples")

        # Scale the peak to 0.8 to prevent clipping. A silent waveform has a
        # peak of 0; dividing by it would turn every sample into NaN.
        peak = np.max(np.abs(speech))
        if peak > 0:
            speech = speech / peak * 0.8

        return speech, 16000  # audio data and sample rate
    except Exception as e:
        raise gr.Error(f"Error generating speech: {str(e)}") from e
# Main function
def main():
    """Build the Gradio demo for SpeechT5 text-to-speech.

    Loads the processor, model and vocoder once, then assembles a Blocks UI
    with a text box, a button, an audio player, a status box and a set of
    example inputs.

    Returns:
        The constructed ``gr.Blocks`` app; launching it is left to the caller.
    """
    # Load model once at startup
    print("Loading Microsoft SpeechT5 model...")
    processor, model, vocoder = load_model()
    print("Model loaded successfully!")

    def generate_speech(text):
        """Synthesize ``text`` for the UI.

        Returns:
            An ``(audio, status)`` pair. ``audio`` is
            ``(sample_rate, audio_data)`` on success and ``None`` when the
            input is blank or synthesis fails; ``status`` is the message
            shown in the status box.
        """
        # Guard against None as well as blank input so .strip() cannot raise.
        if not text or not text.strip():
            return None, "Please enter some text to convert to speech."
        try:
            audio_data, sample_rate = text_to_speech(text, processor, model, vocoder)
        except Exception as e:
            # Report failures in the status box instead of propagating them.
            return None, f"Error: {str(e)}"
        return (sample_rate, audio_data), f"Successfully generated speech for: '{text}'"

    # Create Gradio interface
    with gr.Blocks(title="Microsoft SpeechT5 Text-to-Speech") as demo:
        # Built from explicit pieces so the code's indentation never leaks
        # into the Markdown text.
        gr.Markdown(
            "\n"
            "# 🎤 Microsoft SpeechT5 Text-to-Speech\n"
            "Convert your text to natural-sounding speech using the Microsoft SpeechT5 model.\n"
        )
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter text you want to convert to speech...",
                    lines=3,
                    max_lines=10,
                )
                generate_btn = gr.Button("Generate Speech", variant="primary")
            with gr.Column():
                audio_output = gr.Audio(label="Generated Speech", type="numpy")
                status_output = gr.Textbox(label="Status", interactive=False)

        # Examples
        # NOTE(review): the last example is not English; confirm that the
        # loaded checkpoint can synthesize it intelligibly.
        gr.Examples(
            examples=[
                "Hello, welcome to the Microsoft SpeechT5 text-to-speech demo!",
                "The quick brown fox jumps over the lazy dog.",
                "Artificial intelligence is transforming the way we interact with technology.",
                "今天天气真好，适合出去散步。",
            ],
            inputs=text_input,
        )

        # Event handling: the button click and pressing Enter in the text box
        # both run the same callback.
        generate_btn.click(
            fn=generate_speech,
            inputs=text_input,
            outputs=[audio_output, status_output],
        )
        text_input.submit(
            fn=generate_speech,
            inputs=text_input,
            outputs=[audio_output, status_output],
        )
    return demo
if __name__ == "__main__":
    # Script entry point: build the UI, then start serving it.
    demo = main()
    demo.launch(share=False)