import gradio as gr
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import numpy as np

# Sample rate (Hz) reported to Gradio alongside the generated waveform.
SAMPLE_RATE = 16000
# Width of the speaker-embedding vector passed to the model.
SPEAKER_EMBEDDING_DIM = 512
# Peak amplitude after normalization; kept below 1.0 to prevent clipping.
TARGET_PEAK = 0.8


def load_model():
    """Load the text-to-speech processor, acoustic model and vocoder.

    Returns:
        A ``(processor, model, vocoder)`` tuple loaded from the
        ``microsoft/speecht5_tts`` and ``microsoft/speecht5_hifigan``
        pretrained checkpoints.
    """
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return processor, model, vocoder


def _normalize_audio(speech, target_peak=TARGET_PEAK):
    """Scale *speech* so its largest absolute sample equals *target_peak*.

    Empty or all-zero (silent) input is returned unchanged: dividing by a
    zero peak would fill the waveform with NaN, and ``np.max`` raises on an
    empty array.
    """
    if speech.size == 0:
        return speech
    peak = np.max(np.abs(speech))
    if peak == 0:
        return speech
    return speech / peak * target_peak


def text_to_speech(text, processor, model, vocoder):
    """Convert text to speech using the SpeechT5 model.

    Args:
        text: The text to synthesize.
        processor: Processor used to tokenize ``text``.
        model: Model exposing ``generate_speech``.
        vocoder: Vocoder handed to ``generate_speech`` to produce a waveform.

    Returns:
        A ``(waveform, sample_rate)`` tuple, where ``waveform`` is a
        peak-normalized NumPy array and ``sample_rate`` is ``SAMPLE_RATE``.

    Raises:
        gr.Error: If any step of the synthesis fails; the original exception
            is attached as the cause.
    """
    try:
        inputs = processor(text=text, return_tensors="pt")

        # Fallback speaker embedding (all zeros) used because no specific
        # speaker embedding is available here.
        speaker_embeddings = torch.zeros((1, SPEAKER_EMBEDDING_DIM))

        # Inference only: no gradients needed.
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embeddings,
                vocoder=vocoder,
            )

        speech = speech.cpu().numpy().squeeze()
        speech = _normalize_audio(speech)

        return speech, SAMPLE_RATE
    except Exception as e:
        raise gr.Error(f"Error generating speech: {str(e)}") from e


def main():
    """Load the model once and build the Gradio text-to-speech interface.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    # Load model once at startup so every request reuses the same weights.
    print("Loading Microsoft SpeechT5 model...")
    processor, model, vocoder = load_model()
    print("Model loaded successfully!")

    def generate_speech(text):
        """Generate speech from text.

        Returns an ``(audio, status)`` pair for the Gradio outputs, where
        ``audio`` is ``(sample_rate, waveform)`` or ``None`` on failure.
        """
        # Guard against both a missing value and whitespace-only input.
        if not text or not text.strip():
            return None, "Please enter some text to convert to speech."

        try:
            audio_data, sample_rate = text_to_speech(text, processor, model, vocoder)
        except Exception as e:
            # Report failures in the status box instead of crashing the UI.
            return None, f"Error: {str(e)}"

        return (sample_rate, audio_data), f"Successfully generated speech for: '{text}'"

    with gr.Blocks(title="Microsoft SpeechT5 Text-to-Speech") as demo:
        gr.Markdown("""
        # 🎤 Microsoft SpeechT5 Text-to-Speech

        Convert your text to natural-sounding speech using the Microsoft SpeechT5 model.
        """)

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter text you want to convert to speech...",
                    lines=3,
                    max_lines=10,
                )
                generate_btn = gr.Button("Generate Speech", variant="primary")

            with gr.Column():
                audio_output = gr.Audio(label="Generated Speech", type="numpy")
                status_output = gr.Textbox(label="Status", interactive=False)

        # NOTE(review): the last example is Chinese; confirm that the
        # microsoft/speecht5_tts checkpoint handles non-English text.
        gr.Examples(
            examples=[
                "Hello, welcome to the Microsoft SpeechT5 text-to-speech demo!",
                "The quick brown fox jumps over the lazy dog.",
                "Artificial intelligence is transforming the way we interact with technology.",
                "今天天气真好,适合出去散步。",
            ],
            inputs=text_input,
        )

        # The button click and pressing Enter in the textbox share one handler.
        for trigger in (generate_btn.click, text_input.submit):
            trigger(
                fn=generate_speech,
                inputs=text_input,
                outputs=[audio_output, status_output],
            )

    return demo


if __name__ == "__main__":
    demo = main()
    demo.launch(share=False)