File size: 4,121 Bytes
289115a
 
ea52dd2
289115a
 
ea52dd2
289115a
 
ea52dd2
 
 
 
289115a
 
ea52dd2
 
289115a
 
 
 
ba70a88
 
 
ea52dd2
c1ad000
289115a
c1ad000
 
 
 
 
 
289115a
 
 
 
 
ea52dd2
289115a
 
 
 
 
 
ea52dd2
 
289115a
 
 
 
 
 
 
 
ea52dd2
289115a
 
 
 
 
 
 
ea52dd2
289115a
ea52dd2
289115a
ea52dd2
289115a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea52dd2
289115a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import gradio as gr
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import numpy as np

# Load Microsoft SpeechT5 model
def load_model():
    """Load the pretrained SpeechT5 text-to-speech components.

    Returns:
        A ``(processor, model, vocoder)`` tuple, each created with
        ``from_pretrained`` from the Microsoft SpeechT5 checkpoints.
    """
    tts_checkpoint = "microsoft/speecht5_tts"
    vocoder_checkpoint = "microsoft/speecht5_hifigan"

    # Tuple elements are evaluated left to right, so the load order is
    # processor, then acoustic model, then vocoder.
    return (
        SpeechT5Processor.from_pretrained(tts_checkpoint),
        SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint),
        SpeechT5HifiGan.from_pretrained(vocoder_checkpoint),
    )

# Text-to-speech function
def text_to_speech(text, processor, model, vocoder):
    """Convert text to a peak-normalized waveform using a SpeechT5 model.

    Args:
        text: The string to synthesize.
        processor: Callable invoked as ``processor(text=..., return_tensors="pt")``;
            its result must provide an ``"input_ids"`` entry.
        model: Object exposing
            ``generate_speech(input_ids, speaker_embeddings=..., vocoder=...)``
            and returning a tensor.
        vocoder: Passed through unchanged to ``model.generate_speech``.

    Returns:
        A ``(speech, sample_rate)`` tuple. ``speech`` is a squeezed NumPy
        array scaled so its peak absolute amplitude is 0.8 (left untouched
        when the waveform is entirely silent); ``sample_rate`` is 16000.

    Raises:
        gr.Error: If any step fails. The original exception is chained as
            the cause so the traceback is not lost.
    """
    try:
        # Tokenize the input text into model-ready tensors.
        inputs = processor(text=text, return_tensors="pt")

        # Fallback speaker embedding: an all-zero (1, 512) vector, used
        # because no specific speaker embedding is loaded by this app.
        speaker_embeddings = torch.zeros((1, 512))

        # Inference only; no gradients are needed.
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embeddings,
                vocoder=vocoder,
            )

        speech = speech.cpu().numpy().squeeze()

        # Scale to a 0.8 peak to prevent clipping. A silent waveform has a
        # peak of 0; dividing by it would fill the output with NaN, so
        # normalization is skipped in that case.
        peak = np.max(np.abs(speech))
        if peak > 0:
            speech = speech / peak * 0.8

        return speech, 16000  # audio data and sample rate
    except Exception as e:
        raise gr.Error(f"Error generating speech: {str(e)}") from e

# Main function
def main():
    """Build the Gradio text-to-speech demo.

    Loads the SpeechT5 components once, defines the request handler that
    closes over them, and assembles the Blocks UI.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    # Load model once at startup so every request reuses the same objects.
    print("Loading Microsoft SpeechT5 model...")
    processor, model, vocoder = load_model()
    print("Model loaded successfully!")

    def generate_speech(text):
        """Synthesize ``text`` and return ``(audio, status_message)``.

        ``audio`` is ``(sample_rate, samples)`` on success and ``None``
        when the input is empty or synthesis fails.
        """
        # Guard against a missing value as well as blank/whitespace input.
        if text is None or not text.strip():
            return None, "Please enter some text to convert to speech."

        try:
            audio_data, sample_rate = text_to_speech(text, processor, model, vocoder)
        except Exception as e:
            # Report failures in the status box instead of raising.
            return None, f"Error: {str(e)}"

        return (sample_rate, audio_data), f"Successfully generated speech for: '{text}'"

    # Create Gradio interface
    with gr.Blocks(title="Microsoft SpeechT5 Text-to-Speech") as demo:
        gr.Markdown("""
        # 🎤 Microsoft SpeechT5 Text-to-Speech
        
        Convert your text to natural-sounding speech using the Microsoft SpeechT5 model.
        """)

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter text you want to convert to speech...",
                    lines=3,
                    max_lines=10,
                )
                generate_btn = gr.Button("Generate Speech", variant="primary")

            with gr.Column():
                audio_output = gr.Audio(label="Generated Speech", type="numpy")
                status_output = gr.Textbox(label="Status", interactive=False)

        # Examples
        # NOTE(review): the last example is Chinese; the speecht5_tts
        # checkpoint is presumably English-only — confirm it is intended.
        gr.Examples(
            examples=[
                "Hello, welcome to the Microsoft SpeechT5 text-to-speech demo!",
                "The quick brown fox jumps over the lazy dog.",
                "Artificial intelligence is transforming the way we interact with technology.",
                "今天天气真好,适合出去散步。",
            ],
            inputs=text_input,
        )

        # Event handling: the button click and pressing Enter in the
        # textbox both run the same handler with the same wiring.
        for register in (generate_btn.click, text_input.submit):
            register(
                fn=generate_speech,
                inputs=text_input,
                outputs=[audio_output, status_output],
            )

    return demo

# Script entry point: build the app and start the Gradio server.
if __name__ == "__main__":
    # Keep the module-level name `demo` bound to the Blocks app.
    demo = main()
    # share=False: do not request a public share link.
    demo.launch(share=False)