import gradio as gr
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import numpy as np

# Load Microsoft SpeechT5 model
def load_model():
    """Load the SpeechT5 processor, TTS model and HiFi-GAN vocoder.

    Returns:
        A ``(processor, model, vocoder)`` tuple, each obtained through the
        corresponding class's ``from_pretrained``.
    """
    tts_checkpoint = "microsoft/speecht5_tts"
    vocoder_checkpoint = "microsoft/speecht5_hifigan"

    # The processor and the acoustic model share one checkpoint; the vocoder
    # is loaded from its own. Load order matches the returned tuple order.
    return (
        SpeechT5Processor.from_pretrained(tts_checkpoint),
        SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint),
        SpeechT5HifiGan.from_pretrained(vocoder_checkpoint),
    )

# Text-to-speech function
def text_to_speech(text, processor, model, vocoder):
    """Convert text to a peak-normalized waveform using a SpeechT5 model.

    Args:
        text: The text to synthesize.
        processor: Callable invoked as ``processor(text=..., return_tensors="pt")``;
            the result must expose an ``"input_ids"`` entry.
        model: Object providing
            ``generate_speech(input_ids, speaker_embeddings=..., vocoder=...)``.
        vocoder: Passed through unchanged to ``model.generate_speech``.

    Returns:
        A ``(speech, 16000)`` tuple: ``speech`` is a NumPy array scaled so its
        peak absolute amplitude is 0.8 (a completely silent waveform is
        returned unscaled), and ``16000`` is the sample rate reported to the
        caller.

    Raises:
        gr.Error: If any step fails, including the model producing no
            samples. The original exception is chained as the cause.
    """
    try:
        # Process the input text
        inputs = processor(text=text, return_tensors="pt")

        # Placeholder speaker embedding (all zeros) used because no real
        # speaker embedding is supplied to this function.
        speaker_embeddings = torch.zeros((1, 512))

        # Inference only: no gradients are needed.
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embeddings,
                vocoder=vocoder,
            )

        speech = speech.cpu().numpy().squeeze()

        # An empty result would otherwise fail inside np.max with an opaque
        # "zero-size array" message.
        if speech.size == 0:
            raise ValueError("the model produced no audio samples")

        # Scale the peak to 0.8 to leave headroom against clipping. Skip the
        # division when the waveform is silent: dividing by a zero peak would
        # turn every sample into NaN.
        peak = np.max(np.abs(speech))
        if peak > 0:
            speech = speech / peak * 0.8

        return speech, 16000  # Return audio data and sample rate
    except Exception as e:
        # Chain the cause so the original traceback is kept for debugging.
        raise gr.Error(f"Error generating speech: {str(e)}") from e

# Main function
def main():
    """Build the Gradio Blocks app for the SpeechT5 text-to-speech demo.

    The processor, model and vocoder are loaded once through ``load_model``
    and captured by the event handler, so they are not reloaded per request.

    Returns:
        The constructed ``gr.Blocks`` app. It is not launched here.
    """
    # Load model once at startup
    print("Loading Microsoft SpeechT5 model...")
    processor, model, vocoder = load_model()
    print("Model loaded successfully!")

    def generate_speech(text):
        """Synthesize ``text`` and return ``(audio, status_message)``.

        ``audio`` is a ``(sample_rate, samples)`` tuple on success and
        ``None`` when the input is blank or synthesis fails; failures are
        reported through the status message instead of being raised.
        """
        # `not text` also covers None, which would otherwise crash on .strip().
        if not text or not text.strip():
            return None, "Please enter some text to convert to speech."

        try:
            audio_data, sample_rate = text_to_speech(text, processor, model, vocoder)

            # Return audio file
            return (sample_rate, audio_data), f"Successfully generated speech for: '{text}'"
        except Exception as e:
            # Deliberate best-effort: show the failure in the status box.
            return None, f"Error: {str(e)}"

    # Create Gradio interface
    with gr.Blocks(title="Microsoft SpeechT5 Text-to-Speech") as demo:
        gr.Markdown("""
        # 🎤 Microsoft SpeechT5 Text-to-Speech
        
        Convert your text to natural-sounding speech using the Microsoft SpeechT5 model.
        """)

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter text you want to convert to speech...",
                    lines=3,
                    max_lines=10,
                )
                generate_btn = gr.Button("Generate Speech", variant="primary")

            with gr.Column():
                audio_output = gr.Audio(label="Generated Speech", type="numpy")
                status_output = gr.Textbox(label="Status", interactive=False)

        # Examples. All are English: the microsoft/speecht5_tts checkpoint
        # loaded by this app is an English model, so text in other languages
        # cannot be synthesized meaningfully.
        gr.Examples(
            examples=[
                "Hello, welcome to the Microsoft SpeechT5 text-to-speech demo!",
                "The quick brown fox jumps over the lazy dog.",
                "Artificial intelligence is transforming the way we interact with technology.",
                "The weather is lovely today, perfect for a walk outside.",
            ],
            inputs=text_input,
        )

        # Event handling: the button click and pressing Enter in the textbox
        # trigger the same handler with the same inputs and outputs.
        for register in (generate_btn.click, text_input.submit):
            register(
                fn=generate_speech,
                inputs=text_input,
                outputs=[audio_output, status_output],
            )

    return demo

if __name__ == "__main__":
    # Script entry point: build the app (which loads the model inside main())
    # and start it with share=False.
    demo = main()
    demo.launch(share=False)