import gradio as gr
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech
import torch
import numpy as np

# Hugging Face model identifiers, one per stage of the assistant.
STT_MODEL_ID = "openai/whisper-base"
LLM_MODEL_ID = "distilgpt2"
TTS_MODEL_ID = "microsoft/speecht5_tts"

# Stage 1 - speech to text: recognition pipeline used to transcribe the user.
stt = pipeline("automatic-speech-recognition", model=STT_MODEL_ID)

# Stage 2 - text to text: generation pipeline that continues the transcript.
llm = pipeline("text-generation", model=LLM_MODEL_ID)

# Stage 3 - text to speech: SpeechT5 processor and model, loaded from the
# same checkpoint.
processor = SpeechT5Processor.from_pretrained(TTS_MODEL_ID)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(TTS_MODEL_ID)

# Placeholder speaker embedding: a (1, 512) tensor of standard-normal noise,
# drawn once at start-up.
# NOTE(review): SpeechT5 is normally conditioned on an x-vector extracted
# from a real speaker; a random vector presumably yields an unnatural voice
# that also differs between restarts - consider loading a real embedding.
speaker_embeddings = torch.randn(1, 512)


# Sampling rate (Hz) the recording is resampled to before transcription, and
# the rate reported for the synthesized reply.
_MODEL_SAMPLE_RATE = 16000

# Lazily created HiFi-GAN vocoder; populated by _get_vocoder() on first use.
_vocoder = None


def _get_vocoder():
    """Return the SpeechT5 HiFi-GAN vocoder, loading it on first use."""
    global _vocoder
    if _vocoder is None:
        # Imported locally so this block brings its own dependency into scope.
        from transformers import SpeechT5HifiGan

        _vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return _vocoder


def _prepare_waveform(sr, samples):
    """Return *samples* as mono float32 audio, peak-normalized, at 16 kHz."""
    wave = np.asarray(samples, dtype=np.float32)

    # Multi-channel recordings are assumed to be laid out as
    # (samples, channels); average the channels down to mono.
    if wave.ndim > 1:
        wave = wave.mean(axis=1)

    if wave.size == 0:
        return wave

    # Scale so the loudest sample is +/-1; pure silence is left untouched.
    peak = np.max(np.abs(wave))
    if peak > 0:
        wave = wave / peak

    # Microphones usually record at 44.1/48 kHz. Resample by linear
    # interpolation so the recognizer is not fed audio at the wrong speed.
    if sr != _MODEL_SAMPLE_RATE:
        target_len = max(1, int(round(wave.size * _MODEL_SAMPLE_RATE / sr)))
        src_times = np.arange(wave.size) / sr
        dst_times = np.arange(target_len) / _MODEL_SAMPLE_RATE
        wave = np.interp(dst_times, src_times, wave).astype(np.float32)

    return wave


def voice_assistant(audio):
    """Transcribe a recording, generate a text reply and speak it back.

    Args:
        audio: ``(sample_rate, samples)`` tuple, or ``None`` when nothing was
            recorded. ``samples`` may be mono ``(n,)`` or multi-channel
            ``(n, channels)``.

    Returns:
        A ``(recognized_text, reply_text, reply_audio)`` tuple. ``reply_audio``
        is ``(16000, int16_samples)``, or ``None`` when there was no usable
        input audio.
    """
    if audio is None:
        return "No audio", "No audio", None

    sr, y = audio
    waveform = _prepare_waveform(sr, y)

    # An empty recording has nothing to transcribe (and would otherwise make
    # the peak computation fail), so treat it like a missing recording.
    if waveform.size == 0:
        return "No audio", "No audio", None

    # Speech -> text. The sampling rate is passed explicitly so the pipeline
    # does not have to guess it.
    speech_text = stt(
        {"raw": waveform, "sampling_rate": _MODEL_SAMPLE_RATE}
    )["text"]

    # Text -> reply. return_full_text=False keeps only the newly generated
    # continuation; otherwise the reply would start by echoing the user.
    response = llm(
        speech_text,
        max_new_tokens=60,
        return_full_text=False,
    )[0]["generated_text"].strip()

    # Reply -> speech. Without a vocoder generate_speech returns a mel
    # spectrogram rather than a waveform, so one must be supplied.
    inputs = processor(text=response, return_tensors="pt")
    speech = tts_model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings,
        vocoder=_get_vocoder(),
    )

    audio_output = speech.cpu().numpy()

    # Keep samples inside [-1, 1] so the int16 conversion cannot wrap around.
    audio_output = np.clip(audio_output, -1.0, 1.0)

    # Scale to 16-bit PCM.
    audio_output = (audio_output * 32767).astype(np.int16)

    return speech_text, response, (_MODEL_SAMPLE_RATE, audio_output)


# Input widget: a microphone recording handed to the callback in "numpy"
# form, which voice_assistant unpacks as a (sample_rate, samples) pair.
mic_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak here")

# Output widgets, listed in the order voice_assistant returns its values:
# transcript, reply text, reply audio.
reply_widgets = [
    gr.Textbox(label="Recognized Speech"),
    gr.Textbox(label="AI Response"),
    gr.Audio(label="Voice Reply"),
]

# Wire the callback to its widgets.
iface = gr.Interface(
    fn=voice_assistant,
    inputs=mic_input,
    outputs=reply_widgets,
    title="Voice AI Assistant",
    description="Speak and the assistant will respond with voice",
)

# Start the web server on all network interfaces, port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)