# app.py
import os
import numpy as np
import gradio as gr
from transformers import pipeline
from langdetect import detect, LangDetectException
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
import soundfile as sf
from datasets import load_dataset

# Initialize models only once
print("Loading ASR model...")
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    chunk_length_s=30
)

print("Loading grammar correction model...")
grammar_pipe = pipeline(
    "text2text-generation",
    model="pszemraj/flan-t5-large-grammar-synthesis"
)

print("Loading chat model...")
chat_pipe = pipeline(
    "text-generation",
    model="microsoft/DialoGPT-medium"
)

print("Loading TTS components...")
# Initialize TTS components
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

print("Loading speaker embeddings...")
# Load speaker embeddings for male/female voices
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = {
    "male": torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0),
    "female": torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
}

print("All models loaded successfully!")

#####################################################################
###def process_audio(audio_path, voice_choice, conversation_history):
###    """Process audio input and generate response"""
###    # Transcribe audio
###    try:
###        result = asr_pipe(audio_path)
###        user_input = result["text"]
###    except Exception as e:
###        print(f"ASR error: {e}")
###        return None, "Could not process audio. Please try again.", conversation_history
###
###    # Check if input is English
###    try:
###        if detect(user_input) != "en":
###            return user_input, "You must try to speak in English for me to respond", conversation_history
###    except LangDetectException:
###        return user_input, "Could not detect language. Please speak clearly.", conversation_history
###
###    # Grammar correction
###    corrected_input = grammar_pipe(user_input, max_length=256)[0]["generated_text"]
###
###    # Update conversation history
###    conversation_history.append(f"{corrected_input}")
###
###    # Generate conversational response
###    chat_input = "\n".join(conversation_history[-4:])  # Keep last 4 exchanges
###    response = chat_pipe(chat_input, max_length=256, pad_token_id=chat_pipe.tokenizer.eos_token_id)
###    response_text = response[0]["generated_text"].split("Teacher:")[-1].strip()
###
###    # Update conversation history
###    conversation_history.append(f"Teacher: {response_text}")
###
###    # Generate speech
###    inputs = tts_processor(text=response_text, return_tensors="pt")
###    speech = tts_model.generate_speech(
###        inputs["input_ids"],
###        speaker_embeddings[voice_choice],
###        vocoder=tts_vocoder
###    )
###
###    # Save audio output
###    output_audio = "response.wav"
###    sf.write(output_audio, speech.numpy(), samplerate=16000)
###
###    return user_input, response_text, output_audio, conversation_history
###########################################################################
def process_audio(audio_path, voice_choice, conversation_history):
    """Process audio input and generate response"""
    # Transcribe audio
    try:
        result = asr_pipe(audio_path)
        user_input = result["text"]
    except Exception as e:
        print(f"ASR error: {e}")
        # Return 4 values, including placeholders for the missing outputs
        return None, "Could not process audio. Please try again.", None, conversation_history

    # Check if input is English
    try:
        if detect(user_input) != "en":
            # Return 4 values
            return user_input, "You must try to speak in English for me to respond", None, conversation_history
    except LangDetectException:
        # Return 4 values
        return user_input, "Could not detect language. Please speak clearly.", None, conversation_history

    # Grammar correction
    corrected_input = grammar_pipe(user_input, max_length=256)[0]["generated_text"]

    # Update conversation history
    conversation_history.append(f"{corrected_input}")

    # Generate conversational response
    chat_input = "\n".join(conversation_history[-4:])  # Keep last 4 exchanges
    response = chat_pipe(chat_input, max_length=256, pad_token_id=chat_pipe.tokenizer.eos_token_id)
    response_text = response[0]["generated_text"].split("Teacher:")[-1].strip()

    # Update conversation history
    conversation_history.append(f"Teacher: {response_text}")

    # Generate speech
    inputs = tts_processor(text=response_text, return_tensors="pt")
    speech = tts_model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings[voice_choice],
        vocoder=tts_vocoder
    )

    # Save audio output
    output_audio = "response.wav"
    sf.write(output_audio, speech.numpy(), samplerate=16000)

    # Return 4 values
    return user_input, response_text, output_audio, conversation_history

########################################################################
# Gradio interface
with gr.Blocks(title="Audio English Teacher") as demo:
    gr.Markdown("# 🎓 Audio English Teacher")
    gr.Markdown("Practice English conversation with AI correction and feedback!")

    with gr.Row():
        voice_choice = gr.Radio(
            ["male", "female"],
            label="Select Voice",
            value="female"
        )

    audio_input = gr.Audio(
        sources=["microphone"],
        type="filepath",
        label="Speak in English"
    )

    history_state = gr.State([])

    with gr.Column():
        original_text = gr.Textbox(label="What you said")
        corrected_output = gr.Textbox(label="Corrected English")
        audio_output = gr.Audio(label="Teacher's Response", autoplay=True)

    audio_input.stop_recording(
        fn=process_audio,
        inputs=[audio_input, voice_choice, history_state],
        outputs=[original_text, corrected_output, audio_output, history_state]
    )

    gr.Examples(
        examples=[
            ["I goes to school yesterday", "male"],
            ["She don't like apples", "female"],
            ["We was happy for the results", "male"]
        ],
        inputs=[original_text, voice_choice], # Changed inputs to match the function
        outputs=[original_text, corrected_output, audio_output, history_state],
        fn=process_audio
    )


if __name__ == "__main__":
    demo.launch() # No need for share=True on Hugging Face Spaces