"""Grammar & Speaking Tutor.

A Gradio app that transcribes the user's speech (Whisper), asks a small
instruction-tuned LLM to correct grammar and reply (Qwen2.5 via transformers),
and speaks the answer back (gTTS).
"""

import os
import tempfile
import time

import gradio as gr
import torch
import whisper
from gtts import gTTS
from transformers import pipeline

# --- 1. CONFIGURATION ---
# Using the Ultra-Fast 0.5B model (switch to "1.5B" if you want it smarter)
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

print(f"⏳ Loading {MODEL_ID}...")
pipe = pipeline(
    "text-generation",
    model=MODEL_ID,
    model_kwargs={"low_cpu_mem_usage": True},
    device_map="cpu",
)
whisper_model = whisper.load_model("tiny.en")
print("✅ Systems Ready.")

# --- 2. TEACHER PERSONA (UPDATED FOR GRAMMAR) ---
SYSTEM_PROMPT = """
You are a helpful English teacher.
1. If the user makes a grammar mistake, ALWAYS correct it first. Start with "Correction: [Correct Sentence]".
2. Then, answer the user's question or continue the chat.
3. Keep your own English simple (A2 level).
4. Keep responses short (max 2 sentences).
"""


# --- 3. PROCESSING FUNCTIONS ---
def text_to_speech(text):
    """Render *text* to an MP3 file and return its path, or None on failure.

    Best-effort: any synthesis/network error (gTTS needs internet) returns
    None instead of raising, so the chat still works without audio.
    """
    try:
        if not text:
            return None
        # Strip markdown markers so the voice doesn't read them aloud.
        text_clean = text.replace("*", "").replace("#", "")
        tts = gTTS(text_clean, lang='en')
        # Unique temp file per response: avoids same-second name collisions
        # and keeps MP3s out of the working directory.
        # NOTE(review): files are never deleted — consider periodic cleanup.
        fd, filename = tempfile.mkstemp(suffix=".mp3", prefix="response_")
        os.close(fd)
        tts.save(filename)
        return filename
    except Exception:
        return None


def generate_response(message, history):
    """Ask the LLM for a reply to *message*, given recent chat *history*.

    *history* is a list of (user_msg, bot_msg) tuples; only the last two
    turns are sent to keep generation fast on CPU. Returns the assistant's
    text.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    # Keep history short for speed
    for user_msg, bot_msg in history[-2:]:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    outputs = pipe(
        messages,
        max_new_tokens=60,   # Slightly increased to allow room for corrections
        do_sample=True,
        temperature=0.6,     # Lower temperature = more accurate grammar
    )
    return outputs[0]["generated_text"][-1]["content"]


def conversation_logic(audio_path, text_input, history):
    """Handle one chat turn: transcribe (or take typed text), reply, speak.

    Returns (updated history, path to reply audio or None, "" to clear the
    textbox). Empty/whitespace-only input leaves everything unchanged.
    """
    user_text = ""
    # 1. Transcribe
    if audio_path:
        result = whisper_model.transcribe(audio_path)
        user_text = result["text"]
    elif text_input:
        user_text = text_input
    else:
        return history, None, ""

    if not user_text.strip():
        return history, None, ""

    # 2. Think
    ai_response = generate_response(user_text, history)
    # 3. Speak
    ai_audio = text_to_speech(ai_response)
    # 4. Update UI
    history.append((user_text, ai_response))
    return history, ai_audio, ""


# --- 4. UI SETUP ---
with gr.Blocks(title="Grammar Tutor") as demo:
    gr.Markdown("# ⚡ Grammar & Speaking Tutor")
    gr.Markdown("I will correct your mistakes and chat with you.")

    chatbot = gr.Chatbot(label="Conversation")
    with gr.Row():
        audio_in = gr.Audio(sources=["microphone"], type="filepath", label="🎤 Speak")
        text_in = gr.Textbox(label="⌨️ Type")
    submit_btn = gr.Button("Send", variant="primary")
    audio_out = gr.Audio(label="Teacher's Voice", autoplay=True)

    submit_btn.click(
        fn=conversation_logic,
        inputs=[audio_in, text_in, chatbot],
        outputs=[chatbot, audio_out, text_in],
    )

if __name__ == "__main__":
    demo.launch()