"""Tamil CBT therapy chatbot: a fine-tuned Qwen-2.5-3B served on CPU behind a Gradio chat UI."""
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Merged fine-tune weights; tokenizer comes from the base Qwen repo.
MODEL_REPO = "E-motionAssistant/qwen-2.5-3b-tamil-therapy-merged"
TOKENIZER_REPO = "Qwen/Qwen2.5-3B-Instruct"
SYSTEM_PROMPT = "You are an empathetic Tamil therapist providing CBT-based support."

# Module-level singletons, populated once by load_model().
model = None
tokenizer = None


def load_model():
    """Load tokenizer and model into the module-level globals (idempotent).

    Downloads from the Hugging Face Hub on first call; subsequent calls are
    no-ops because ``model`` is checked before loading.
    """
    global model, tokenizer
    if model is None:
        print(f"📥 Loading tokenizer from {TOKENIZER_REPO}...")
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO, trust_remote_code=True)
        if tokenizer.pad_token is None:
            # Qwen tokenizers may ship without a pad token; reuse EOS so
            # generate() can pad batches without warnings.
            tokenizer.pad_token = tokenizer.eos_token
        # Fix: was an f-string with no placeholders (lint F541).
        print("📥 Loading model weights (Full Precision for CPU)...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_REPO,
            torch_dtype=torch.float32,  # CPU deployment: fp32 avoids unsupported half-precision ops
            device_map="cpu",
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
        # Fix: inference-only service — disable dropout / training-mode layers.
        model.eval()
        print("✅ Success: Tamil Therapy System is online on CPU!")


load_model()


def chat(message, history):
    """Generate one assistant reply for the Gradio ChatInterface.

    Args:
        message: The user's latest utterance (may be None/empty on UI events).
        history: List of (user_msg, bot_msg) tuples from previous turns
                 (Gradio tuple format — assumed from the unpacking below;
                 an unfinished bot turn may be None).

    Returns:
        The model's reply as a plain string, or a Tamil error message on failure.
    """
    # Fix: message can arrive as None from the UI; the original only guarded
    # empty strings and would raise AttributeError on None.strip().
    if not message or not message.strip():
        return ""
    try:
        # Build the prompt by hand in Qwen's ChatML format.
        prompt = f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        # Include only the last 3 exchanges to bound prompt length on CPU.
        for user_msg, bot_msg in history[-3:]:
            # Fix: a pending/unfinished turn has bot_msg=None, which the
            # f-string would otherwise render as the literal text "None".
            if bot_msg is None:
                bot_msg = ""
            prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{bot_msg}<|im_end|>\n"
        prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

        # Tokenize, clamp to the context budget, move to the model's device.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)

        # inference_mode(): like no_grad() but also disables autograd tracking
        # on created tensors — strictly cheaper for pure inference.
        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens (skip the echoed prompt).
        input_len = inputs.input_ids.shape[1]
        response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
        return response.strip()
    except Exception as e:
        # Top-level UI boundary: log and surface a friendly Tamil error
        # instead of crashing the Gradio worker.
        print(f"❌ Generation Error: {e}")
        return f"மன்னிக்கவும், பிழை ஏற்பட்டது: {str(e)}. மீண்டும் முயற்சிக்கவும்."
# --- Gradio UI wiring -------------------------------------------------------
# A single ChatInterface bound to chat(); the soft theme and a 450px chat pane
# keep the layout compact for embedded/Spaces deployment.
demo = gr.ChatInterface(
    fn=chat,
    title="💚 E.motion Tamil Therapy Assistant",
    description="*உங்கள் இரக்கமுள்ள AI துணை - Your compassionate AI companion for mental wellbeing in Tamil*\n\n**Note:** This is an AI assistant, not a replacement for professional therapy.",
    theme=gr.themes.Soft(),
    chatbot=gr.Chatbot(height=450),
)

if __name__ == "__main__":
    # Bind on all interfaces so the container/host can reach the server;
    # no public share link, and the auto-generated API docs stay hidden.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_api": False,
    }
    demo.launch(**launch_options)