"""Gradio chat app serving a fine-tuned Llama-3.2-3B therapy model on CPU.

On import this script (1) pins compatible library versions via pip,
(2) downloads and loads the model/tokenizer, and (3) defines a
`gr.ChatInterface` that is launched under the `__main__` guard.
"""

import subprocess
import sys

# --- THE STABILIZER BLOCK ---
# Pin mutually-compatible versions before importing them below;
# huggingface-hub is pinned to avoid the HfFolder ImportError.
print("🛠️ Stabilizing environment and fixing Gradio compatibility...")
subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "tokenizers==0.20.1",
    "transformers==4.45.2",
    "huggingface-hub==0.24.7",  # THE FIX: Pinning this prevents the HfFolder ImportError
    "gradio==4.44.1",
])

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Merged fine-tuned weights; tokenizer comes from the base instruct repo.
MODEL_REPO = "E-motionAssistant/llama-3.2-3b-english-therapy-merged"
TOKENIZER_REPO = "unsloth/Llama-3.2-3B-Instruct"
SYSTEM_PROMPT = "You are an empathetic therapist. Provide supportive, caring responses."

# Populated once by load_model(); chat() reads these globals.
model = None
tokenizer = None


def load_model():
    """Load tokenizer and model into module globals (idempotent).

    Loads in float32 on CPU: CPU kernels don't support 'Half' (float16).
    """
    global model, tokenizer
    if model is None:
        print(f"📥 Loading tokenizer from {TOKENIZER_REPO}...")
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO)
        if tokenizer.pad_token is None:
            # Llama tokenizers ship without a pad token; reuse EOS.
            tokenizer.pad_token = tokenizer.eos_token

        print(f"📥 Loading model weights (Full Precision for CPU)...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_REPO,
            # CHANGE: Use float32 because CPU doesn't support 'Half' (float16)
            torch_dtype=torch.float32,
            device_map="cpu",  # Explicitly target CPU
            low_cpu_mem_usage=True,
        )
        print("✅ Success: System is online on CPU!")


load_model()


def chat(message, history):
    """Generate one assistant reply for `gr.ChatInterface`.

    Args:
        message: The new user message (str).
        history: Prior turns as (user, assistant) pairs; only the last 3
            are included in the prompt to bound context length.

    Returns:
        The model's reply as a stripped string, or an apology on error.
    """
    if not message.strip():
        return ""

    try:
        # Build prompt using Llama 3.2 Instruct format
        # This format helps the model understand it's a conversation
        prompt = (
            f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
            f"\n\n{SYSTEM_PROMPT}<|eot_id|>"
        )
        for user_msg, bot_msg in history[-3:]:
            prompt += (
                f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|>"
                f"<|start_header_id|>assistant<|end_header_id|>\n\n{bot_msg}<|eot_id|>"
            )
        prompt += (
            f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n"
        )

        # Tokenize and move to the exact same device as the model
        inputs = tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=2048
        ).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.6,  # Slightly lower for more stable therapy responses
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the new tokens (skip the echoed prompt)
        input_len = inputs.input_ids.shape[1]
        response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
        return response.strip()

    except Exception as e:
        # Boundary handler: surface the error in-chat instead of crashing the UI.
        print(f"❌ Generation Error: {e}")
        return f"I'm sorry, I encountered an error: {str(e)}. Please try again."


demo = gr.ChatInterface(
    fn=chat,
    title="💚 E.motion Therapy Assistant",
    theme=gr.themes.Soft(),
    chatbot=gr.Chatbot(height=450),
)

if __name__ == "__main__":
    demo.launch()