import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub repository id passed to both from_pretrained() calls.
MODEL_ID = "dispatchAI/SmolLM2-135M-Instruct-mobile"

# Process-wide cache for the loaded tokenizer and model. Both names stay None
# until load_model() populates them on first use.
tokenizer = model = None

def load_model():
    """Load the tokenizer and model on first use and cache them globally.

    Each of the two module-level globals is checked on its own. If an earlier
    call loaded the tokenizer but the model load raised, the next call retries
    the model instead of returning a ``(tokenizer, None)`` pair.

    Returns:
        The ``(tokenizer, model)`` pair stored in the module globals.
    """
    global tokenizer, model
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    if model is None:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
        )
    return tokenizer, model

def _content_to_text(content):
    """Return the text of one history entry, or None when it has no text."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): some Gradio versions appear to deliver content as a
        # list of typed parts (e.g. {"type": "text", "text": ...}) -- confirm
        # against the installed version. Only textual parts are kept.
        texts = []
        for part in content:
            if isinstance(part, str):
                texts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                texts.append(part["text"])
        return "".join(texts) if texts else None
    # None or any other non-text payload: nothing to feed a text-only model.
    return None


def _history_to_messages(history):
    """Convert chat history into a list of role/content message dicts.

    Accepts entries that are role/content dicts as well as entries that are
    ``(user_text, assistant_text)`` pairs. Entries without text are skipped.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            entries = [(turn.get("role"), turn.get("content"))]
        else:
            entries = [("user", turn[0]), ("assistant", turn[1])]
        for role, content in entries:
            text = _content_to_text(content)
            if role and text is not None:
                messages.append({"role": role, "content": text})
    return messages


@spaces.GPU
def chat(message, history):
    """Generate the assistant's reply to ``message`` given prior ``history``.

    Args:
        message: The new user message.
        history: Earlier turns, either as ``(user, assistant)`` pairs or as
            dicts with ``"role"`` and ``"content"`` keys.

    Returns:
        The newly generated text, decoded with special tokens removed.
    """
    tokenizer, model = load_model()

    messages = [{"role": "system", "content": "You are a helpful assistant running on a mobile-optimized model."}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The chat template has already inserted the special tokens; do not let
    # the tokenizer add them a second time.
    inputs = tokenizer(input_text, return_tensors="pt", add_special_tokens=False).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

# Chat UI that routes every user message through chat().
demo = gr.ChatInterface(
    fn=chat,
    title="🚀 dispatchAI Mobile Chat",
    # Interpolate MODEL_ID so the blurb cannot drift from the model that is
    # actually loaded.
    description=(
        f"Chat with {MODEL_ID} — a 135M parameter model optimized for "
        "mobile devices. This runs on ZeroGPU."
    ),
    theme=gr.themes.Soft(primary_hue="blue"),
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()