"""Minimal CPU chatbot: microsoft/phi-2 served through a Gradio ChatInterface."""

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# phi-2 (~2.7B params) is small enough to run on CPU in float32.
MODEL_ID = "microsoft/phi-2"

# Load tokenizer and model once at startup.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    device_map="auto",
)

# Shared text-generation pipeline used by every chat request.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.7,
)


def chat_fn(message, history):
    """Generate one assistant reply for the Gradio ChatInterface.

    Args:
        message: The user's latest message (str).
        history: Prior conversation in "messages" format — a flat list of
            ``{"role": ..., "content": ...}`` dicts where user and assistant
            turns are SEPARATE entries (there is no "response" key).

    Returns:
        The assistant reply as a plain string, which is a valid
        ``ChatInterface`` return value; a ``{"response": ...}`` dict is not.
    """
    # Rebuild the full prompt from history, tagging each turn by its role.
    prompt = ""
    for turn in history:
        tag = "user" if turn["role"] == "user" else "assistant"
        prompt += f"<|{tag}|>\n{turn['content']}\n"
    prompt += f"<|user|>\n{message}\n<|assistant|>\n"

    # return_full_text=False makes the pipeline return only the newly
    # generated text, which is more robust than stripping the prompt back
    # out with str.replace (that breaks on any whitespace/tokenization
    # difference in the echoed prompt).
    output = generator(prompt, max_new_tokens=256, return_full_text=False)
    return output[0]["generated_text"].strip()


# Pass type="messages" explicitly: older Gradio versions default to the
# deprecated tuples history format, which would break chat_fn's assumptions.
chatbot_ui = gr.ChatInterface(
    fn=chat_fn,
    type="messages",
    title="Phi-2 Chatbot",
    theme="default",
)

if __name__ == "__main__":
    chatbot_ui.launch()