"""Lumen — a streaming Gradio chat UI around Qwen2.5-Coder-7B with a LoRA adapter."""
import gradio as gr
import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

BASE_MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
LORA_REPO = "alxstuff/Lumen-7b-v2"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

print("Loading LoRA adapter...")
model.load_adapter(LORA_REPO)
model.eval()
print("✅ Lumen ready!")


def _history_pairs(history):
    """Normalize Gradio chat history into (user, assistant) tuples.

    Gradio delivers history either as ``[(user, assistant), ...]`` (legacy
    ``type="tuples"``) or as ``[{"role": ..., "content": ...}, ...]``
    (``type="messages"``, the Gradio 5 default). The original code only
    handled tuples and crashed with ValueError on the dict format; support
    both so the app works across Gradio versions.
    """
    if not history:
        return []
    if isinstance(history[0], dict):
        pairs, pending_user = [], None
        for msg in history:
            if msg.get("role") == "user":
                pending_user = msg.get("content", "")
            elif msg.get("role") == "assistant" and pending_user is not None:
                pairs.append((pending_user, msg.get("content", "")))
                pending_user = None
        return pairs
    return [tuple(turn) for turn in history]


def chat(message, history):
    """Generate a streamed reply to *message* given prior *history*.

    Builds a ChatML prompt by hand (Qwen's <|im_start|>/<|im_end|> format),
    runs ``model.generate`` in a background thread, and yields the
    cumulative response text as tokens arrive — the shape Gradio's
    ChatInterface expects from a generator ``fn``.
    """
    prompt = "<|im_start|>system\nYou are Lumen, an expert AI coding assistant built by TheAlxLabs. You write clean, efficient code and explain it clearly.<|im_end|>\n"
    for user, assistant in _history_pairs(history):
        prompt += f"<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n{assistant}<|im_end|>\n"
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    thread = Thread(
        target=model.generate,
        kwargs={
            **inputs,
            "streamer": streamer,
            "max_new_tokens": 1024,
            "temperature": 0.2,
            "do_sample": True,
            # Explicit pad id avoids the per-call "pad_token_id not set" warning.
            "pad_token_id": tokenizer.eos_token_id,
        },
    )
    thread.start()
    try:
        response = ""
        for token in streamer:
            response += token
            yield response
    finally:
        # Reap the generation thread so each request doesn't leak a thread
        # (the original never joined it).
        thread.join()


gr.ChatInterface(
    fn=chat,
    title="⚡ Lumen — AI Coding Assistant",
    description="Local-first AI coding assistant by TheAlxLabs.",
    examples=[
        "Write a Python function to reverse a linked list",
        "Explain what this does: `[x for x in range(10) if x % 2 == 0]`",
        "Fix this bug: TypeError: 'NoneType' object is not subscriptable",
    ],
).launch()