import threading

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# --- 1. MODEL SETUP (CPU COMPATIBLE) ---
MODEL_NAME = "Xerv-AI/MAXWELL"

print("Loading model on CPU... this may take a few minutes.")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cpu",
    torch_dtype=torch.float32,
)


# --- 2. INFERENCE LOGIC ---
def stream_maxwell(message, history):
    """Stream a reply from the model for a Gradio ChatInterface.

    Builds a ChatML-style prompt from the system persona, the prior
    (user, assistant) turns in ``history``, and the new ``message``,
    then runs ``model.generate`` on a background thread and yields the
    growing response text as it streams in. ``<think>`` spans emitted by
    the model are rewritten into a collapsible HTML ``<details>`` block
    (styled by ``custom_css`` below).

    Args:
        message: The new user message (str).
        history: List of (user_msg, assistant_msg) tuples from Gradio.

    Yields:
        str: The partial response, with the internal trace wrapped in
        ``<details><summary>…</summary> … </details>`` markup.
    """
    # NOTE(review): the tag name below was lost in a formatting mangle;
    # "<think>" is reconstructed — confirm against the model's chat template.
    prompt = (
        "<|im_start|>system\n"
        "You are Maxwell, a highly analytical STEM assistant. "
        "Keep your responses very direct and to the point. "
        "Wrap your internal thought process in <think> tags.<|im_end|>\n"
    )
    for user_msg, assistant_msg in history:
        prompt += (
            f"<|im_start|>user\n{user_msg}<|im_end|>\n"
            f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
        )
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    inputs = tokenizer([prompt], return_tensors="pt").to("cpu")
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=512,
        temperature=0.3,
        do_sample=True,
        streamer=streamer,
    )
    # generate() blocks, so it runs on a worker thread while we consume
    # the streamer on this one.
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        display_text = partial_text
        # BUG FIX: the tag-name literals had been stripped, leaving
        # str.replace("", ...) — which inserts the replacement between
        # EVERY character of the string (and `"" in s` is always True).
        # Restored the <think> tags and the <details>/<summary> HTML
        # that custom_css styles.
        if "<think>" in display_text:
            display_text = display_text.replace(
                "<think>",
                "\n\n<details><summary>🔍 Internal Trace</summary>\n\n",
            )
        if "</think>" in display_text:
            display_text = display_text.replace("</think>", "\n\n</details>\n\n")
        yield display_text


# --- 3. UI DESIGN (Fixed for Gradio 4.0+) ---
custom_css = """
footer {visibility: hidden !important;}
.gradio-container {background-color: #121212 !important; color: white !important;}
details { background: #1A1A1A; border-left: 2px solid #3b82f6; padding: 10px; margin: 10px 0; color: #A0A0A0; }
summary { cursor: pointer; color: #5c94ff; font-weight: bold; }
"""

# Wrap ChatInterface in Blocks to apply the CSS
with gr.Blocks(
    css=custom_css,
    theme=gr.themes.Default(primary_hue="blue", neutral_hue="zinc"),
) as demo:
    gr.ChatInterface(
        fn=stream_maxwell,
        title="M. (CPU Mode)",
        description="The computational throne is currently on backup power (CPU).",
    )

if __name__ == "__main__":
    demo.queue().launch()