import threading

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# --- 1. MODEL SETUP (CPU COMPATIBLE) ---
MODEL_NAME = "Xerv-AI/MAXWELL"

print("Loading model on CPU... this may take a few minutes.")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cpu",
    torch_dtype=torch.float32,
)


# --- 2. INFERENCE LOGIC ---
def stream_maxwell(message, history):
    """Stream a reply from the model for a Gradio ChatInterface.

    Builds a ChatML-style prompt from the system persona, the prior
    (user, assistant) turns in ``history``, and the new ``message``,
    then runs ``model.generate`` on a background thread and yields the
    growing response text as it streams in. ``<think>`` spans emitted by
    the model are rewritten into a collapsible HTML ``<details>`` block
    (styled by ``custom_css`` below).

    Args:
        message: The new user message (str).
        history: List of (user_msg, assistant_msg) tuples from Gradio.

    Yields:
        str: The partial response, with the internal trace wrapped in
        ``<details><summary>…</summary> … </details>`` markup.
    """
    # NOTE(review): the tag name below was lost in a formatting mangle;
    # "<think>" is reconstructed — confirm against the model's chat template.
    prompt = (
        "<|im_start|>system\n"
        "You are Maxwell, a highly analytical STEM assistant. "
        "Keep your responses very direct and to the point. "
        "Wrap your internal thought process in <think> tags.<|im_end|>\n"
    )
    for user_msg, assistant_msg in history:
        prompt += (
            f"<|im_start|>user\n{user_msg}<|im_end|>\n"
            f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
        )
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    inputs = tokenizer([prompt], return_tensors="pt").to("cpu")
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=512,
        temperature=0.3,
        do_sample=True,
        streamer=streamer,
    )
    # generate() blocks, so it runs on a worker thread while we consume
    # the streamer on this one.
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        display_text = partial_text
        # BUG FIX: the tag-name literals had been stripped, leaving
        # str.replace("", ...) — which inserts the replacement between
        # EVERY character of the string (and `"" in s` is always True).
        # Restored the <think> tags and the <details>/<summary> HTML
        # that custom_css styles.
        if "<think>" in display_text:
            display_text = display_text.replace(
                "<think>",
                "\n\n<details><summary>🔍 Internal Trace</summary>\n\n",
            )
        if "</think>" in display_text:
            display_text = display_text.replace("</think>", "\n\n</details>\n\n")
        yield display_text


# --- 3. UI DESIGN (Fixed for Gradio 4.0+) ---
custom_css = """
footer {visibility: hidden !important;}
.gradio-container {background-color: #121212 !important; color: white !important;}
details { background: #1A1A1A; border-left: 2px solid #3b82f6; padding: 10px; margin: 10px 0; color: #A0A0A0; }
summary { cursor: pointer; color: #5c94ff; font-weight: bold; }
"""

# Wrap ChatInterface in Blocks to apply the CSS
with gr.Blocks(
    css=custom_css,
    theme=gr.themes.Default(primary_hue="blue", neutral_hue="zinc"),
) as demo:
    gr.ChatInterface(
        fn=stream_maxwell,
        title="M. (CPU Mode)",
        description="The computational throne is currently on backup power (CPU).",
    )

if __name__ == "__main__":
    demo.queue().launch()