import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import gradio as gr
import threading

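# Model repo on the Hugging Face Hub. The hand-built prompt below assumes a
# ChatML-style template (<|im_start|> / <|im_end|> markers).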
MODEL_NAME = "Xerv-AI/MAXWELL"

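# Load tokenizer and weights fully onto the CPU in float32 precision.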
| print("Loading model on CPU... this may take a few minutes.") |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) |
| model = AutoModelForCausalLM.from_pretrained( |
| MODEL_NAME, |
| device_map="cpu", |
| torch_dtype=torch.float32 |
| ) |
|
|
| |
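# Build a ChatML prompt from the running chat history, generate on a background
# thread, and stream the reply back to the UI token by token.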
def stream_maxwell(message, history):
    # `history` arrives as a list of (user, assistant) pairs (Gradio's tuple-style history).
    prompt = "<|im_start|>system\nYou are Maxwell, a highly analytical STEM assistant. Keep your responses very direct and to the point. Wrap your internal thought process in <reasoning> tags.<|im_end|>\n"

    for user_msg, assistant_msg in history:
        prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"

    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    inputs = tokenizer([prompt], return_tensors="pt").to("cpu")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks until completion, so run it on a worker thread and
    # read tokens off the streamer as they arrive.
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=512,
        temperature=0.3,
        do_sample=True,
        streamer=streamer,
    )

    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    # Accumulate streamed text, rewriting the model's <reasoning> tags into a
    # collapsible <details> block as they appear. str.replace is a no-op when
    # the tag is absent, so no membership check is needed.
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        display_text = partial_text.replace(
            "<reasoning>", "\n\n<details><summary><b>🔍 Internal Trace</b></summary><i>"
        ).replace("</reasoning>", "</i></details>\n\n")
        yield display_text

    thread.join()

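# Dark-theme CSS; the details/summary rules style the collapsible reasoning trace.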
custom_css = """
footer {visibility: hidden !important;}
.gradio-container {background-color: #121212 !important; color: white !important;}
details { background: #1A1A1A; border-left: 2px solid #3b82f6; padding: 10px; margin: 10px 0; color: #A0A0A0; }
summary { cursor: pointer; color: #5c94ff; font-weight: bold; }
"""

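# Wrap ChatInterface in Blocks so the custom CSS and theme apply to the whole page.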
with gr.Blocks(css=custom_css, theme=gr.themes.Default(primary_hue="blue", neutral_hue="zinc")) as demo:
    gr.ChatInterface(
        fn=stream_maxwell,
        title="M. (CPU Mode)",
        description="The computational throne is currently on backup power (CPU).",
    )

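# Enable request queueing explicitly; streamed (generator) responses rely on it
# in older Gradio releases, and it is harmless where queueing is already the default.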
if __name__ == "__main__":
    demo.queue().launch()
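
# Usage sketch (assumes torch, transformers, and gradio are installed, and that
# this file is saved as e.g. app.py -- the filename is illustrative):
#   python app.py
# then open the local URL Gradio prints in a browser.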