Update app.py
Browse files
app.py
CHANGED
|
@@ -1,97 +1,66 @@
|
|
| 1 |
-
import sys
|
| 2 |
import torch
|
| 3 |
-
from transformers import TextIteratorStreamer
|
| 4 |
-
from unsloth import FastLanguageModel
|
| 5 |
import gradio as gr
|
| 6 |
import threading
|
| 7 |
|
| 8 |
-
# --- 1.
|
| 9 |
-
# These patches fix common initialization errors in specific environments
|
| 10 |
-
# Best-effort compatibility shim: install no-op stand-ins for helper functions
# that are absent from unsloth_zoo.rl_replacements.
try:
    import unsloth_zoo.rl_replacements

    for func in ("_unsloth_get_mm_token_id", "_unsloth_fix_mm_token_type_ids"):
        if not hasattr(unsloth_zoo.rl_replacements, func):
            # The stand-in accepts any arguments and returns None.
            setattr(unsloth_zoo.rl_replacements, func, lambda *args, **kwargs: None)
except Exception as exc:
    # Patching stays optional, so a failure here must not abort start-up.
    # Catching Exception (rather than a bare ``except:``) lets
    # KeyboardInterrupt and SystemExit propagate, and the message makes the
    # skipped patch visible instead of silently ignoring it.
    print(f"Skipping unsloth_zoo compatibility patches: {exc!r}")
|
| 17 |
-
|
| 18 |
-
# --- 2. MODEL SETUP (Optimized for HF T4 GPUs) ---
|
| 19 |
MODEL_NAME = "Xerv-AI/MAXWELL"
|
| 20 |
-
MAX_CONTEXT = 4096
|
| 21 |
|
| 22 |
-
model
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
| 28 |
)
|
| 29 |
|
| 30 |
-
# ---
|
| 31 |
def stream_maxwell(message, history):
|
| 32 |
-
|
| 33 |
-
prompt = "<|im_start|>system\nYou are Maxwell, a highly analytical STEM assistant. Keep your responses very direct and to the point. Wrap your internal thought process in <reasoning> tags.<|im_end|>\n"
|
| 34 |
|
| 35 |
for user_msg, assistant_msg in history:
|
| 36 |
prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
|
| 37 |
|
| 38 |
prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
|
| 39 |
|
| 40 |
-
inputs = tokenizer([prompt], return_tensors="pt").to("
|
| 41 |
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
|
| 42 |
|
| 43 |
gen_kwargs = dict(
|
| 44 |
**inputs,
|
| 45 |
-
max_new_tokens=
|
| 46 |
temperature=0.3,
|
| 47 |
do_sample=True,
|
| 48 |
streamer=streamer,
|
| 49 |
-
pad_token_id=tokenizer.eos_token_id
|
| 50 |
)
|
| 51 |
|
| 52 |
-
# Run generation in a separate thread to allow Gradio to stream tokens
|
| 53 |
thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
|
| 54 |
thread.start()
|
| 55 |
|
| 56 |
partial_text = ""
|
| 57 |
for new_text in streamer:
|
| 58 |
partial_text += new_text
|
| 59 |
-
|
| 60 |
-
# UI Transformation: Wrap <reasoning> tags in a Gradio-friendly style
|
| 61 |
-
# Note: Gradio markdown supports HTML for collapsible sections
|
| 62 |
display_text = partial_text
|
| 63 |
if "<reasoning>" in display_text:
|
| 64 |
display_text = display_text.replace("<reasoning>", "\n\n<details><summary><b>🔍 Internal Trace</b></summary><i>")
|
| 65 |
if "</reasoning>" in display_text:
|
| 66 |
display_text = display_text.replace("</reasoning>", "</i></details>\n\n")
|
| 67 |
-
|
| 68 |
yield display_text
|
| 69 |
|
| 70 |
-
# ---
|
| 71 |
custom_css = """
|
| 72 |
footer {visibility: hidden}
|
| 73 |
.gradio-container {background-color: #121212 !important; color: white !important;}
|
| 74 |
-
|
| 75 |
-
.message-bot {background-color: transparent !important; font-size: 1.1em !important;}
|
| 76 |
-
details {
|
| 77 |
-
background: #1A1A1A;
|
| 78 |
-
border-left: 2px solid #3b82f6;
|
| 79 |
-
padding: 10px;
|
| 80 |
-
border-radius: 0 8px 8px 0;
|
| 81 |
-
margin: 10px 0;
|
| 82 |
-
color: #A0A0A0;
|
| 83 |
-
}
|
| 84 |
-
summary { cursor: pointer; color: #5c94ff; font-weight: bold; }
|
| 85 |
"""
|
| 86 |
|
| 87 |
demo = gr.ChatInterface(
|
| 88 |
fn=stream_maxwell,
|
| 89 |
-
title="M.",
|
| 90 |
-
description="
|
| 91 |
-
theme=gr.themes.Default(primary_hue="blue", neutral_hue="zinc"),
|
| 92 |
css=custom_css,
|
| 93 |
-
|
| 94 |
-
cache_examples=False
|
| 95 |
)
|
| 96 |
|
| 97 |
if __name__ == "__main__":
|
|
|
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
import threading
|
| 5 |
|
| 6 |
+
# --- 1. MODEL SETUP (CPU COMPATIBLE) ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
MODEL_NAME = "Xerv-AI/MAXWELL"
|
|
|
|
| 8 |
|
| 9 |
+
print("Loading model on CPU... this may take a few minutes.")

# Load the tokenizer and the unquantized float32 weights directly on the CPU.
# (Per the original author's note, 4-bit bitsandbytes quantization is GPU-only,
# so no quantized loading is attempted here.)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cpu",
    torch_dtype=torch.float32,  # author's note: float32 is used on CPU for stability
)
|
| 17 |
|
| 18 |
+
# --- 2. INFERENCE LOGIC ---
|
| 19 |
# System turn that opens every prompt sent to the model.
_SYSTEM_PROMPT = (
    "<|im_start|>system\n"
    "You are Maxwell, a highly analytical STEM assistant. Keep your responses "
    "very direct and to the point. Wrap your internal thought process in "
    "<reasoning> tags.<|im_end|>\n"
)

# HTML that replaces the model's <reasoning> tags so the trace is collapsible.
_REASONING_OPEN_HTML = "\n\n<details><summary><b>🔍 Internal Trace</b></summary><i>"
_REASONING_CLOSE_HTML = "</i></details>\n\n"


def _build_prompt(message, history):
    """Return the full ``<|im_start|>``/``<|im_end|>``-delimited prompt text."""
    turns = [_SYSTEM_PROMPT]
    for user_msg, assistant_msg in history:
        turns.append(
            f"<|im_start|>user\n{user_msg}<|im_end|>\n"
            f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
        )
    # The trailing open assistant turn is what the model continues from.
    turns.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
    return "".join(turns)


def _render_reasoning(text):
    """Swap ``<reasoning>``/``</reasoning>`` tags for collapsible HTML markup."""
    # str.replace is a no-op when the tag is absent, so no membership test is needed.
    return text.replace("<reasoning>", _REASONING_OPEN_HTML).replace(
        "</reasoning>", _REASONING_CLOSE_HTML
    )


def stream_maxwell(message, history):
    """Stream the model's reply to ``message`` as progressively longer text.

    Args:
        message: The newest user message.
        history: Previous turns, iterated as ``(user_msg, assistant_msg)`` pairs.

    Yields:
        The cumulative reply generated so far, with ``<reasoning>`` tags
        replaced by ``<details>`` HTML.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once the stream has been drained.
    """
    inputs = tokenizer([_build_prompt(message, history)], return_tensors="pt").to("cpu")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    gen_kwargs = dict(
        **inputs,
        max_new_tokens=512,  # Reduced for CPU speed
        temperature=0.3,
        do_sample=True,
        streamer=streamer,
    )

    failures = []

    def _generate():
        # Runs in the worker thread. If generation fails, the streamer would
        # never receive its end-of-stream signal and the consumer loop below
        # would block forever, so the stream is ended explicitly and the
        # error is kept for the caller.
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:
            failures.append(exc)
            streamer.end()

    # Generation runs in a separate thread so tokens can be yielded as they arrive.
    thread = threading.Thread(target=_generate)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield _render_reasoning(partial_text)

    thread.join()
    if failures:
        raise failures[0]
|
| 50 |
|
| 51 |
+
# --- 3. UI DESIGN ---
|
| 52 |
# Stylesheet handed to Gradio: hides the footer, applies a dark background to
# the container, and styles the <details> element used for the reasoning trace.
custom_css = """
footer {visibility: hidden}
.gradio-container {background-color: #121212 !important; color: white !important;}
details { background: #1A1A1A; border-left: 2px solid #3b82f6; padding: 10px; margin: 10px 0; color: #A0A0A0; }
"""

# Chat UI wired to the streaming generator defined in this file.
demo = gr.ChatInterface(
    fn=stream_maxwell,
    theme=gr.themes.Default(primary_hue="blue", neutral_hue="zinc"),
    css=custom_css,
    title="M. (CPU Mode)",
    description="The computational throne is currently on backup power (CPU). Expect slower response times.",
)
|
| 65 |
|
| 66 |
if __name__ == "__main__":
|