Spaces:

Xerv-AI
/

Maxwell

Running

App Files Files Community

Phase-Technologies committed 16 days ago

Commit

b6b2f09

verified ·

1 Parent(s): c9f8abe

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -49

app.py CHANGED Viewed

@@ -1,97 +1,66 @@
-import sys
 import torch
-from transformers import TextIteratorStreamer
-from unsloth import FastLanguageModel
 import gradio as gr
 import threading
-# --- 1. CRITICAL LIBRARY PATCHES ---
-# These patches fix common initialization errors in specific environments
-try:
-    import unsloth_zoo.rl_replacements
-    for func in ['_unsloth_get_mm_token_id', '_unsloth_fix_mm_token_type_ids']:
-        if not hasattr(unsloth_zoo.rl_replacements, func):
-            setattr(unsloth_zoo.rl_replacements, func, lambda *args, **kwargs: None)
-except:
-    pass
-# --- 2. MODEL SETUP (Optimized for HF T4 GPUs) ---
 MODEL_NAME = "Xerv-AI/MAXWELL"
-MAX_CONTEXT = 4096
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name = MODEL_NAME,
-    max_seq_length = MAX_CONTEXT,
-    load_in_4bit = True,
-    device_map = "auto",
-    attn_implementation = "sdpa"
 )
-# --- 3. INFERENCE LOGIC ---
 def stream_maxwell(message, history):
-    # Construct the prompt using the Maxwell/Qwen2.5 Chat Template
-    prompt = "<|im_start|>system\nYou are Maxwell, a highly analytical STEM assistant. Keep your responses very direct and to the point. Wrap your internal thought process in <reasoning> tags.<|im_end|>\n"
     for user_msg, assistant_msg in history:
         prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
     prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
-    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     gen_kwargs = dict(
         **inputs,
-        max_new_tokens=1024,
         temperature=0.3,
         do_sample=True,
         streamer=streamer,
-        pad_token_id=tokenizer.eos_token_id
     )
-    # Run generation in a separate thread to allow Gradio to stream tokens
     thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
     thread.start()
     partial_text = ""
     for new_text in streamer:
         partial_text += new_text
-        # UI Transformation: Wrap <reasoning> tags in a Gradio-friendly style
-        # Note: Gradio markdown supports HTML for collapsible sections
         display_text = partial_text
         if "<reasoning>" in display_text:
             display_text = display_text.replace("<reasoning>", "\n\n<details><summary><b>🔍 Internal Trace</b></summary><i>")
         if "</reasoning>" in display_text:
             display_text = display_text.replace("</reasoning>", "</i></details>\n\n")
         yield display_text
-# --- 4. UI DESIGN (Maxwell Aesthetic) ---
 custom_css = """
 footer {visibility: hidden}
 .gradio-container {background-color: #121212 !important; color: white !important;}
-.message-user {background-color: #242424 !important; border-radius: 20px 20px 0px 20px !important;}
-.message-bot {background-color: transparent !important; font-size: 1.1em !important;}
-details {
-    background: #1A1A1A;
-    border-left: 2px solid #3b82f6;
-    padding: 10px;
-    border-radius: 0 8px 8px 0;
-    margin: 10px 0;
-    color: #A0A0A0;
-}
-summary { cursor: pointer; color: #5c94ff; font-weight: bold; }
 """
 demo = gr.ChatInterface(
     fn=stream_maxwell,
-    title="M.",
-    description="Welcome back, Sire. The computational throne awaits.",
-    theme=gr.themes.Default(primary_hue="blue", neutral_hue="zinc"),
     css=custom_css,
-    examples=["Solve for x: 2x^2 - 5x + 3 = 0", "Explain the concept of quantum entanglement simply."],
-    cache_examples=False
 )
 if __name__ == "__main__":

 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import gradio as gr
 import threading
+# --- 1. MODEL SETUP (CPU COMPATIBLE) ---
 MODEL_NAME = "Xerv-AI/MAXWELL"
+print("Loading model on CPU... this may take a few minutes.")
+# We load full-precision (float32) weights because 4-bit (bitsandbytes) quantization is GPU-only
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    device_map="cpu",
+    torch_dtype=torch.float32  # CPU requires float32 for stability
 )
+# --- 2. INFERENCE LOGIC ---
 def stream_maxwell(message, history):
+    prompt = f"<|im_start|>system\nYou are Maxwell, a highly analytical STEM assistant. Keep your responses very direct and to the point. Wrap your internal thought process in <reasoning> tags.<|im_end|>\n"
     for user_msg, assistant_msg in history:
         prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
     prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
+    inputs = tokenizer([prompt], return_tensors="pt").to("cpu")
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     gen_kwargs = dict(
         **inputs,
+        max_new_tokens=512, # Reduced for CPU speed
         temperature=0.3,
         do_sample=True,
         streamer=streamer,
     )
     thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
     thread.start()
     partial_text = ""
     for new_text in streamer:
         partial_text += new_text
         display_text = partial_text
         if "<reasoning>" in display_text:
             display_text = display_text.replace("<reasoning>", "\n\n<details><summary><b>🔍 Internal Trace</b></summary><i>")
         if "</reasoning>" in display_text:
             display_text = display_text.replace("</reasoning>", "</i></details>\n\n")
         yield display_text
+# --- 3. UI DESIGN ---
 custom_css = """
 footer {visibility: hidden}
 .gradio-container {background-color: #121212 !important; color: white !important;}
+details { background: #1A1A1A; border-left: 2px solid #3b82f6; padding: 10px; margin: 10px 0; color: #A0A0A0; }
 """
 demo = gr.ChatInterface(
     fn=stream_maxwell,
+    title="M. (CPU Mode)",
+    description="The computational throne is currently on backup power (CPU). Expect slower response times.",
     css=custom_css,
+    theme=gr.themes.Default(primary_hue="blue", neutral_hue="zinc")
 )
 if __name__ == "__main__":