Phase-Technologies committed on
Commit
b6b2f09
·
verified ·
1 Parent(s): c9f8abe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -49
app.py CHANGED
@@ -1,97 +1,66 @@
1
- import sys
2
  import torch
3
- from transformers import TextIteratorStreamer
4
- from unsloth import FastLanguageModel
5
  import gradio as gr
6
  import threading
7
 
8
- # --- 1. CRITICAL LIBRARY PATCHES ---
9
- # These patches fix common initialization errors in specific environments
10
- try:
11
- import unsloth_zoo.rl_replacements
12
- for func in ['_unsloth_get_mm_token_id', '_unsloth_fix_mm_token_type_ids']:
13
- if not hasattr(unsloth_zoo.rl_replacements, func):
14
- setattr(unsloth_zoo.rl_replacements, func, lambda *args, **kwargs: None)
15
- except:
16
- pass
17
-
18
- # --- 2. MODEL SETUP (Optimized for HF T4 GPUs) ---
19
  MODEL_NAME = "Xerv-AI/MAXWELL"
20
- MAX_CONTEXT = 4096
21
 
22
- model, tokenizer = FastLanguageModel.from_pretrained(
23
- model_name = MODEL_NAME,
24
- max_seq_length = MAX_CONTEXT,
25
- load_in_4bit = True,
26
- device_map = "auto",
27
- attn_implementation = "sdpa"
 
28
  )
29
 
30
- # --- 3. INFERENCE LOGIC ---
31
  def stream_maxwell(message, history):
32
- # Construct the prompt using the Maxwell/Qwen2.5 Chat Template
33
- prompt = "<|im_start|>system\nYou are Maxwell, a highly analytical STEM assistant. Keep your responses very direct and to the point. Wrap your internal thought process in <reasoning> tags.<|im_end|>\n"
34
 
35
  for user_msg, assistant_msg in history:
36
  prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
37
 
38
  prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
39
 
40
- inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
41
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
42
 
43
  gen_kwargs = dict(
44
  **inputs,
45
- max_new_tokens=1024,
46
  temperature=0.3,
47
  do_sample=True,
48
  streamer=streamer,
49
- pad_token_id=tokenizer.eos_token_id
50
  )
51
 
52
- # Run generation in a separate thread to allow Gradio to stream tokens
53
  thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
54
  thread.start()
55
 
56
  partial_text = ""
57
  for new_text in streamer:
58
  partial_text += new_text
59
-
60
- # UI Transformation: Wrap <reasoning> tags in a Gradio-friendly style
61
- # Note: Gradio markdown supports HTML for collapsible sections
62
  display_text = partial_text
63
  if "<reasoning>" in display_text:
64
  display_text = display_text.replace("<reasoning>", "\n\n<details><summary><b>🔍 Internal Trace</b></summary><i>")
65
  if "</reasoning>" in display_text:
66
  display_text = display_text.replace("</reasoning>", "</i></details>\n\n")
67
-
68
  yield display_text
69
 
70
- # --- 4. UI DESIGN (Maxwell Aesthetic) ---
71
  custom_css = """
72
  footer {visibility: hidden}
73
  .gradio-container {background-color: #121212 !important; color: white !important;}
74
- .message-user {background-color: #242424 !important; border-radius: 20px 20px 0px 20px !important;}
75
- .message-bot {background-color: transparent !important; font-size: 1.1em !important;}
76
- details {
77
- background: #1A1A1A;
78
- border-left: 2px solid #3b82f6;
79
- padding: 10px;
80
- border-radius: 0 8px 8px 0;
81
- margin: 10px 0;
82
- color: #A0A0A0;
83
- }
84
- summary { cursor: pointer; color: #5c94ff; font-weight: bold; }
85
  """
86
 
87
  demo = gr.ChatInterface(
88
  fn=stream_maxwell,
89
- title="M.",
90
- description="Welcome back, Sire. The computational throne awaits.",
91
- theme=gr.themes.Default(primary_hue="blue", neutral_hue="zinc"),
92
  css=custom_css,
93
- examples=["Solve for x: 2x^2 - 5x + 3 = 0", "Explain the concept of quantum entanglement simply."],
94
- cache_examples=False
95
  )
96
 
97
  if __name__ == "__main__":
 
 
1
  import torch
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
3
  import gradio as gr
4
  import threading
5
 
6
+ # --- 1. MODEL SETUP (CPU COMPATIBLE) ---
 
 
 
 
 
 
 
 
 
 
7
  MODEL_NAME = "Xerv-AI/MAXWELL"
 
8
 
9
+ print("Loading model on CPU... this may take a few minutes.")
10
+ # We load full-precision float32 weights because 4-bit (bitsandbytes) is GPU-only
11
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
12
+ model = AutoModelForCausalLM.from_pretrained(
13
+ MODEL_NAME,
14
+ device_map="cpu",
15
+ torch_dtype=torch.float32 # CPU requires float32 for stability
16
  )
17
 
18
+ # --- 2. INFERENCE LOGIC ---
19
  def stream_maxwell(message, history):
20
+ prompt = f"<|im_start|>system\nYou are Maxwell, a highly analytical STEM assistant. Keep your responses very direct and to the point. Wrap your internal thought process in <reasoning> tags.<|im_end|>\n"
 
21
 
22
  for user_msg, assistant_msg in history:
23
  prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
24
 
25
  prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
26
 
27
+ inputs = tokenizer([prompt], return_tensors="pt").to("cpu")
28
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
29
 
30
  gen_kwargs = dict(
31
  **inputs,
32
+ max_new_tokens=512, # Reduced for CPU speed
33
  temperature=0.3,
34
  do_sample=True,
35
  streamer=streamer,
 
36
  )
37
 
 
38
  thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
39
  thread.start()
40
 
41
  partial_text = ""
42
  for new_text in streamer:
43
  partial_text += new_text
 
 
 
44
  display_text = partial_text
45
  if "<reasoning>" in display_text:
46
  display_text = display_text.replace("<reasoning>", "\n\n<details><summary><b>🔍 Internal Trace</b></summary><i>")
47
  if "</reasoning>" in display_text:
48
  display_text = display_text.replace("</reasoning>", "</i></details>\n\n")
 
49
  yield display_text
50
 
51
+ # --- 3. UI DESIGN ---
52
  custom_css = """
53
  footer {visibility: hidden}
54
  .gradio-container {background-color: #121212 !important; color: white !important;}
55
+ details { background: #1A1A1A; border-left: 2px solid #3b82f6; padding: 10px; margin: 10px 0; color: #A0A0A0; }
 
 
 
 
 
 
 
 
 
 
56
  """
57
 
58
  demo = gr.ChatInterface(
59
  fn=stream_maxwell,
60
+ title="M. (CPU Mode)",
61
+ description="The computational throne is currently on backup power (CPU). Expect slower response times.",
 
62
  css=custom_css,
63
+ theme=gr.themes.Default(primary_hue="blue", neutral_hue="zinc")
 
64
  )
65
 
66
  if __name__ == "__main__":