everydaytok committed on
Commit 3dffc7e · verified
1 Parent(s): 17c0138

Update app.py

Files changed (1)
  1. app.py +48 -28
app.py CHANGED
@@ -4,41 +4,62 @@ from threading import Thread
 import torch
 import time
 import psutil
+import os
 
-# 1.5B is perfect for CPU stability
-model_id = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF"
-filename = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
+# CONFIGURATION
+# We load weights from the GGUF repo, but tokenizer from the ORIGINAL repo
+MODEL_ID = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF"
+GGUF_FILE = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
+TOKENIZER_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" # The fix is here
 
-print("Loading stable model...")
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    gguf_file=filename,
-    torch_dtype=torch.float32,
-    device_map="cpu"
-)
-tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Global variables for model and tokenizer
+model = None
+tokenizer = None
+load_status = "🔄 Initializing..."
+
+def load_model():
+    global model, tokenizer, load_status
+    try:
+        print(f"Loading tokenizer from {TOKENIZER_ID}...")
+        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)
+
+        print(f"Loading GGUF weights from {MODEL_ID}...")
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            gguf_file=GGUF_FILE,
+            torch_dtype=torch.float32,
+            device_map="cpu"
+        )
+        load_status = "✅ Model Loaded Successfully"
+    except Exception as e:
+        load_status = f"❌ Error: {str(e)}"
+        print(load_status)
+
+# Start loading in the background
+load_model()
 
 def get_stats():
     vm = psutil.virtual_memory()
     return f"RAM: {vm.percent}% | {vm.used / 1024**3:.1f}GB / 16GB"
 
 def chat(message, history):
-    # Proper DeepSeek-R1/Qwen prompt format
+    if model is None:
+        yield "Model is still loading or failed to load. Check status.", load_status
+        return
+
+    # DeepSeek-R1 Prompt Format
     prompt = f"<|begin_of_sentence|><|User|>{message}<|Assistant|><think>\n"
-
     inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
-    # Generation kwargs
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(
         inputs,
         streamer=streamer,
         max_new_tokens=1024,
-        do_sample=False, # Keeps it fast and logical for math
+        do_sample=False,
         pad_token_id=tokenizer.eos_token_id
     )
 
-    # Start generation in a background thread
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
@@ -46,30 +67,29 @@ def chat(message, history):
     generated_text = ""
     token_count = 0
 
-    # Yield from the streamer for real-time UI updates
     for new_text in streamer:
         generated_text += new_text
         token_count += 1
         elapsed = time.time() - start_time
         tps = token_count / elapsed if elapsed > 0 else 0
-        stats = f"⏱️ {elapsed:.1f}s | ⚡ {tps:.2f} t/s | {get_stats()}"
+        stats = f"⏱️ {elapsed:.1f}s | ⚡ {tps:.2f} t/s | {get_stats()} | {load_status}"
         yield generated_text, stats
 
-# Custom Gradio Blocks for better UI
-with gr.Blocks(theme=gr.themes.Default()) as demo:
-    gr.Markdown("# 🚀 DeepSeek-R1 CPU Optimizer")
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🚀 DeepSeek-R1 CPU Dashboard (v2.0)")
 
     with gr.Row():
         with gr.Column(scale=4):
-            chatbot = gr.Chatbot(label="Response (Thinking + JSON)")
-            msg = gr.Textbox(label="Input", placeholder="Enter your math problem...")
+            chatbot = gr.Chatbot(label="Response Console", height=500)
+            msg = gr.Textbox(label="Math/JSON Prompt", placeholder="Type here and press Enter...")
         with gr.Column(scale=1):
-            stats_box = gr.Markdown("### Live Stats\nWaiting...")
-            clear = gr.Button("Clear")
+            stats_box = gr.Markdown(f"### Live Metrics\n{get_stats()}\n{load_status}")
+            gr.Markdown("---")
+            gr.Markdown("**Note:** First run may take 60s to load weights into RAM.")
+            clear = gr.Button("Clear Chat")
 
     def respond(message, chat_history):
-        chat_history.append((message, ""))
-        return "", chat_history
+        return "", chat_history + [[message, ""]]
 
     def stream_bot(chat_history):
         user_input = chat_history[-1][0]
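
The visible hunks end inside `stream_bot`, so the commit does not show how the handlers are wired to the widgets. For orientation only, here is a minimal sketch of how `respond`, `stream_bot`, and the `Clear Chat` button are typically chained inside the `with gr.Blocks()` context in a streaming Gradio app; the body of `stream_bot`, the `.then()` chaining, and the launch call are assumptions drawn from the names in the diff, not necessarily what app.py actually contains.

```python
    # Hypothetical wiring sketch -- not part of the committed hunks.
    # Lives inside the `with gr.Blocks() as demo:` block from the diff.
    def stream_bot(chat_history):
        user_input = chat_history[-1][0]
        # chat() yields (partial_text, stats); stream both into the UI
        for partial_text, stats in chat(user_input, chat_history):
            chat_history[-1][1] = partial_text  # update the assistant turn in place
            yield chat_history, stats

    # Assumed event wiring: submit appends the user turn, then streams the reply.
    msg.submit(respond, [msg, chatbot], [msg, chatbot]).then(
        stream_bot, chatbot, [chatbot, stats_box]
    )
    clear.click(lambda: [], None, chatbot)

# Outside the Blocks context:
# demo.launch()
```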