everydaytok committed
Commit 17c0138 · verified · 1 Parent(s): 62df132

Update app.py

Files changed (1):
  1. app.py +49 -55
app.py CHANGED
@@ -1,14 +1,15 @@
import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ from threading import Thread
import torch
import time
import psutil

- # 1.5B is the "sweet spot" for speed vs intelligence on CPU
+ # 1.5B is perfect for CPU stability
model_id = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF"
filename = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"

- print("Initializing model...")
+ print("Loading stable model...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    gguf_file=filename,
@@ -17,74 +18,67 @@ model = AutoModelForCausalLM.from_pretrained(
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

- SYSTEM_PROMPT = "You are a math assistant. Think in <think> tags, then output JSON."
-
def get_stats():
    vm = psutil.virtual_memory()
-     return f"RAM Usage: {vm.percent}% ({vm.used / 1024**3:.1f}GB / {vm.total / 1024**3:.1f}GB)"
+     return f"RAM: {vm.percent}% | {vm.used / 1024**3:.1f}GB / 16GB"

def chat(message, history):
-     # Constructing the prompt
-     prompt = f"system\n{SYSTEM_PROMPT}\nuser\n{message}\nassistant\n<think>\n"
+     # Proper DeepSeek-R1/Qwen prompt format
+     prompt = f"<|begin_of_sentence|><|User|>{message}<|Assistant|><think>\n"
+
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
-     input_ids = inputs.input_ids
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

+     # Generation kwargs
+     generation_kwargs = dict(
+         inputs,
+         streamer=streamer,
+         max_new_tokens=1024,
+         do_sample=False, # Keeps it fast and logical for math
+         pad_token_id=tokenizer.eos_token_id
+     )
+
+     # Start generation in a background thread
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+
    start_time = time.time()
-     generated_tokens = 0
-     full_response = ""
+     generated_text = ""
+     token_count = 0

-     # Simple streaming generation logic
-     # We use a loop to yield updates to the UI
-     for _ in range(512): # max_new_tokens
-         output = model.generate(
-             input_ids,
-             max_new_tokens=1,
-             do_sample=False,
-             pad_token_id=tokenizer.eos_token_id
-         )
-
-         new_token_id = output[0][-1]
-         if new_token_id == tokenizer.eos_token_id:
-             break
-
-         new_token = tokenizer.decode(new_token_id)
-         full_response += new_token
-         generated_tokens += 1
-
-         # Calculate stats
-         elapsed_time = time.time() - start_time
-         tps = generated_tokens / elapsed_time if elapsed_time > 0 else 0
-         stats = f"⏱️ {elapsed_time:.1f}s | ⚡ {tps:.2f} tokens/s | {get_stats()}"
-
-         # Prepare for next iteration
-         input_ids = torch.cat([input_ids, output[:, -1:]], dim=-1)
-
-         yield full_response, stats
+     # Yield from the streamer for real-time UI updates
+     for new_text in streamer:
+         generated_text += new_text
+         token_count += 1
+         elapsed = time.time() - start_time
+         tps = token_count / elapsed if elapsed > 0 else 0
+         stats = f"⏱️ {elapsed:.1f}s | ⚡ {tps:.2f} t/s | {get_stats()}"
+         yield generated_text, stats

- with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# 🧠 DeepSeek-R1 CPU Dashboard")
+ # Custom Gradio Blocks for better UI
+ with gr.Blocks(theme=gr.themes.Default()) as demo:
+     gr.Markdown("# 🚀 DeepSeek-R1 CPU Optimizer")

    with gr.Row():
        with gr.Column(scale=4):
-             chatbot = gr.Chatbot(label="Reasoning & JSON Output")
-             msg = gr.Textbox(label="Ask a Math Question", placeholder="e.g., What is the square root of 144 plus 5?")
+             chatbot = gr.Chatbot(label="Response (Thinking + JSON)")
+             msg = gr.Textbox(label="Input", placeholder="Enter your math problem...")
        with gr.Column(scale=1):
-             stats_output = gr.Markdown("### System Stats\nWaiting for input...")
-             clear = gr.Button("Clear Chat")
+             stats_box = gr.Markdown("### Live Stats\nWaiting...")
+             clear = gr.Button("Clear")

-     def user_input(user_message, history):
-         return "", history + [[user_message, None]]
+     def respond(message, chat_history):
+         chat_history.append((message, ""))
+         return "", chat_history

-     def bot_response(history):
-         user_message = history[-1][0]
-         history[-1][1] = ""
-         # stream the response
-         for chunk, stats in chat(user_message, history[:-1]):
-             history[-1][1] = chunk
-             yield history, stats
+     def stream_bot(chat_history):
+         user_input = chat_history[-1][0]
+         for content, stats in chat(user_input, chat_history[:-1]):
+             chat_history[-1][1] = content
+             yield chat_history, stats

-     msg.submit(user_input, [msg, chatbot], [msg, chatbot], queue=False).then(
-         bot_response, chatbot, [chatbot, stats_output]
+     msg.submit(respond, [msg, chatbot], [msg, chatbot]).then(
+         stream_bot, chatbot, [chatbot, stats_box]
    )
    clear.click(lambda: None, None, chatbot, queue=False)
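The heart of this commit is replacing the one-token-at-a-time `model.generate` loop, which re-encoded the entire growing sequence on every call, with transformers' `TextIteratorStreamer`: `generate` runs once on a background thread while the caller iterates over decoded text as it arrives. A minimal standalone sketch of that pattern, using a placeholder model id rather than this Space's GGUF model:

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "sshleifer/tiny-gpt2"  # placeholder model for the sketch, not the Space's
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("2 + 2 =", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until completion, so it runs on a worker thread
# while this thread consumes decoded chunks as they become available.
thread = Thread(target=model.generate,
                kwargs=dict(inputs, streamer=streamer, max_new_tokens=32))
thread.start()

for chunk in streamer:  # each chunk is a decoded text fragment, not exactly one token
    print(chunk, end="", flush=True)
thread.join()

Because the streamer yields text fragments rather than single tokens, `token_count += 1` in the new `chat()` is an approximation, so the reported t/s slightly understates the true token rate. The prompt framing is also hand-written; assuming the tokenizer in the GGUF repo carries DeepSeek-R1's chat template (the distill repos generally do), the same prompt could be derived from the template instead. A sketch of that alternative, not what this commit does:

messages = [{"role": "user", "content": "What is 12 * 12?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)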