everydaytok committed on
Commit
1a324c0
·
verified ·
1 Parent(s): 7742b92

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -53
app.py CHANGED
@@ -1,61 +1,41 @@
1
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import json
import re

# 1. Download the specific GGUF file
# Using the 7B Distill version for high math/JSON capability
# hf_hub_download caches the weights locally and returns the file path.
model_path = hf_hub_download(
    repo_id="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF",
    filename="DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"
)

# 2. Initialize the model
# n_ctx: 2048 (Higher uses more RAM, keep it balanced for a basic instance)
# n_threads: 2 (Matches the 2 vCPUs on HF Basic instances)
# verbose=False keeps llama.cpp from spamming the Space logs.
# NOTE(review): `json` and `re` are imported but unused in this version —
# confirm nothing else relies on them before removing.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False
)
 
23
 
24
# System-turn instructions: ask the model to reason inside <think> tags and
# then emit the final answer strictly as a JSON object in the shown format.
SYSTEM_PROMPT = (
    "You are a precise assistant. First, think step-by-step inside <think> tags. "
    "Then, provide the final response strictly as a JSON object. "
    "JSON format: {\"solution\": \"...\", \"result\": 123}"
)
29
 
30
def generate_response(message, history):
    """Stream a reply from the local llama.cpp model.

    Builds a flat text transcript from the chat history plus the new user
    message, runs streaming inference, and yields the accumulated response
    after every chunk so the Gradio chat UI can render it incrementally.
    """
    # Assemble the transcript as segments, then join once.
    segments = [f"<|begin_of_sentence|>system\n{SYSTEM_PROMPT}\n"]
    for past_user, past_assistant in history:
        segments.append(f"user\n{past_user}\nassistant\n{past_assistant}\n")
    segments.append(f"user\n{message}\nassistant\n<think>\n")

    # Streaming inference; the stop strings keep the model from writing the
    # next "user"/"system" turn itself.
    stream = llm(
        "".join(segments),
        max_tokens=1024,
        stop=["user\n", "system\n"],
        echo=False,
        stream=True
    )

    accumulated = ""
    for piece in stream:
        accumulated += piece['choices'][0]['text']
        yield accumulated
51
-
52
# 3. Build the chat UI around the streaming generator and launch it.
demo = gr.ChatInterface(
    fn=generate_response,
    title="DeepSeek-R1 CPU Server",
    description="Running locally on CPU. 7B Distilled model optimized for Math and JSON.",
    examples=[
        "Calculate the compound interest for $1000 at 5% for 3 years.",
        "Solve 2x + 5 = 15",
    ],
)

if __name__ == "__main__":
    # Listen on all interfaces on the standard HF Spaces port.
    demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# This model is great for Math/JSON and fits in your RAM
model_id = "unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF"
filename = "DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"

print("Loading model... this might take a minute on a basic instance.")
# Loading via transformers native GGUF support
# NOTE(review): transformers' GGUF path needs the `gguf` package installed —
# confirm it is in the Space's requirements.txt.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    gguf_file=filename,
    torch_dtype=torch.float32,  # CPU needs float32 or bfloat16
    device_map="cpu"
)
# NOTE(review): for a GGUF-only repo the tokenizer may also need
# gguf_file=filename to load — verify this call succeeds on the Space.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# System-turn instructions: reason in <think> tags, then answer in JSON.
SYSTEM_PROMPT = (
    "You are a math assistant. Think step-by-step in <think> tags, "
    "then output valid JSON: {\"reasoning\": \"...\", \"answer\": \"...\"}"
)
23
 
24
def build_prompt(message, history, system_prompt):
    """Assemble the flat text prompt for the model.

    Args:
        message: the latest user message (str).
        history: prior turns; Gradio passes either (user, assistant) tuples
            or {"role": ..., "content": ...} dicts depending on version —
            both are handled.
        system_prompt: the system-turn instruction text.

    Returns:
        The full transcript string, ending with an open assistant turn and
        an opening <think> tag so the model continues its reasoning.
    """
    parts = [f"system\n{system_prompt}\n"]
    for turn in history:
        if isinstance(turn, dict):
            parts.append(f"{turn.get('role', 'user')}\n{turn.get('content', '')}\n")
        else:
            user_msg, assistant_msg = turn
            parts.append(f"user\n{user_msg}\nassistant\n{assistant_msg}\n")
    parts.append(f"user\n{message}\nassistant\n<think>\n")
    return "".join(parts)

def chat(message, history):
    """Generate one reply for the Gradio ChatInterface.

    BUG FIX: the previous version ignored `history`, so the model lost all
    conversational context on every turn. The history is now folded into
    the prompt.
    """
    prompt = build_prompt(message, history, SYSTEM_PROMPT)
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    # Generate on CPU; pad with EOS to silence the warning for models that
    # define no pad token.
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        pad_token_id=tokenizer.eos_token_id
    )

    # decode() returns prompt + completion; keep only the text after the
    # final "assistant\n" marker (the new reply).
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("assistant\n")[-1]
39
 
40
# Wire the chat function into a minimal Gradio chat UI and start the server.
demo = gr.ChatInterface(
    fn=chat,
    title="DeepSeek-R1 CPU",
)
demo.launch()