everydaytok committed on
Commit
1a324c0
·
verified ·
1 Parent(s): 7742b92

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -53
app.py CHANGED
@@ -1,61 +1,41 @@
1
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import json
import re

# 1. Download the specific GGUF file
# Using the 7B Distill version for high math/JSON capability
# hf_hub_download caches the weights locally and returns the file path.
model_path = hf_hub_download(
    repo_id="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF",
    filename="DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"
)

# 2. Initialize the model
# n_ctx: 2048 (Higher uses more RAM, keep it balanced for a basic instance)
# n_threads: 2 (Matches the 2 vCPUs on HF Basic instances)
# verbose=False keeps llama.cpp from spamming the Space logs.
# NOTE(review): `json` and `re` are imported but unused in this version —
# confirm nothing else relies on them before removing.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False
)
 
23
 
24
# System-turn instructions: ask the model to reason inside <think> tags and
# then emit the final answer strictly as a JSON object in the shown format.
SYSTEM_PROMPT = (
    "You are a precise assistant. First, think step-by-step inside <think> tags. "
    "Then, provide the final response strictly as a JSON object. "
    "JSON format: {\"solution\": \"...\", \"result\": 123}"
)
29
 
30
def generate_response(message, history):
    """Stream a reply from the local llama.cpp model.

    Builds a flat text transcript from the chat history plus the new user
    message, runs streaming inference, and yields the accumulated response
    after every chunk so the Gradio chat UI can render it incrementally.
    """
    # Assemble the transcript as segments, then join once.
    segments = [f"<|begin_of_sentence|>system\n{SYSTEM_PROMPT}\n"]
    for past_user, past_assistant in history:
        segments.append(f"user\n{past_user}\nassistant\n{past_assistant}\n")
    segments.append(f"user\n{message}\nassistant\n<think>\n")

    # Streaming inference; the stop strings keep the model from writing the
    # next "user"/"system" turn itself.
    stream = llm(
        "".join(segments),
        max_tokens=1024,
        stop=["user\n", "system\n"],
        echo=False,
        stream=True
    )

    accumulated = ""
    for piece in stream:
        accumulated += piece['choices'][0]['text']
        yield accumulated
51
-
52
# 3. Build the chat UI around the streaming generator and launch it.
demo = gr.ChatInterface(
    fn=generate_response,
    title="DeepSeek-R1 CPU Server",
    description="Running locally on CPU. 7B Distilled model optimized for Math and JSON.",
    examples=[
        "Calculate the compound interest for $1000 at 5% for 3 years.",
        "Solve 2x + 5 = 15",
    ],
)

if __name__ == "__main__":
    # Listen on all interfaces on the standard HF Spaces port.
    demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# This model is great for Math/JSON and fits in your RAM
model_id = "unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF"
filename = "DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"

print("Loading model... this might take a minute on a basic instance.")
# Loading via transformers native GGUF support
# NOTE(review): transformers' GGUF path needs the `gguf` package installed —
# confirm it is in the Space's requirements.txt.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    gguf_file=filename,
    torch_dtype=torch.float32,  # CPU needs float32 or bfloat16
    device_map="cpu"
)
# NOTE(review): for a GGUF-only repo the tokenizer may also need
# gguf_file=filename to load — verify this call succeeds on the Space.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# System-turn instructions: reason in <think> tags, then answer in JSON.
SYSTEM_PROMPT = (
    "You are a math assistant. Think step-by-step in <think> tags, "
    "then output valid JSON: {\"reasoning\": \"...\", \"answer\": \"...\"}"
)
23
 
24
def build_prompt(message, history, system_prompt):
    """Assemble the flat text prompt for the model.

    Args:
        message: the latest user message (str).
        history: prior turns; Gradio passes either (user, assistant) tuples
            or {"role": ..., "content": ...} dicts depending on version —
            both are handled.
        system_prompt: the system-turn instruction text.

    Returns:
        The full transcript string, ending with an open assistant turn and
        an opening <think> tag so the model continues its reasoning.
    """
    parts = [f"system\n{system_prompt}\n"]
    for turn in history:
        if isinstance(turn, dict):
            parts.append(f"{turn.get('role', 'user')}\n{turn.get('content', '')}\n")
        else:
            user_msg, assistant_msg = turn
            parts.append(f"user\n{user_msg}\nassistant\n{assistant_msg}\n")
    parts.append(f"user\n{message}\nassistant\n<think>\n")
    return "".join(parts)

def chat(message, history):
    """Generate one reply for the Gradio ChatInterface.

    BUG FIX: the previous version ignored `history`, so the model lost all
    conversational context on every turn. The history is now folded into
    the prompt.
    """
    prompt = build_prompt(message, history, SYSTEM_PROMPT)
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    # Generate on CPU; pad with EOS to silence the warning for models that
    # define no pad token.
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        pad_token_id=tokenizer.eos_token_id
    )

    # decode() returns prompt + completion; keep only the text after the
    # final "assistant\n" marker (the new reply).
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("assistant\n")[-1]
39
 
40
# Wire the chat function into a minimal Gradio chat UI and start the server.
demo = gr.ChatInterface(
    fn=chat,
    title="DeepSeek-R1 CPU",
)
demo.launch()