# Small_llm / app1.py
# everydaytok's picture
# Rename app.py to app1.py
# c039633 verified
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Model selection: the Hugging Face repo that hosts the GGUF builds, and the
# specific Q4_K_M file to load from it. Chosen for Math/JSON tasks.
# NOTE(review): transformers dequantizes GGUF weights when loading, so the
# in-memory float32 model is far larger than the .gguf file on disk — confirm
# the target instance really has enough RAM.
model_id = "unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF"
filename = "DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"

print("Loading model... this might take a minute on a basic instance.")

# Loading via transformers native GGUF support
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    gguf_file=filename,
    torch_dtype=torch.float32,  # CPU needs float32 or bfloat16
    device_map="cpu",
)

# Pass gguf_file here as well so the tokenizer is built from the same GGUF
# checkpoint as the model, instead of depending on separate tokenizer files
# being present in the repo.
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
# Instruction text placed in the "system" section of every prompt: asks for
# step-by-step reasoning inside <think> tags followed by a JSON object with
# "reasoning" and "answer" keys.
SYSTEM_PROMPT = " ".join(
    [
        "You are a math assistant. Think step-by-step in <think> tags,",
        'then output valid JSON: {"reasoning": "...", "answer": "..."}',
    ]
)
def chat(message, history):
    """Generate the assistant's reply to a single user message.

    Args:
        message: Text of the latest user message.
        history: Earlier turns passed in by the chat UI. Not read by this
            function — every message is answered without prior context.

    Returns:
        The text produced by the model, prefixed with the ``<think>`` opener
        that the prompt ends with.
    """
    # Prepare prompt. The trailing "<think>\n" makes the model start its
    # answer inside the reasoning section.
    # NOTE(review): this is a hand-written layout, not the tokenizer's chat
    # template — confirm it matches what the model expects.
    prompt = f"system\n{SYSTEM_PROMPT}\nuser\n{message}\nassistant\n<think>\n"
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    prompt_length = inputs["input_ids"].shape[-1]

    # Generate
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The output sequence starts with the prompt tokens. Drop them by position
    # rather than splitting the decoded text on "assistant\n": a text split
    # silently truncates the reply whenever the model itself emits that
    # substring.
    new_tokens = outputs[0][prompt_length:]
    completion = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Keep the "<think>\n" opener in the returned text, as before.
    return "<think>\n" + completion
# Wrap `chat` in Gradio's chat UI and start serving it as soon as the script
# runs.
demo = gr.ChatInterface(
    title="DeepSeek-R1 CPU",
    fn=chat,
)
demo.launch()