# app.py — DeepSeek-R1 (Qwen-1.5B distill, GGUF Q4_K_M) CPU chat dashboard for a Hugging Face Space.
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
import torch
import time
import psutil
import os
# CONFIGURATION
# Weights come from the GGUF repo, but the tokenizer from the ORIGINAL model
# repo — loading the tokenizer from the GGUF repo was the bug being fixed here.
MODEL_ID = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF"      # GGUF weight repo
GGUF_FILE = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"      # 4-bit quantized file inside that repo
TOKENIZER_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # The fix is here

# Module-level state populated by load_model(); `model`/`tokenizer` stay None
# on load failure, and `load_status` is shown in the UI metrics panel.
model = None
tokenizer = None
load_status = "🔄 Initializing..."
def load_model():
    """Load the tokenizer and GGUF-quantized weights into the module globals.

    Never raises: any failure is caught, recorded in `load_status`, and
    printed, so the Gradio UI can surface the error instead of crashing.
    """
    global model, tokenizer, load_status
    try:
        print(f"Loading tokenizer from {TOKENIZER_ID}...")
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)
        print(f"Loading GGUF weights from {MODEL_ID}...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            gguf_file=GGUF_FILE,
            torch_dtype=torch.float32,  # CPU inference: dequantize to fp32
            device_map="cpu"
        )
        load_status = "✅ Model Loaded Successfully"
    except Exception as e:
        # Top-level boundary catch: report the failure via the status string.
        load_status = f"❌ Error: {str(e)}"
        print(load_status)


# NOTE(review): this call is synchronous — module import blocks here until the
# model has loaded (or failed); it does NOT run in the background.
load_model()
def get_stats():
    """Return a one-line RAM usage summary for the live-metrics panel.

    Returns:
        str: e.g. "RAM: 42.0% | 6.7GB / 16GB".
    """
    vm = psutil.virtual_memory()
    # Report the machine's actual total RAM (vm.total) instead of the previous
    # hard-coded "16GB", which was only correct on one specific host.
    return f"RAM: {vm.percent}% | {vm.used / 1024**3:.1f}GB / {vm.total / 1024**3:.0f}GB"
def chat(message, history):
    """Stream a greedy-decoded reply to `message` from the CPU model.

    Yields (generated_text_so_far, stats_line) tuples for the Gradio
    streaming callback. `history` is accepted to match the callback
    signature but is not folded into the prompt (single-turn prompting).
    """
    if model is None:
        yield "Model is still loading or failed to load. Check status.", load_status
        return
    # Build the prompt with the tokenizer's own chat template instead of the
    # hand-written "<|begin_of_sentence|><|User|>..." string: DeepSeek-R1's
    # real special tokens use fullwidth bars and low-line characters
    # (e.g. <｜User｜>), so the ASCII-pipe variant was tokenized as ordinary
    # text rather than as control tokens.
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": message}],
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cpu")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=False,  # deterministic greedy decoding
        pad_token_id=tokenizer.eos_token_id,
    )
    # model.generate() blocks, so run it in a worker thread and consume the
    # streamer from this generator.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    start_time = time.time()
    generated_text = ""
    chunk_count = 0
    for new_text in streamer:
        generated_text += new_text
        # NOTE: counts streamer text chunks, not tokens — the t/s figure is
        # an approximation (the streamer buffers up to word boundaries).
        chunk_count += 1
        elapsed = time.time() - start_time
        tps = chunk_count / elapsed if elapsed > 0 else 0
        stats = f"⏱️ {elapsed:.1f}s | ⚡ {tps:.2f} t/s | {get_stats()} | {load_status}"
        yield generated_text, stats
# UI layout and event wiring. Statement order inside the Blocks context
# defines the on-screen layout, so it must not be reordered.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 DeepSeek-R1 CPU Dashboard (v2.0)")
    with gr.Row():
        with gr.Column(scale=4):
            # Main area: conversation view plus the input textbox.
            chatbot = gr.Chatbot(label="Response Console", height=500)
            msg = gr.Textbox(label="Math/JSON Prompt", placeholder="Type here and press Enter...")
        with gr.Column(scale=1):
            # Sidebar: live RAM/throughput metrics and the clear button.
            stats_box = gr.Markdown(f"### Live Metrics\n{get_stats()}\n{load_status}")
            gr.Markdown("---")
            gr.Markdown("**Note:** First run may take 60s to load weights into RAM.")
            clear = gr.Button("Clear Chat")

    def respond(message, chat_history):
        # Submit-chain step 1: clear the textbox and append the user turn
        # with an empty assistant slot for the streamer to fill in.
        return "", chat_history + [[message, ""]]

    def stream_bot(chat_history):
        # Submit-chain step 2: stream the model reply into the last history
        # entry, updating the metrics panel on every yielded chunk.
        user_input = chat_history[-1][0]
        for content, stats in chat(user_input, chat_history[:-1]):
            chat_history[-1][1] = content
            yield chat_history, stats

    # Enter in the textbox: first record the user turn, then stream the reply.
    msg.submit(respond, [msg, chatbot], [msg, chatbot]).then(
        stream_bot, chatbot, [chatbot, stats_box]
    )
    clear.click(lambda: None, None, chatbot, queue=False)
if __name__ == "__main__":
    # queue() is required so the generator callbacks above can stream
    # incremental updates to the client.
    # (Removed a stray trailing "|" scraping artifact that made this line
    # a syntax error.)
    demo.queue().launch()