import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
import torch
import time
import psutil

# CONFIGURATION
# We load the quantized weights from the GGUF repo, but the tokenizer from the
# ORIGINAL repo, since the GGUF repo does not ship usable tokenizer files.
MODEL_ID = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF"
GGUF_FILE = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
TOKENIZER_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # the fix: tokenizer from the original repo

# Global variables for model and tokenizer
model = None
tokenizer = None
load_status = "🔄 Initializing..."


def load_model():
    global model, tokenizer, load_status
    try:
        print(f"Loading tokenizer from {TOKENIZER_ID}...")
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)

        print(f"Loading GGUF weights from {MODEL_ID}...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            gguf_file=GGUF_FILE,
            torch_dtype=torch.float32,
            device_map="cpu",
        )
        load_status = "✅ Model Loaded Successfully"
    except Exception as e:
        load_status = f"❌ Error: {str(e)}"
        print(load_status)


# Load the model at startup (blocking: the UI launches once loading finishes)
load_model()


def get_stats():
    vm = psutil.virtual_memory()
    return f"RAM: {vm.percent}% | {vm.used / 1024**3:.1f}GB / {vm.total / 1024**3:.0f}GB"


def chat(message, history):
    if model is None:
        yield "Model is still loading or failed to load. Check status.", load_status
        return

    # Build the prompt with the tokenizer's chat template, which renders
    # DeepSeek-R1's special user/assistant tokens correctly
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": message}],
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Run generation on a worker thread so we can stream output as it arrives
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    start_time = time.time()
    generated_text = ""
    token_count = 0

    for new_text in streamer:
        generated_text += new_text
        token_count += 1  # counts streamed chunks, a close proxy for tokens
        elapsed = time.time() - start_time
        tps = token_count / elapsed if elapsed > 0 else 0
        stats = f"⏱️ {elapsed:.1f}s | ⚡ {tps:.2f} t/s | {get_stats()} | {load_status}"
        yield generated_text, stats


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 DeepSeek-R1 CPU Dashboard (v2.0)")
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Response Console", height=500)
            msg = gr.Textbox(label="Math/JSON Prompt", placeholder="Type here and press Enter...")
        with gr.Column(scale=1):
            stats_box = gr.Markdown(f"### Live Metrics\n{get_stats()}\n{load_status}")
            gr.Markdown("---")
            gr.Markdown("**Note:** First run may take ~60s to load weights into RAM.")
            clear = gr.Button("Clear Chat")

    def respond(message, chat_history):
        # Append the user turn with an empty assistant slot, then clear the textbox
        return "", chat_history + [[message, ""]]

    def stream_bot(chat_history):
        user_input = chat_history[-1][0]
        for content, stats in chat(user_input, chat_history[:-1]):
            chat_history[-1][1] = content
            yield chat_history, stats

    msg.submit(respond, [msg, chatbot], [msg, chatbot]).then(
        stream_bot, chatbot, [chatbot, stats_box]
    )
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.queue().launch()
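
# ---------------------------------------------------------------------------
# Setup note (a sketch of assumed dependencies, not a pinned environment):
# loading GGUF checkpoints through transformers requires the separate
# `gguf` package on top of the usual stack.
#
#   pip install gradio transformers torch psutil gguf
#
# Then run the script directly, e.g. `python app.py` (the filename is an
# assumption; use whatever name this file is saved under).
# ---------------------------------------------------------------------------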