import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
import torch
import time
import psutil
import os
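
# NOTE: reading GGUF checkpoints through transformers additionally requires the
# `gguf` package (pip install gguf); from_pretrained errors out without it.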

# CONFIGURATION
# Weights come from the GGUF repo, but the tokenizer must come from the ORIGINAL
# model repo: the GGUF repo does not ship the tokenizer files transformers expects.
MODEL_ID = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF"
GGUF_FILE = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
TOKENIZER_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
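# Q4_K_M is a 4-bit "k-quant" (medium) GGUF variant -- roughly a 1 GB file for a
# 1.5B-parameter model, small enough to fit comfortably in CPU RAM.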

# Global variables for model and tokenizer
model = None
tokenizer = None
load_status = "🔄 Initializing..."

def load_model():
    global model, tokenizer, load_status
    try:
        print(f"Loading tokenizer from {TOKENIZER_ID}...")
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)
        print(f"Loading GGUF weights from {MODEL_ID}...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            gguf_file=GGUF_FILE,
            torch_dtype=torch.float32,  # GGUF weights are dequantized to this dtype on load
            device_map="cpu",
        )
        load_status = "✅ Model Loaded Successfully"
    except Exception as e:
        load_status = f"❌ Error: {str(e)}"
        print(load_status)

# Load the model at startup. This call blocks until the weights are in RAM,
# so the Gradio UI below is only built once loading has finished.
load_model()
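# For a truly non-blocking startup, a minimal alternative sketch would be to run
# the loader in a daemon thread (chat() below already guards on `model is None`):
#   Thread(target=load_model, daemon=True).start()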

def get_stats():
    vm = psutil.virtual_memory()
    # Report the machine's actual total RAM instead of assuming a 16GB host
    return f"RAM: {vm.percent}% | {vm.used / 1024**3:.1f}GB / {vm.total / 1024**3:.0f}GB"

def chat(message, history):
    if model is None:
        yield "Model is still loading or failed to load. Check status.", load_status
        return

    # DeepSeek-R1 prompt format. Note the BOS token is spelled with "▁" (U+2581),
    # not "_"; the trailing <think> tag primes the model to emit its reasoning.
    prompt = f"<|begin▁of▁sentence|><|User|>{message}<|Assistant|><think>\n"
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=False,  # greedy decoding, so output is deterministic
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer below.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    start_time = time.time()
    generated_text = ""
    token_count = 0
    for new_text in streamer:
        generated_text += new_text
        token_count += 1  # one streamed chunk is roughly one token, so t/s is an approximation
        elapsed = time.time() - start_time
        tps = token_count / elapsed if elapsed > 0 else 0
        stats = f"⏱️ {elapsed:.1f}s | ⚡ {tps:.2f} t/s | {get_stats()} | {load_status}"
        yield generated_text, stats

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 DeepSeek-R1 CPU Dashboard (v2.0)")
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Response Console", height=500)
            msg = gr.Textbox(label="Math/JSON Prompt", placeholder="Type here and press Enter...")
        with gr.Column(scale=1):
            stats_box = gr.Markdown(f"### Live Metrics\n{get_stats()}\n{load_status}")
            gr.Markdown("---")
            gr.Markdown("**Note:** First run may take 60s to load weights into RAM.")
            clear = gr.Button("Clear Chat")

    def respond(message, chat_history):
        return "", chat_history + [[message, ""]]

    def stream_bot(chat_history):
        user_input = chat_history[-1][0]
        for content, stats in chat(user_input, chat_history[:-1]):
            chat_history[-1][1] = content
            yield chat_history, stats

    msg.submit(respond, [msg, chatbot], [msg, chatbot]).then(
        stream_bot, chatbot, [chatbot, stats_box]
    )
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.queue().launch()