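"""Gradio dashboard for running DeepSeek-R1-Distill-Qwen-1.5B (GGUF Q4_K_M)
on CPU, with streaming output and live RAM/throughput metrics."""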
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
import torch
import time
import psutil
# CONFIGURATION
# We load weights from the GGUF repo, but tokenizer from the ORIGINAL repo
MODEL_ID = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF"
GGUF_FILE = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
TOKENIZER_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # tokenizer must come from the original model repo, not the GGUF repo
# Global variables for model and tokenizer
model = None
tokenizer = None
load_status = "🔄 Initializing..."
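# Populated by load_model(); chat() treats `model is None` as "still loading".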
def load_model():
    global model, tokenizer, load_status
    try:
        print(f"Loading tokenizer from {TOKENIZER_ID}...")
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)
        print(f"Loading GGUF weights from {MODEL_ID}...")
        # transformers dequantizes GGUF checkpoints on load, so this runs as
        # a regular float32 torch model on CPU.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            gguf_file=GGUF_FILE,
            torch_dtype=torch.float32,
            device_map="cpu"
        )
        load_status = "✅ Model Loaded Successfully"
    except Exception as e:
        load_status = f"❌ Error: {str(e)}"
        print(load_status)
# Kick off loading on a background thread so the UI starts immediately.
Thread(target=load_model, daemon=True).start()
def get_stats():
    vm = psutil.virtual_memory()
    return f"RAM: {vm.percent}% | {vm.used / 1024**3:.1f}GB / {vm.total / 1024**3:.0f}GB"
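# Streaming pattern: model.generate() runs on a worker thread and pushes
# decoded text into TextIteratorStreamer; this generator drains the stream
# so Gradio can repaint the chat box token by token.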
def chat(message, history):
    if model is None:
        yield "Model is still loading or failed to load. Check status.", load_status
        return
    # Build the DeepSeek-R1 prompt with the tokenizer's chat template instead
    # of hand-writing the special tokens (the R1 markers use non-ASCII
    # characters that are easy to mistype). Note this is single-turn: earlier
    # history is not re-fed to the model.
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": message}],
        add_generation_prompt=True,
        return_tensors="pt"
    )
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),  # single unpadded sequence
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    start_time = time.time()
    generated_text = ""
    token_count = 0  # counts streamer chunks, so t/s is approximate
    for new_text in streamer:
        generated_text += new_text
        token_count += 1
        elapsed = time.time() - start_time
        tps = token_count / elapsed if elapsed > 0 else 0
        stats = f"⏱️ {elapsed:.1f}s | ⚡ {tps:.2f} t/s | {get_stats()} | {load_status}"
        yield generated_text, stats
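# USER INTERFACE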
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 DeepSeek-R1 CPU Dashboard (v2.0)")
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Response Console", height=500)
            msg = gr.Textbox(label="Math/JSON Prompt", placeholder="Type here and press Enter...")
        with gr.Column(scale=1):
            stats_box = gr.Markdown(f"### Live Metrics\n{get_stats()}\n{load_status}")
            gr.Markdown("---")
            gr.Markdown("**Note:** First run may take 60s to load weights into RAM.")
            clear = gr.Button("Clear Chat")

    def respond(message, chat_history):
        # Append the user message to the history and clear the textbox.
        return "", chat_history + [[message, ""]]

    def stream_bot(chat_history):
        # Stream the assistant reply into the last history entry.
        user_input = chat_history[-1][0]
        for content, stats in chat(user_input, chat_history[:-1]):
            chat_history[-1][1] = content
            yield chat_history, stats

    msg.submit(respond, [msg, chatbot], [msg, chatbot]).then(
        stream_bot, chatbot, [chatbot, stats_box]
    )
    clear.click(lambda: None, None, chatbot, queue=False)
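# queue() enables the event queue that the generator (streaming) callbacks
# above rely on.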
if __name__ == "__main__":
    demo.queue().launch()