import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
import torch
import time
import psutil
import os
# --- FastAPI Imports ---
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
# CONFIGURATION
# Repo holding the GGUF weights, the weight file inside that repo, and the
# repo the tokenizer is loaded from.
MODEL_ID: str = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF"
GGUF_FILE: str = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
TOKENIZER_ID: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Shared state filled in by load_model(); both handles start out as None.
model = tokenizer = None
load_status: str = "π Initializing..."
def load_model():
    """Load the tokenizer and the GGUF model into the module-level globals.

    Progress and the final outcome are printed and recorded in
    ``load_status``. Any exception raised while loading is caught and
    reported through ``load_status`` instead of propagating, so ``model``
    simply stays unset/``None`` when loading fails.
    """
    global model, tokenizer, load_status
    try:
        print(f"Loading tokenizer from {TOKENIZER_ID}...")
        # The tokenizer is assigned before the model: readers only test
        # `model is None`, so the tokenizer must be ready by the time the
        # model becomes visible.
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)
        print(f"Loading GGUF weights from {MODEL_ID}...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            gguf_file=GGUF_FILE,
            torch_dtype=torch.float32,
            device_map="cpu",
        )
    except Exception as e:
        # Top-level boundary of the loader: report the failure rather than
        # letting it die silently with the thread.
        # NOTE(review): the status icons were mis-encoded in this copy of the
        # file; cross mark / check mark are a best-effort restoration - confirm.
        load_status = f"\u274c Error: {e}"
    else:
        load_status = "\u2705 Model Loaded Successfully"
    print(load_status)
# Start loading in the background; daemon=True means the loader thread
# will not keep the interpreter alive on shutdown.
_loader_thread = Thread(target=load_model, daemon=True)
_loader_thread.start()
def get_stats():
    """Return a one-line RAM usage summary: percent used, used GB / total GB."""
    mem = psutil.virtual_memory()
    bytes_per_gb = 1024**3
    used_gb = mem.used / bytes_per_gb
    total_gb = mem.total / bytes_per_gb
    return f"RAM: {mem.percent}% | {used_gb:.1f}GB / {total_gb:.1f}GB"
# ─────────────────────────────────────────────────────────────
# GRADIO CHAT GENERATOR (For the UI)
# ─────────────────────────────────────────────────────────────
def chat(message, history):
    """Stream a model reply to ``message`` for the Gradio UI.

    Generation runs on a worker thread while this generator drains the
    streamer, yielding after every streamed chunk. If the model is not
    loaded, a single notice is yielded together with the current
    ``load_status`` and the generator ends.

    Args:
        message: The user's prompt text.
        history: Earlier chat turns. Accepted but not used; every prompt is
            built from ``message`` alone.

    Yields:
        ``(text_so_far, stats)`` tuples, where ``stats`` reports elapsed
        seconds, throughput, RAM usage and the load status.
    """
    if model is None:
        yield "Model is still loading or failed to load. Check status.", load_status
        return

    # The chat markers use the full-width bar U+FF5C; it is written as an
    # escape so the markers survive re-encoding of this file.
    # NOTE(review): the BOS marker is spelled with ASCII underscores here;
    # confirm it matches the tokenizer's BOS token and that the tokenizer
    # does not already prepend one.
    prompt = (
        f"<\uff5cbegin_of_sentence\uff5c><\uff5cUser\uff5c>{message}"
        f"<\uff5cAssistant\uff5c><think>\n"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() blocks until it is done, so it runs on its own thread while
    # this generator consumes the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Monotonic clock: elapsed time must not jump if the wall clock is adjusted.
    start_time = time.perf_counter()
    generated_text = ""
    # Each streamed chunk is counted as one token, so t/s is an approximation.
    token_count = 0
    for new_text in streamer:
        generated_text += new_text
        token_count += 1
        elapsed = time.perf_counter() - start_time
        tps = token_count / elapsed if elapsed > 0 else 0
        stats = (
            f"\u23f1\ufe0f {elapsed:.1f}s | \u26a1 {tps:.2f} t/s | "
            f"{get_stats()} | {load_status}"
        )
        yield generated_text, stats

    # The streamer is exhausted, so generation is finishing; reap the worker.
    thread.join()
# NOTE(review): the nesting below is reconstructed from statement order
# (indentation was lost in this copy) - confirm against the deployed layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π DeepSeek-R1 CPU Dashboard + API")

    # Wide chat column on the left, narrow metrics column on the right.
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Response Console", height=500)
            msg = gr.Textbox(
                label="Math/JSON Prompt",
                placeholder="Type here and press Enter...",
            )
        with gr.Column(scale=1):
            # Initial text is computed once, when the UI is built; stream_bot
            # overwrites it while a reply is streaming.
            stats_box = gr.Markdown(f"### Live Metrics\n{get_stats()}\n{load_status}")
            gr.Markdown("---")
            clear = gr.Button("Clear Chat")

    def respond(message, chat_history):
        """Clear the textbox and append the new user turn with an empty reply."""
        pending_turn = [message, ""]
        return "", chat_history + [pending_turn]

    def stream_bot(chat_history):
        """Fill in the last turn's reply as chat() streams, yielding UI updates."""
        latest_turn = chat_history[-1]
        for partial_reply, metrics in chat(latest_turn[0], chat_history[:-1]):
            latest_turn[1] = partial_reply
            yield chat_history, metrics

    # Submitting first records the user turn, then streams the model's answer.
    submit_event = msg.submit(respond, [msg, chatbot], [msg, chatbot])
    submit_event.then(stream_bot, chatbot, [chatbot, stats_box])

    clear.click(lambda: None, None, chatbot, queue=False)
# ─────────────────────────────────────────────────────────────
# FASTAPI APPLICATION (The Bridge API)
# ─────────────────────────────────────────────────────────────
# FastAPI application that serves the JSON /chat endpoint; the Gradio UI is
# mounted onto this same app further down in the file.
app = FastAPI(title="DeepSeek API Bridge")
class ChatRequest(BaseModel):
    # Request body accepted by POST /chat.

    # Instruction to answer (required).
    message: str
    # Optional preamble; api_chat places it ahead of `message`.
    system: str = ""
@app.post("/chat")
def api_chat(req: ChatRequest):
    """
    This endpoint catches the JSON payload from the Perspective Engine
    and processes it through DeepSeek-R1 synchronously.

    The optional ``system`` text and the ``message`` are joined into a single
    user turn, decoded greedily for up to 1024 new tokens, and returned as
    ``{"response": <generated text>}``. Responds with HTTP 503 while the
    model is not loaded.
    """
    if model is None:
        raise HTTPException(status_code=503, detail="Model is still loading into RAM.")

    # Combine the Engine's structured JSON prompt with the actual instruction;
    # strip() removes the leading blank lines left when `system` is empty.
    combined_prompt = f"{req.system}\n\n{req.message}".strip()

    # Format as DeepSeek expects. The chat markers use the full-width bar
    # U+FF5C, written as an escape so they survive re-encoding of this file.
    # NOTE(review): the BOS marker is spelled with ASCII underscores here;
    # confirm it matches the tokenizer's BOS token and that the tokenizer
    # does not already prepend one.
    prompt = (
        f"<\uff5cbegin_of_sentence\uff5c><\uff5cUser\uff5c>{combined_prompt}"
        f"<\uff5cAssistant\uff5c><think>\n"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + completion; keep only the newly generated tail.
    input_length = inputs.input_ids.shape[1]
    generated_tokens = outputs[0][input_length:]
    response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    print("\n[API] Responded to Perspective Engine constraint query.")
    return {"response": response_text}
# Mount the Gradio UI onto the FastAPI app at the root path, so one server
# exposes both the UI ("/") and the JSON API ("/chat"). `app` is rebound to
# the value returned by mount_gradio_app.
app = gr.mount_gradio_app(app, demo, path="/")
if __name__ == "__main__":
    # One host/port pair feeds both the banner and the server so they cannot
    # drift apart. 0.0.0.0 binds on all interfaces.
    host, port = "0.0.0.0", 7860
    # NOTE(review): the leading character of the first banner line appears to
    # be a mis-encoded emoji; the original is unknown, so it is left as found.
    print(f"\nπ Starting DeepSeek Server on port {port}...")
    print(f" UI available at: http://{host}:{port}/")
    print(f" API available at: http://{host}:{port}/chat\n")
    uvicorn.run(app, host=host, port=port)