File size: 6,022 Bytes
385d930
17c0138
 
385d930
699bfa3
 
3dffc7e
385d930
f4f2601
 
 
 
 
3dffc7e
 
 
f4f2601
385d930
3dffc7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4f2601
3dffc7e
 
 
 
 
f4f2601
385d930
699bfa3
 
f4f2601
385d930
f4f2601
 
 
699bfa3
3dffc7e
 
 
 
17c0138
385d930
 
3dffc7e
17c0138
 
 
 
3dffc7e
17c0138
 
 
 
 
 
699bfa3
17c0138
 
385d930
17c0138
 
 
 
 
3dffc7e
17c0138
699bfa3
3dffc7e
f4f2601
385d930
699bfa3
 
3dffc7e
 
699bfa3
3dffc7e
 
 
699bfa3
17c0138
3dffc7e
699bfa3
17c0138
 
 
 
 
699bfa3
17c0138
 
699bfa3
 
385d930
f4f2601
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385d930
f4f2601
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
import torch
import time
import psutil
import os

# --- FastAPI Imports ---
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn

# CONFIGURATION
# MODEL_ID / GGUF_FILE are passed to AutoModelForCausalLM.from_pretrained as
# the model id and its `gguf_file` argument; TOKENIZER_ID is passed to
# AutoTokenizer.from_pretrained.
MODEL_ID = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF"
GGUF_FILE = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
TOKENIZER_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Global variables for model and tokenizer
# Both stay None until load_model() assigns them; the request handlers treat
# `model is None` as "not ready".
model = None
tokenizer = None
# Human-readable load state shown in the UI; overwritten by load_model().
load_status = "🔄 Initializing..."

def load_model():
    """Load the tokenizer and the GGUF model into the module-level globals.

    The tokenizer is loaded first, then the model weights (float32, placed on
    the CPU). ``load_status`` is updated to describe the outcome. Errors are
    caught and recorded in ``load_status`` instead of being raised, so a
    failed load leaves the process running with ``model`` unset.
    """
    global model, tokenizer, load_status
    try:
        print(f"Loading tokenizer from {TOKENIZER_ID}...")
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)

        print(f"Loading GGUF weights from {MODEL_ID}...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            gguf_file=GGUF_FILE,
            torch_dtype=torch.float32,
            device_map="cpu",
        )
        load_status = "✅ Model Loaded Successfully"
        print(load_status)
    except Exception as e:
        # Deliberate catch-all: this runs as a thread target, so the failure
        # is surfaced through load_status rather than propagated.
        load_status = f"❌ Error: {e}"
        print(load_status)

# Kick off model loading on a daemon thread so the rest of this module keeps
# executing while the weights are being loaded.
_loader_thread = Thread(target=load_model, daemon=True)
_loader_thread.start()

def get_stats():
    """Return a one-line summary of system RAM usage (percent, used and total in GB)."""
    bytes_per_gb = 1024**3
    memory = psutil.virtual_memory()
    used_gb = memory.used / bytes_per_gb
    total_gb = memory.total / bytes_per_gb
    return f"RAM: {memory.percent}% | {used_gb:.1f}GB / {total_gb:.1f}GB"

# ─────────────────────────────────────────────────────────────
# GRADIO CHAT GENERATOR (For the UI)
# ─────────────────────────────────────────────────────────────
def chat(message, history):
    """Stream a model reply to ``message`` for the Gradio UI.

    This is a generator. Generation runs on a worker thread and the decoded
    text is consumed from a ``TextIteratorStreamer`` as it is produced.

    Args:
        message: The user's prompt text.
        history: Earlier conversation turns. Accepted for interface
            compatibility but not used; every call is single-turn.

    Yields:
        ``(text, stats)`` tuples, where ``text`` is the reply accumulated so
        far and ``stats`` is a one-line status string (elapsed time,
        throughput, RAM usage, load status). If the model is not loaded, a
        single notice paired with ``load_status`` is yielded instead. If
        generation fails, one final tuple with an error note appended to the
        text is yielded.
    """
    if model is None:
        yield "Model is still loading or failed to load. Check status.", load_status
        return

    # Let the tokenizer's chat template emit the role markers: the model's
    # special tokens contain non-ASCII characters, so hand-typed ASCII
    # look-alikes would be tokenized as ordinary text.
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": message}],
        tokenize=False,
        add_generation_prompt=True,
    )
    # Keep forcing the reply to open inside a reasoning block, whether or not
    # the installed template already appends it.
    if not prompt.endswith("<think>\n"):
        prompt += "<think>\n"
    # The rendered template already contains the special tokens it needs, so
    # the tokenizer must not add them a second time.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cpu")

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

    failures = []

    def _generate():
        # If generate() raises, nothing would ever close the stream and the
        # consuming loop below would wait forever; end it explicitly.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            failures.append(exc)
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()

    start_time = time.time()
    generated_text = ""
    # Counts streamed chunks, used as an approximation of generated tokens.
    token_count = 0

    for new_text in streamer:
        generated_text += new_text
        token_count += 1
        elapsed = time.time() - start_time
        tps = token_count / elapsed if elapsed > 0 else 0
        stats = f"⏱️ {elapsed:.1f}s | ⚡ {tps:.2f} t/s | {get_stats()} | {load_status}"
        yield generated_text, stats

    thread.join()
    if failures:
        yield (
            f"{generated_text}\n\n[Generation failed: {failures[0]}]",
            f"{get_stats()} | {load_status}",
        )

# Gradio UI: a chat console on the left and a metrics panel on the right.
# Components are created in layout order inside the nested context managers.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 DeepSeek-R1 CPU Dashboard + API")

    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Response Console", height=500)
            msg = gr.Textbox(label="Math/JSON Prompt", placeholder="Type here and press Enter...")
        with gr.Column(scale=1):
            # Initial panel text is computed once, when the UI is built; it is
            # replaced with live stats while a reply is streaming.
            stats_box = gr.Markdown(f"### Live Metrics\n{get_stats()}\n{load_status}")
            gr.Markdown("---")
            clear = gr.Button("Clear Chat")

    def respond(message, chat_history):
        """Clear the textbox and append a new turn with an empty reply slot."""
        return "", chat_history + [[message, ""]]

    def stream_bot(chat_history):
        """Stream the reply for the latest turn into the history and stats panel."""
        user_input = chat_history[-1][0]
        for content, stats in chat(user_input, chat_history[:-1]):
            chat_history[-1][1] = content
            yield chat_history, stats

    # Submitting the textbox first records the user turn, then streams the reply.
    msg.submit(respond, [msg, chatbot], [msg, chatbot]).then(
        stream_bot, chatbot, [chatbot, stats_box]
    )
    # Returning None resets the chatbot component.
    clear.click(lambda: None, None, chatbot, queue=False)


# ─────────────────────────────────────────────────────────────
# FASTAPI APPLICATION (The Bridge API)
# ─────────────────────────────────────────────────────────────
# FastAPI application object; API routes are registered on it through the
# decorators below (e.g. @app.post).
app = FastAPI(title="DeepSeek API Bridge")

# Request body schema for the POST /chat endpoint.
class ChatRequest(BaseModel):
    # Instruction text; required.
    message: str
    # Optional preamble; api_chat prepends it to `message`. Defaults to empty.
    system: str = ""

@app.post("/chat")
def api_chat(req: ChatRequest):
    """
    Generate a complete (non-streamed) model reply for a JSON request.

    ``req.system`` and ``req.message`` are joined into a single user turn,
    rendered through the tokenizer's chat template, and decoded greedily for
    up to 1024 new tokens.

    Returns:
        ``{"response": <generated text>}`` with the prompt tokens stripped.

    Raises:
        HTTPException: 503 while the model has not been loaded.
    """
    if model is None:
        raise HTTPException(status_code=503, detail="Model is still loading into RAM.")

    # Combine the Engine's structured JSON prompt with the actual instruction
    combined_prompt = f"{req.system}\n\n{req.message}".strip()

    # Let the tokenizer's chat template emit the role markers: the model's
    # special tokens contain non-ASCII characters, so hand-typed ASCII
    # look-alikes would be tokenized as ordinary text.
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": combined_prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )
    # Keep forcing the reply to open inside a reasoning block, whether or not
    # the installed template already appends it.
    if not prompt.endswith("<think>\n"):
        prompt += "<think>\n"
    # The rendered template already contains the special tokens it needs, so
    # the tokenizer must not add them a second time.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cpu")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Strip the input prompt out of the generated tokens
    input_length = inputs.input_ids.shape[1]
    generated_tokens = outputs[0][input_length:]
    response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    print("\n[API] Responded to Perspective Engine constraint query.")

    return {"response": response_text}

# Serve the Gradio UI from the root path of the same FastAPI application.
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    # Startup banner, one entry per printed line.
    startup_banner = (
        "\n🌐 Starting DeepSeek Server on port 7860...",
        "   UI  available at: http://0.0.0.0:7860/",
        "   API available at: http://0.0.0.0:7860/chat\n",
    )
    for banner_line in startup_banner:
        print(banner_line)
    uvicorn.run(app, host="0.0.0.0", port=7860)