"""DeepSeek-R1 (GGUF, CPU) demo server.

Serves two front ends from a single process on one port:

* a Gradio chat dashboard mounted at ``/`` that streams the model's answer
  together with timing and RAM metrics, and
* a FastAPI ``POST /chat`` endpoint that returns the complete answer as JSON.

The model is loaded in a background thread at import time so the web server
can start immediately; until loading finishes both front ends report that the
model is not ready.
"""

import os
import time
from threading import Thread

import gradio as gr
import psutil
import torch
import uvicorn
# --- FastAPI Imports ---
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# CONFIGURATION
MODEL_ID = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF"
GGUF_FILE = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
TOKENIZER_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Global variables for model and tokenizer.
# Both stay None until load_model() succeeds; load_status is the
# human-readable state shown in the UI.
model = None
tokenizer = None
load_status = "🔄 Initializing..."


def load_model():
    """Load the tokenizer and GGUF weights into the module-level globals.

    Runs in a background thread. Never raises: on failure the exception text
    is stored in ``load_status`` and ``model`` stays ``None``, which is what
    the request handlers check before generating.
    """
    global model, tokenizer, load_status
    try:
        print(f"Loading tokenizer from {TOKENIZER_ID}...")
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)

        print(f"Loading GGUF weights from {MODEL_ID}...")
        # The tokenizer is assigned before the model, so "model is not None"
        # implies the tokenizer is ready as well.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            gguf_file=GGUF_FILE,
            torch_dtype=torch.float32,
            device_map="cpu",
        )
        load_status = "✅ Model Loaded Successfully"
        print(load_status)
    except Exception as e:
        # Top-level boundary of the loader thread: record the failure for the
        # UI instead of letting the thread die silently.
        load_status = f"❌ Error: {str(e)}"
        print(load_status)


# Start loading in the background
Thread(target=load_model, daemon=True).start()


def get_stats():
    """Return a one-line summary of system RAM usage (percent and GiB)."""
    vm = psutil.virtual_memory()
    return f"RAM: {vm.percent}% | {vm.used / 1024**3:.1f}GB / {vm.total / 1024**3:.1f}GB"


def _build_inputs(user_text):
    """Tokenize *user_text* as a single user turn using the model's chat template.

    The tokenizer's own template inserts the real DeepSeek special tokens
    (which use full-width bars, not ASCII ``|``) and the begin-of-sequence
    marker exactly once. Hand-written ASCII look-alikes would be tokenized as
    ordinary text.

    Returns:
        A ``BatchEncoding`` with ``input_ids`` and ``attention_mask`` on CPU.
    """
    messages = [{"role": "user", "content": user_text}]
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    return encoded.to("cpu")


# ─────────────────────────────────────────────────────────────
# GRADIO CHAT GENERATOR (For the UI)
# ─────────────────────────────────────────────────────────────
def chat(message, history):
    """Stream a reply to *message* for the Gradio UI.

    Args:
        message: The user's prompt text.
        history: Previous chat turns. Accepted for interface compatibility
            but not used; every request is answered as a single turn.

    Yields:
        ``(generated_text_so_far, stats_line)`` tuples, one per chunk emitted
        by the streamer. If the model is not loaded, a single explanatory
        tuple is yielded instead.
    """
    if model is None:
        yield "Model is still loading or failed to load. Check status.", load_status
        return

    inputs = _build_inputs(message)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

    worker_errors = []

    def _run_generation():
        # If generate() raises, the streamer never receives its end signal and
        # the consumer loop below would block forever, so close it explicitly.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            worker_errors.append(exc)
            streamer.end()

    thread = Thread(target=_run_generation)
    thread.start()

    start_time = time.time()
    generated_text = ""
    # The streamer yields decoded text chunks, so this count (and the t/s
    # figure derived from it) is an approximation of the true token count.
    token_count = 0

    for new_text in streamer:
        generated_text += new_text
        token_count += 1
        elapsed = time.time() - start_time
        tps = token_count / elapsed if elapsed > 0 else 0
        stats = f"⏱️ {elapsed:.1f}s | ⚡ {tps:.2f} t/s | {get_stats()} | {load_status}"
        yield generated_text, stats

    thread.join()
    if worker_errors:
        yield generated_text or "Generation failed.", f"❌ Error: {worker_errors[0]}"


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 DeepSeek-R1 CPU Dashboard + API")

    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Response Console", height=500)
            msg = gr.Textbox(label="Math/JSON Prompt", placeholder="Type here and press Enter...")
        with gr.Column(scale=1):
            # Initial snapshot only; the panel is refreshed while streaming.
            stats_box = gr.Markdown(f"### Live Metrics\n{get_stats()}\n{load_status}")
            gr.Markdown("---")
            clear = gr.Button("Clear Chat")

    def respond(message, chat_history):
        """Clear the textbox and append the new user turn with an empty reply."""
        return "", chat_history + [[message, ""]]

    def stream_bot(chat_history):
        """Fill in the last turn's reply incrementally from ``chat``."""
        user_input = chat_history[-1][0]
        for content, stats in chat(user_input, chat_history[:-1]):
            chat_history[-1][1] = content
            yield chat_history, stats

    msg.submit(respond, [msg, chatbot], [msg, chatbot]).then(
        stream_bot, chatbot, [chatbot, stats_box]
    )
    clear.click(lambda: None, None, chatbot, queue=False)

# ─────────────────────────────────────────────────────────────
# FASTAPI APPLICATION (The Bridge API)
# ─────────────────────────────────────────────────────────────
app = FastAPI(title="DeepSeek API Bridge")


class ChatRequest(BaseModel):
    # message: the instruction to answer (required).
    # system: optional preamble that is prepended to the message.
    message: str
    system: str = ""


@app.post("/chat")
def api_chat(req: ChatRequest):
    """
    This endpoint catches the JSON payload from the Perspective Engine
    and processes it through DeepSeek-R1 synchronously.

    Returns a JSON object ``{"response": <generated text>}``.
    Responds with HTTP 503 while the model is not loaded.
    """
    if model is None:
        raise HTTPException(status_code=503, detail="Model is still loading into RAM.")

    # Combine the Engine's structured JSON prompt with the actual instruction.
    # Both parts are sent as one user turn.
    combined_prompt = f"{req.system}\n\n{req.message}".strip()

    # Format exactly as DeepSeek expects (via the tokenizer's chat template)
    inputs = _build_inputs(combined_prompt)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Strip the input prompt out of the generated tokens
    input_length = inputs.input_ids.shape[1]
    generated_tokens = outputs[0][input_length:]
    response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    print("\n[API] Responded to Perspective Engine constraint query.")
    return {"response": response_text}


# Mount the Gradio UI onto the FastAPI app
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    print("\n🌐 Starting DeepSeek Server on port 7860...")
    print("   UI available at: http://0.0.0.0:7860/")
    print("   API available at: http://0.0.0.0:7860/chat\n")
    uvicorn.run(app, host="0.0.0.0", port=7860)