Spaces:
Running
Running
| import gradio as gr | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer | |
| from threading import Thread | |
| import torch | |
| import time | |
| import psutil | |
| import os | |
| # --- FastAPI Imports --- | |
| from fastapi import FastAPI, HTTPException | |
| from pydantic import BaseModel | |
| import uvicorn | |
| # CONFIGURATION | |
| MODEL_ID = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF" | |
| GGUF_FILE = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf" | |
| TOKENIZER_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" | |
| # Global variables for model and tokenizer | |
| model = None | |
| tokenizer = None | |
| load_status = "π Initializing..." | |
| def load_model(): | |
| global model, tokenizer, load_status | |
| try: | |
| print(f"Loading tokenizer from {TOKENIZER_ID}...") | |
| tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID) | |
| print(f"Loading GGUF weights from {MODEL_ID}...") | |
| model = AutoModelForCausalLM.from_pretrained( | |
| MODEL_ID, | |
| gguf_file=GGUF_FILE, | |
| torch_dtype=torch.float32, | |
| device_map="cpu" | |
| ) | |
| load_status = "β Model Loaded Successfully" | |
| print(load_status) | |
| except Exception as e: | |
| load_status = f"β Error: {str(e)}" | |
| print(load_status) | |
| # Start loading in the background | |
| Thread(target=load_model, daemon=True).start() | |
| def get_stats(): | |
| vm = psutil.virtual_memory() | |
| return f"RAM: {vm.percent}% | {vm.used / 1024**3:.1f}GB / {vm.total / 1024**3:.1f}GB" | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # GRADIO CHAT GENERATOR (For the UI) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def chat(message, history): | |
| if model is None: | |
| yield "Model is still loading or failed to load. Check status.", load_status | |
| return | |
| prompt = f"<ο½begin_of_sentenceο½><ο½Userο½>{message}<ο½Assistantο½><think>\n" | |
| inputs = tokenizer(prompt, return_tensors="pt").to("cpu") | |
| streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) | |
| generation_kwargs = dict( | |
| inputs, | |
| streamer=streamer, | |
| max_new_tokens=1024, | |
| do_sample=False, | |
| pad_token_id=tokenizer.eos_token_id | |
| ) | |
| thread = Thread(target=model.generate, kwargs=generation_kwargs) | |
| thread.start() | |
| start_time = time.time() | |
| generated_text = "" | |
| token_count = 0 | |
| for new_text in streamer: | |
| generated_text += new_text | |
| token_count += 1 | |
| elapsed = time.time() - start_time | |
| tps = token_count / elapsed if elapsed > 0 else 0 | |
| stats = f"β±οΈ {elapsed:.1f}s | β‘ {tps:.2f} t/s | {get_stats()} | {load_status}" | |
| yield generated_text, stats | |
| with gr.Blocks(theme=gr.themes.Soft()) as demo: | |
| gr.Markdown("# π DeepSeek-R1 CPU Dashboard + API") | |
| with gr.Row(): | |
| with gr.Column(scale=4): | |
| chatbot = gr.Chatbot(label="Response Console", height=500) | |
| msg = gr.Textbox(label="Math/JSON Prompt", placeholder="Type here and press Enter...") | |
| with gr.Column(scale=1): | |
| stats_box = gr.Markdown(f"### Live Metrics\n{get_stats()}\n{load_status}") | |
| gr.Markdown("---") | |
| clear = gr.Button("Clear Chat") | |
| def respond(message, chat_history): | |
| return "", chat_history + [[message, ""]] | |
| def stream_bot(chat_history): | |
| user_input = chat_history[-1][0] | |
| for content, stats in chat(user_input, chat_history[:-1]): | |
| chat_history[-1][1] = content | |
| yield chat_history, stats | |
| msg.submit(respond, [msg, chatbot], [msg, chatbot]).then( | |
| stream_bot, chatbot, [chatbot, stats_box] | |
| ) | |
| clear.click(lambda: None, None, chatbot, queue=False) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # FASTAPI APPLICATION (The Bridge API) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| app = FastAPI(title="DeepSeek API Bridge") | |
| class ChatRequest(BaseModel): | |
| message: str | |
| system: str = "" | |
| def api_chat(req: ChatRequest): | |
| """ | |
| This endpoint catches the JSON payload from the Perspective Engine | |
| and processes it through DeepSeek-R1 synchronously. | |
| """ | |
| if model is None: | |
| raise HTTPException(status_code=503, detail="Model is still loading into RAM.") | |
| # Combine the Engine's structured JSON prompt with the actual instruction | |
| combined_prompt = f"{req.system}\n\n{req.message}".strip() | |
| # Format exactly as DeepSeek expects | |
| prompt = f"<ο½begin_of_sentenceο½><ο½Userο½>{combined_prompt}<ο½Assistantο½><think>\n" | |
| inputs = tokenizer(prompt, return_tensors="pt").to("cpu") | |
| with torch.no_grad(): | |
| outputs = model.generate( | |
| **inputs, | |
| max_new_tokens=1024, | |
| do_sample=False, | |
| pad_token_id=tokenizer.eos_token_id | |
| ) | |
| # Strip the input prompt out of the generated tokens | |
| input_length = inputs.input_ids.shape[1] | |
| generated_tokens = outputs[0][input_length:] | |
| response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True) | |
| print("\n[API] Responded to Perspective Engine constraint query.") | |
| return {"response": response_text} | |
| # Mount the Gradio UI onto the FastAPI app | |
| app = gr.mount_gradio_app(app, demo, path="/") | |
| if __name__ == "__main__": | |
| print("\nπ Starting DeepSeek Server on port 7860...") | |
| print(" UI available at: http://0.0.0.0:7860/") | |
| print(" API available at: http://0.0.0.0:7860/chat\n") | |
| uvicorn.run(app, host="0.0.0.0", port=7860) |