Small_llm

Running

App Files Files Community

Small_llm / app2.py

everydaytok

Rename app.py to app2.py

17ef328 verified 12 days ago

raw

history blame contribute delete

6.02 kB

	import gradio as gr
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
	from threading import Thread
	import torch
	import time
	import psutil
	import os

	# --- FastAPI Imports ---
	from fastapi import FastAPI, HTTPException
	from pydantic import BaseModel
	import uvicorn

	# CONFIGURATION
	# Repo id and GGUF filename handed to AutoModelForCausalLM.from_pretrained()
	# inside load_model().
	MODEL_ID = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF"
	GGUF_FILE = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
	# Repo id the tokenizer is loaded from; it differs from the weights repo above.
	TOKENIZER_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

	# Global variables for model and tokenizer
	# Both remain None until load_model() assigns them; chat() and api_chat()
	# test `model is None` to detect the not-yet-loaded / failed state.
	model = None
	tokenizer = None
	# Human-readable load state; overwritten by load_model() on success or failure
	# and embedded in the stats text shown by the UI.
	load_status = "🔄 Initializing..."

	def load_model():
	    """Load the tokenizer and the GGUF model into the module-level globals.

	    On success ``tokenizer`` and ``model`` are populated and ``load_status``
	    is set to the success message; on any exception ``load_status`` is set
	    to an error message containing the exception text. The final status is
	    printed in both cases. Nothing is returned and nothing is re-raised.
	    """
	    global model, tokenizer, load_status
	    try:
	        print(f"Loading tokenizer from {TOKENIZER_ID}...")
	        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)

	        print(f"Loading GGUF weights from {MODEL_ID}...")
	        # `model` is assigned last, so a non-None model implies the
	        # tokenizer has already been assigned as well.
	        model = AutoModelForCausalLM.from_pretrained(
	            MODEL_ID,
	            gguf_file=GGUF_FILE,
	            torch_dtype=torch.float32,
	            device_map="cpu",
	        )
	    except Exception as exc:
	        # Boundary of the loader thread: record the failure for the UI
	        # instead of letting the thread die silently.
	        load_status = f"❌ Error: {exc!s}"
	    else:
	        load_status = "✅ Model Loaded Successfully"
	    print(load_status)

	# Start loading in the background
	# Runs at import time; daemon=True means this thread will not keep the
	# process alive on its own if the main thread exits.
	Thread(target=load_model, daemon=True).start()

	def get_stats():
	    """Return a one-line summary of current system RAM usage.

	    Reads ``psutil.virtual_memory()`` and reports the used percentage
	    together with used and total memory, each converted from bytes by
	    dividing by 1024**3 and labelled "GB" with one decimal place.

	    Returns:
	        str: Text of the form "RAM: <percent>% <sep> <used>GB / <total>GB",
	        where <sep> is a backslash followed by a pipe character.
	    """
	    # Bytes per (binary) gigabyte. The previous divisor, 10243, was a
	    # garbled form of 1024**3 and produced wildly inflated figures.
	    bytes_per_gb = 1024 ** 3
	    vm = psutil.virtual_memory()
	    used_gb = vm.used / bytes_per_gb
	    total_gb = vm.total / bytes_per_gb
	    # "\\|" emits the same backslash-pipe text as before without relying on
	    # the invalid "\|" escape sequence (a SyntaxWarning on recent Pythons).
	    # NOTE(review): presumably a Markdown escape for the pipe - confirm.
	    return f"RAM: {vm.percent}% \\| {used_gb:.1f}GB / {total_gb:.1f}GB"

	# ─────────────────────────────────────────────────────────────
	# GRADIO CHAT GENERATOR (For the UI)
	# ─────────────────────────────────────────────────────────────
	def chat(message, history):
	    """Stream a model reply for ``message`` together with live statistics.

	    Generator used by the Gradio UI. Each iteration yields a tuple
	    ``(text_so_far, stats)`` where ``text_so_far`` is the accumulated
	    reply and ``stats`` is a one-line status string.

	    Args:
	        message: The user's prompt text; it is inserted into a single-turn
	            prompt template.
	        history: Accepted for interface compatibility but not used; earlier
	            turns are not included in the prompt.

	    Yields:
	        tuple[str, str]: Accumulated reply text and a stats line. If the
	        model is not loaded, a single notice is yielded with
	        ``load_status`` as the stats element and the generator ends.
	    """
	    if model is None:
	        yield "Model is still loading or failed to load. Check status.", load_status
	        return

	    prompt = f"<｜begin_of_sentence｜><｜User｜>{message}<｜Assistant｜><think>\n"
	    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

	    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
	    generation_kwargs = dict(
	        inputs,
	        streamer=streamer,
	        max_new_tokens=1024,
	        do_sample=False,
	        pad_token_id=tokenizer.eos_token_id,
	    )

	    # generate() runs in a worker thread so this generator can consume the
	    # streamer while text is still being produced.
	    worker = Thread(target=model.generate, kwargs=generation_kwargs)
	    worker.start()

	    start_time = time.time()
	    generated_text = ""
	    # Counts chunks yielded by the streamer; the "t/s" figure below uses it
	    # as a stand-in for a token count, so treat the rate as approximate.
	    chunk_count = 0

	    for new_text in streamer:
	        generated_text += new_text
	        chunk_count += 1
	        elapsed = time.time() - start_time
	        tps = chunk_count / elapsed if elapsed > 0 else 0
	        # "\\|" yields the same backslash-pipe separators as before while
	        # avoiding the invalid "\|" escape sequence.
	        stats = f"⏱️ {elapsed:.1f}s \\| ⚡ {tps:.2f} t/s \\| {get_stats()} \\| {load_status}"
	        yield generated_text, stats

	# Gradio dashboard: a chat console on the left, live metrics on the right.
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	    gr.Markdown("# 🚀 DeepSeek-R1 CPU Dashboard + API")

	    with gr.Row():
	        with gr.Column(scale=4):
	            chatbot = gr.Chatbot(label="Response Console", height=500)
	            msg = gr.Textbox(label="Math/JSON Prompt", placeholder="Type here and press Enter...")
	        with gr.Column(scale=1):
	            # Initial metrics text, evaluated once when the layout is built.
	            stats_box = gr.Markdown(f"### Live Metrics\n{get_stats()}\n{load_status}")
	            gr.Markdown("---")
	            clear = gr.Button("Clear Chat")

	    def respond(message, chat_history):
	        """Clear the textbox and append a new [user, ""] turn to the history."""
	        extended_history = chat_history + [[message, ""]]
	        return "", extended_history

	    def stream_bot(chat_history):
	        """Fill in the last turn's reply from chat(), yielding after each update."""
	        latest_turn = chat_history[-1]
	        user_input = latest_turn[0]
	        earlier_turns = chat_history[:-1]
	        for partial_reply, stats_line in chat(user_input, earlier_turns):
	            # Mutates the last turn in place so the chatbot shows the growing reply.
	            latest_turn[1] = partial_reply
	            yield chat_history, stats_line

	    # First stage records the user's turn; the chained stage streams the answer.
	    submit_event = msg.submit(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
	    submit_event.then(fn=stream_bot, inputs=chatbot, outputs=[chatbot, stats_box])
	    clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)


	# ─────────────────────────────────────────────────────────────
	# FASTAPI APPLICATION (The Bridge API)
	# ─────────────────────────────────────────────────────────────
	# FastAPI application that hosts the /chat endpoint; it is rebound further
	# down when the Gradio UI is mounted onto it.
	app = FastAPI(title="DeepSeek API Bridge")

	class ChatRequest(BaseModel):
	    """JSON body accepted by the POST /chat endpoint."""

	    # Instruction text; required.
	    message: str
	    # Optional text that api_chat() places before the message; empty by default.
	    system: str = ""

	@app.post("/chat")
	def api_chat(req: ChatRequest):
	    """Generate a complete reply for one request and return it as JSON.

	    The request's ``system`` and ``message`` fields are joined with a blank
	    line, stripped, wrapped in the single-turn prompt template, and run
	    through ``model.generate`` in one blocking call (no streaming).

	    Args:
	        req: Parsed request body with ``message`` and optional ``system``.

	    Returns:
	        dict: ``{"response": <decoded reply text>}`` containing only the
	        newly generated tokens, with special tokens skipped.

	    Raises:
	        HTTPException: 503 when the model global is still ``None``.
	    """
	    if model is None:
	        raise HTTPException(status_code=503, detail="Model is still loading into RAM.")

	    # System text first, then the instruction; strip() removes the leading
	    # blank lines left behind when `system` is empty.
	    user_content = f"{req.system}\n\n{req.message}".strip()

	    # NOTE(review): template mirrors the one used by chat(); confirm it
	    # matches the tokenizer's expected special-token spelling.
	    prompt = f"<｜begin_of_sentence｜><｜User｜>{user_content}<｜Assistant｜><think>\n"
	    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

	    # Number of prompt tokens, used below to slice the echo off the output.
	    prompt_len = inputs.input_ids.shape[1]

	    with torch.no_grad():
	        outputs = model.generate(
	            **inputs,
	            max_new_tokens=1024,
	            do_sample=False,
	            pad_token_id=tokenizer.eos_token_id,
	        )

	    # Keep only what was generated after the prompt.
	    new_tokens = outputs[0][prompt_len:]
	    reply = tokenizer.decode(new_tokens, skip_special_tokens=True)

	    print("\n[API] Responded to Perspective Engine constraint query.")

	    return {"response": reply}

	# Mount the Gradio UI onto the FastAPI app
	# `app` is rebound to the value returned by mount_gradio_app; the UI is
	# mounted at "/" while the /chat route was registered on the app above.
	app = gr.mount_gradio_app(app, demo, path="/")

	if __name__ == "__main__":
	    # Startup banner, one line per print call, then hand control to uvicorn,
	    # which serves both the UI and the API on port 7860 on all interfaces.
	    for banner_line in (
	        "\n🌐 Starting DeepSeek Server on port 7860...",
	        " UI available at: http://0.0.0.0:7860/",
	        " API available at: http://0.0.0.0:7860/chat\n",
	    ):
	        print(banner_line)
	    uvicorn.run(app, host="0.0.0.0", port=7860)