Spaces:

build-small-hackathon
/

dod-llm-server

Running on Zero

App Files Files Community

dod-llm-server / app.py

elismasilva

first commit

b022028 1 day ago

raw

history blame contribute delete

6.58 kB

	# Dynamic import & mock system for the Hugging Face 'spaces' package.
	# When the package is importable the real `spaces.GPU` decorator is used;
	# otherwise a no-op stand-in keeps the `@spaces.GPU(...)` call sites working.
	try:
	    import spaces
	    has_spaces = True
	except ImportError:
	    has_spaces = False

	    class spaces:
	        """Minimal local stand-in used when the `spaces` package is missing."""

	        @staticmethod
	        def GPU(duration=None, **_options):
	            """No-op replacement for the `spaces.GPU` decorator.

	            Supports both spellings so code behaves the same with or without
	            the real package:

	            * ``@spaces.GPU(duration=60)`` -> returns a pass-through decorator.
	            * ``@spaces.GPU`` (bare)       -> returns the function unchanged.

	            Args:
	                duration: Ignored by the mock. When the decorator is applied
	                    bare, Python passes the decorated function in this slot.
	                **_options: Any further keyword options; accepted and ignored.

	            Returns:
	                The decorated function itself (bare form) or a decorator that
	                returns its argument unchanged (call form).
	            """
	            if callable(duration):
	                # Bare usage: the "duration" slot actually holds the function.
	                return duration

	            def decorator(f):
	                return f

	            return decorator
	import os
	import json
	import multiprocessing

	# Load .env locally if present
	from dotenv import load_dotenv
	load_dotenv()

	import gradio as gr
	from llama_cpp import Llama
	from huggingface_hub import hf_hub_download

	# Download GGUF Model on startup from Hugging Face Hub
	def download_nemotron_gguf(
	    repo_id="nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF",
	    filename="NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf",
	    local_dir="./models",
	) -> str:
	    """Fetch a GGUF model file from the Hugging Face Hub into a local folder.

	    Called with no arguments it downloads the default Nemotron Q4_K_M build
	    into ``./models``, exactly as before; the parameters only exist so a
	    different repo, file or target directory can be used without editing
	    this function.

	    Args:
	        repo_id: Hub repository that hosts the model file.
	        filename: Name of the file inside the repository.
	        local_dir: Directory the file is placed in; created if missing.

	    Returns:
	        The value returned by ``hf_hub_download`` (the local file path).
	    """
	    os.makedirs(local_dir, exist_ok=True)
	    # `local_dir_use_symlinks` is deprecated in recent huggingface_hub releases
	    # (ignored with a warning), so it is no longer passed.
	    return hf_hub_download(repo_id=repo_id, filename=filename, local_dir=local_dir)

	# Resolved once at import time so the model is available before the first request.
	MODEL_PATH = download_nemotron_gguf()

	# Smart Environment Detection for GPU layers offloading
	# Bind address is the same in both environments, so it is defined up front
	# (previously it only existed in the local branch).
	IP_ADDRESS = "0.0.0.0"
	if os.environ.get("SPACE_ID"):
	    # SPACE_ID is set: treated as running on a Hugging Face Space.
	    GPU_LAYERS = -1  # -1 = offload every layer to the GPU
	    PORT = 7860
	    CPU_THREADS = 2
	else:
	    # Local run: GPU offload only on Windows ("nt"), CPU-only elsewhere.
	    GPU_LAYERS = -1 if os.name == "nt" else 0
	    PORT = 7880
	    # Local Windows/Linux uses half of available CPU cores.
	    # os.cpu_count() returns None when the count is unknown, which makes the
	    # `or 4` fallback effective; multiprocessing.cpu_count() raises
	    # NotImplementedError instead, so the fallback could never trigger.
	    CPU_THREADS = max(1, (os.cpu_count() or 4) // 2)

	# Shared Llama engine instance. Stays None until the first call to
	# gpu_inference_runner, which loads the model and stores it here.
	global_llm = None

	# Default sandbox prompt templates
	# Used as the pre-filled value of the "System Prompt" box in the test bench UI.
	SANDBOX_SYS_PROMPT = """You are "DOD-UNO-BOT", an AI game agent playing a software engineering themed UNO game.
	Analyze the active card, hand, and server metrics (Resolution and Panic) to decide your next strategic move."""

	# Pre-filled value of the "User Payload" box: a JSON string holding an
	# active card, the resolution/panic metrics and a one-card hand.
	SANDBOX_USER_PAYLOAD = '{"active_card": {"stack": "red"}, "metrics": {"resolution": 40, "panic": 20}, "hand": [{"index": 0, "stack": "red", "playable": true}]}'

	# --- SECURE GPU RUNNER METHOD ---
	def _get_llm():
	    """Return the shared Llama engine, loading the model on first use."""
	    global global_llm

	    if global_llm is None:
	        print(f"Loading DOD LLM Engine: {MODEL_PATH} (GPU Layers: {GPU_LAYERS}, Threads: {CPU_THREADS})", flush=True)
	        global_llm = Llama(
	            model_path=MODEL_PATH,
	            n_gpu_layers=GPU_LAYERS,
	            verbose=True,

	            # Context and Batch Tuning
	            n_ctx=3072,  # Optimized context window
	            n_batch=512,  # Standard batch size for high-speed prompt ingestion

	            # Thread Mapping (Optimized dynamically to match physical environment cores)
	            n_threads=CPU_THREADS,
	            n_threads_batch=CPU_THREADS,

	            # Memory Safeguards
	            use_mlock=False,
	            use_mmap=True,  # FIX: Must be True on cloud filesystems to prevent heavy I/O disk bottlenecks!
	            flash_attn=True,

	            # Advanced KV Cache Quantization
	            # 8 represents GGML_TYPE_Q8_0 (8-bit quantization for Key/Value cache)
	            type_k=8,  # Quantize Key cache to 8-bit
	            type_v=8,  # Quantize Value cache to 8-bit
	        )
	    return global_llm


	@spaces.GPU(duration=60)
	def gpu_inference_runner(system_prompt, user_payload, temperature, max_tokens, grammar_schema=None):
	    """Run one chat completion on the shared Llama engine.

	    Args:
	        system_prompt: Text sent as the "system" message.
	        user_payload: Text sent as the "user" message.
	        temperature: Sampling temperature; coerced with ``float()``.
	        max_tokens: Generation limit; coerced with ``int()``.
	        grammar_schema: Optional JSON schema. When truthy, the request asks
	            for a ``json_object`` response constrained by this schema.

	    Returns:
	        The ``content`` of the first choice's message.

	    Raises:
	        RuntimeError: If building the request or running the completion
	            fails; the original exception is attached as ``__cause__``.
	            Errors raised while loading the model are not wrapped.
	    """
	    # Model loading stays outside the try block so load failures surface as-is.
	    llm = _get_llm()

	    try:
	        request = {
	            "messages": [
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": user_payload},
	            ],
	            "temperature": float(temperature),
	            "max_tokens": int(max_tokens),
	        }
	        if grammar_schema:
	            request["response_format"] = {
	                "type": "json_object",
	                "schema": grammar_schema,
	            }

	        response = llm.create_chat_completion(**request)
	        return response["choices"][0]["message"]["content"]
	    except Exception as e:
	        # Chain explicitly so the underlying traceback is preserved for debugging.
	        raise RuntimeError(f"Llama engine crash: {e}") from e

	# --- MANUAL TEST BENCH INTERFACES ---
	def ui_test_inference(api_key, system_prompt, user_payload, temperature, grammar_schema=None):
	    """Gradio handler to manually test the GPU model, verifying the secret key entered on the screen.

	    Args:
	        api_key: Key supplied by the caller; checked against the
	            ``LLM_API_KEY`` environment variable.
	        system_prompt: Text forwarded as the system message.
	        user_payload: Text forwarded as the user message.
	        temperature: Sampling temperature forwarded to the runner.
	        grammar_schema: Optional JSON schema, either as a JSON string or an
	            already-parsed object.

	    Returns:
	        The model output, or a string starting with "❌" describing an
	        authorization or execution failure. This handler never raises for
	        runner errors; they are reported in the returned string.
	    """
	    import hmac  # local import: only needed for the constant-time key check

	    expected_token = os.environ.get("LLM_API_KEY")
	    # NOTE(review): when LLM_API_KEY is unset or empty the check is skipped
	    # entirely (fail-open), as in the original code; confirm this is intended
	    # for public deployments.
	    if expected_token:
	        # Compare as bytes in constant time: avoids leaking the key through
	        # timing, and compare_digest rejects non-ASCII str arguments.
	        key_ok = isinstance(api_key, str) and hmac.compare_digest(
	            api_key.encode("utf-8", "surrogatepass"),
	            expected_token.encode("utf-8", "surrogatepass"),
	        )
	        if not key_ok:
	            return "❌ Error: Unauthorized. The LLM_API_KEY token you entered is invalid!"

	    parsed_schema = None
	    if grammar_schema:
	        if isinstance(grammar_schema, str):
	            try:
	                parsed_schema = json.loads(grammar_schema)
	            except (ValueError, RecursionError) as e:
	                # Best-effort: still run without a grammar, but leave a trace
	                # in the logs instead of silently dropping the schema.
	                print(f"Ignoring grammar schema that is not valid JSON: {e}", flush=True)
	        else:
	            parsed_schema = grammar_schema

	    try:
	        # The test bench always uses a fixed 200-token generation limit.
	        return gpu_inference_runner(system_prompt, user_payload, temperature, 200, parsed_schema)
	    except Exception as e:
	        return f"❌ Execution Error: {e}"

	# Define the local UI elements
	with gr.Blocks() as demo:
	    # Page header
	    gr.Markdown("# 🚀 DOD UNO - Dedicated GPU Inference Node")
	    gr.Markdown("Secure, hardware-accelerated serverless API endpoint backing DOD UNO Game Server.")

	    with gr.Tab("🔧 API Test Bench"):
	        gr.Markdown("### Validate the GPU Model manually by entering the secret API key:")

	        # Invisible textbox: the fifth handler input (grammar schema), empty by default.
	        grammar_input = gr.Textbox(visible=False, value="")

	        # Row 1: secret key entry (masked)
	        with gr.Row():
	            api_key_input = gr.Textbox(label="LLM_API_KEY (Token)", type="password", placeholder="Paste your secret handshake key here...")

	        # Row 2: prompts, pre-filled with the sandbox defaults
	        with gr.Row():
	            sys_prompt_input = gr.Textbox(label="System Prompt", value=SANDBOX_SYS_PROMPT, lines=4)
	            user_payload_input = gr.Textbox(label="User Payload (JSON / Text)", value=SANDBOX_USER_PAYLOAD, lines=4)

	        # Row 3: sampling control and trigger button
	        with gr.Row():
	            temp_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature")
	            test_btn = gr.Button("⚡ Run GPU Inference", variant="primary")

	        output_box = gr.Textbox(label="Inference Result (JSON Output)", lines=6, placeholder="Result will appear here...")

	        # Wire the button to the handler; api_name="generate_inference" names the API endpoint.
	        handler_inputs = [api_key_input, sys_prompt_input, user_payload_input, temp_slider, grammar_input]
	        test_btn.click(fn=ui_test_inference, inputs=handler_inputs, outputs=[output_box], api_name="generate_inference")

	# Launch instance
	demo.launch(
	    server_name="0.0.0.0",
	    server_port=PORT,
	)