dod-llm-server / app.py
elismasilva's picture
first commit
b022028
# Dynamic import & Mock system for Hugging Face 'spaces' package
try:
    import spaces
    has_spaces = True
except ImportError:
    has_spaces = False

    class spaces:
        """No-op stand-in used when the 'spaces' package is not installed."""

        @staticmethod
        def GPU(func=None, duration=None, **_options):
            """Mimic the ``spaces.GPU`` decorator without doing anything.

            Works both as ``@spaces.GPU`` (bare) and as
            ``@spaces.GPU(duration=60)`` (called). ``duration`` and any other
            keyword options are accepted and ignored. A non-callable first
            positional argument (e.g. ``GPU(60)``) is treated as an option,
            which keeps the previous positional-duration call form working.
            """
            if callable(func):
                # Bare usage: the decorated function was passed in directly.
                return func

            def decorator(f):
                return f

            return decorator
import os
import json
import multiprocessing
# Load .env locally if present
from dotenv import load_dotenv
load_dotenv()
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Download GGUF Model on startup from Hugging Face Hub
def download_nemotron_gguf():
    """Fetch the quantized Nemotron GGUF file into ``./models``.

    Creates the target directory if needed and returns whatever
    ``hf_hub_download`` returns for the requested file.
    """
    target_dir = "./models"
    os.makedirs(target_dir, exist_ok=True)

    # NOTE(review): `local_dir_use_symlinks` may be deprecated or rejected by
    # newer huggingface_hub releases - confirm against the pinned version.
    download_args = dict(
        repo_id="nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF",
        filename="NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf",
        local_dir=target_dir,
        local_dir_use_symlinks=False,
    )
    return hf_hub_download(**download_args)


# Resolved once at import time; the model is downloaded on startup.
MODEL_PATH = download_nemotron_gguf()
# Smart Environment Detection for GPU layers offloading

# Bind address. Defined unconditionally so the name exists on every
# environment (it used to be assigned only on the non-Spaces branch).
IP_ADDRESS = "0.0.0.0"

if os.environ.get("SPACE_ID"):
    # SPACE_ID is present: treat the process as running on a Hugging Face Space.
    GPU_LAYERS = -1
    PORT = 7860
    CPU_THREADS = 2
else:
    # Local run: request GPU offload on Windows only, CPU-only elsewhere.
    GPU_LAYERS = -1 if os.name == "nt" else 0
    PORT = 7880
    # Local Windows/Linux uses half of available CPU cores.
    # os.cpu_count() returns None when the count cannot be determined, so the
    # `or 4` fallback is reachable; multiprocessing.cpu_count() raises
    # NotImplementedError in that situation instead of returning None.
    CPU_THREADS = max(1, (os.cpu_count() or 4) // 2)

# Lazily-created Llama instance, populated on the first inference call.
global_llm = None
# Default sandbox prompt templates (pre-filled into the test bench fields)

# Two-line system prompt: persona on the first line, task on the second.
SANDBOX_SYS_PROMPT = (
    'You are "DOD-UNO-BOT", an AI game agent playing a software engineering themed UNO game.\n'
    "Analyze the active card, hand, and server metrics (Resolution and Panic) to decide your next strategic move."
)

# Sample game-state payload, kept as a JSON string because it is sent as text.
SANDBOX_USER_PAYLOAD = (
    '{"active_card": {"stack": "red"}, '
    '"metrics": {"resolution": 40, "panic": 20}, '
    '"hand": [{"index": 0, "stack": "red", "playable": true}]}'
)
# --- SECURE GPU RUNNER METHOD ---
@spaces.GPU(duration=60)
def gpu_inference_runner(system_prompt, user_payload, temperature, max_tokens, grammar_schema=None):
    """Run one chat completion against the lazily-loaded Llama model.

    The model is constructed on the first call and cached in the module-level
    ``global_llm`` so later calls reuse it.

    Args:
        system_prompt: Content of the "system" message.
        user_payload: Content of the "user" message.
        temperature: Sampling temperature; coerced with ``float()``.
        max_tokens: Completion token limit; coerced with ``int()``.
        grammar_schema: Optional schema; when truthy it is sent as a
            ``json_object`` response format.

    Returns:
        The message content of the first choice in the completion response.

    Raises:
        RuntimeError: If building the request or running the completion fails;
            the original exception is attached as ``__cause__``.
    """
    global global_llm
    if global_llm is None:
        print(f"Loading DOD LLM Engine: {MODEL_PATH} (GPU Layers: {GPU_LAYERS}, Threads: {CPU_THREADS})", flush=True)
        global_llm = Llama(
            model_path=MODEL_PATH,
            n_gpu_layers=GPU_LAYERS,
            verbose=True,
            # Context and batch tuning
            n_ctx=3072,
            n_batch=512,
            # Thread counts follow the environment-detected CPU_THREADS value
            n_threads=CPU_THREADS,
            n_threads_batch=CPU_THREADS,
            # Memory settings: no page locking, memory-mapped model file
            use_mlock=False,
            use_mmap=True,
            flash_attn=True,
            # KV cache quantization: 8 represents GGML_TYPE_Q8_0 (8-bit K/V cache)
            type_k=8,
            type_v=8,
        )

    try:
        kwargs = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_payload},
            ],
            "temperature": float(temperature),
            "max_tokens": int(max_tokens),
        }
        if grammar_schema:
            # Only constrain the output when a schema was actually supplied.
            kwargs["response_format"] = {
                "type": "json_object",
                "schema": grammar_schema,
            }
        response = global_llm.create_chat_completion(**kwargs)
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        # Chain explicitly so the underlying failure stays in the traceback.
        raise RuntimeError(f"Llama engine crash: {e}") from e
# --- MANUAL TEST BENCH INTERFACES ---
def ui_test_inference(api_key, system_prompt, user_payload, temperature, grammar_schema=None):
    """Gradio handler to manually test the GPU model, verifying the secret key entered on the screen.

    Args:
        api_key: Token supplied by the caller. It is checked against the
            ``LLM_API_KEY`` environment variable; when that variable is unset
            or empty no check is performed.
        system_prompt: System message forwarded to the inference runner.
        user_payload: User message forwarded to the inference runner.
        temperature: Sampling temperature forwarded to the inference runner.
        grammar_schema: Optional schema, either a JSON string or an already
            parsed object. An unparsable string is ignored (logged) and the
            request continues without a schema.

    Returns:
        The model output on success, otherwise a human-readable string
        starting with "❌" describing the authorization or execution error.
    """
    import hmac  # local import keeps this handler self-contained

    expected_token = os.environ.get("LLM_API_KEY")
    if expected_token:
        supplied = api_key if isinstance(api_key, str) else ""
        # Constant-time comparison avoids leaking how much of the key matched.
        # Compare bytes: compare_digest rejects non-ASCII str arguments, and
        # "surrogatepass" keeps odd code points from raising during encoding.
        keys_match = hmac.compare_digest(
            supplied.encode("utf-8", "surrogatepass"),
            expected_token.encode("utf-8", "surrogatepass"),
        )
        if not keys_match:
            return "❌ Error: Unauthorized. The LLM_API_KEY token you entered is invalid!"

    parsed_schema = None
    if grammar_schema:
        if isinstance(grammar_schema, str):
            try:
                parsed_schema = json.loads(grammar_schema)
            except (ValueError, RecursionError) as exc:
                # Best effort: keep going without a schema, but leave a trace
                # instead of silently discarding the caller's input.
                print(f"Ignoring unparsable grammar_schema: {exc}", flush=True)
        else:
            parsed_schema = grammar_schema

    max_tokens = 200  # fixed completion budget for this handler
    try:
        return gpu_inference_runner(system_prompt, user_payload, temperature, max_tokens, parsed_schema)
    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        return f"❌ Execution Error: {e}"
# Define the local UI elements
with gr.Blocks() as demo:
    # Page header
    gr.Markdown("# 🚀 DOD UNO - Dedicated GPU Inference Node")
    gr.Markdown("Secure, hardware-accelerated serverless API endpoint backing DOD UNO Game Server.")

    with gr.Tab("🔧 API Test Bench"):
        gr.Markdown("### Validate the GPU Model manually by entering the secret API key:")

        # Invisible field, wired as the handler's last input (grammar schema).
        grammar_input = gr.Textbox(visible=False, value="")

        with gr.Row():
            api_key_input = gr.Textbox(label="LLM_API_KEY (Token)", type="password", placeholder="Paste your secret handshake key here...")

        with gr.Row():
            sys_prompt_input = gr.Textbox(label="System Prompt", value=SANDBOX_SYS_PROMPT, lines=4)
            user_payload_input = gr.Textbox(label="User Payload (JSON / Text)", value=SANDBOX_USER_PAYLOAD, lines=4)

        with gr.Row():
            temp_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature")

        # NOTE(review): the original nesting of the button/output relative to
        # the slider row is not recoverable from this view - confirm layout.
        test_btn = gr.Button("⚡ Run GPU Inference", variant="primary")
        output_box = gr.Textbox(label="Inference Result (JSON Output)", lines=6, placeholder="Result will appear here...")

        # The same handler is exposed to API clients as "generate_inference".
        test_btn.click(
            fn=ui_test_inference,
            inputs=[api_key_input, sys_prompt_input, user_payload_input, temp_slider, grammar_input],
            outputs=[output_box],
            api_name="generate_inference",
        )

# Launch instance
demo.launch(server_name="0.0.0.0", server_port=PORT)