Spaces:

build-small-hackathon
/

dod-llm-server

Running on Zero

File size: 6,580 Bytes

# Dynamic import & Mock system for Hugging Face 'spaces' package
try:
    import spaces
    has_spaces = True
except ImportError:
    has_spaces = False

    class spaces:
        """No-op stand-in used when the Hugging Face ``spaces`` package is missing.

        Only the ``GPU`` decorator is provided; it hands the decorated
        function back unchanged so the rest of the module runs the same way
        without the package installed.
        """

        @staticmethod
        def GPU(func=None, duration=None, **_options):
            """Mimic ``spaces.GPU`` without doing anything.

            Works in both decorator forms:

            * ``@spaces.GPU`` (bare): ``func`` is the decorated function
              and is returned as-is.
            * ``@spaces.GPU(duration=60)`` (called): returns a decorator
              that returns its argument unchanged.

            Args:
                func: The function when used bare. Any non-callable value
                    (e.g. a positional duration) is ignored.
                duration: Accepted for signature compatibility; unused.
                **_options: Any other keyword options; accepted and ignored.
            """
            def decorator(f):
                return f

            # Bare usage passes the target function as the first argument;
            # returning the inner decorator here would replace the function.
            if callable(func):
                return func
            return decorator
import os
import json
import multiprocessing

# Load .env locally if present
from dotenv import load_dotenv
load_dotenv()

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download GGUF Model on startup from Hugging Face Hub
def download_nemotron_gguf():
    """Download the Nemotron Q4_K_M GGUF file from the Hugging Face Hub.

    The ``./models`` directory is created when missing and used as the
    download target.

    Returns:
        Whatever ``hf_hub_download`` returns for the requested file
        (the local path of the downloaded model).
    """
    target_dir = "./models"
    os.makedirs(target_dir, exist_ok=True)

    # NOTE(review): `local_dir_use_symlinks` is reported as deprecated in
    # recent huggingface_hub releases; confirm against the pinned version.
    download_args = {
        "repo_id": "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF",
        "filename": "NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf",
        "local_dir": target_dir,
        "local_dir_use_symlinks": False,
    }
    return hf_hub_download(**download_args)

# Resolved once at import time: the download call runs before the UI is built,
# and the resulting path is what the Llama engine later loads.
MODEL_PATH = download_nemotron_gguf()

# Smart Environment Detection for GPU layers offloading
#
# GPU_LAYERS is forwarded to Llama as `n_gpu_layers`, CPU_THREADS as
# `n_threads` / `n_threads_batch`, and PORT is the port the Gradio server
# listens on.

# Bind address; defined unconditionally so the name exists in every
# environment (it used to be set only on the local branch).
IP_ADDRESS = "0.0.0.0"

if os.environ.get("SPACE_ID"):
    # SPACE_ID present: treated as running inside a Hugging Face Space.
    GPU_LAYERS = -1
    PORT = 7860
    CPU_THREADS = 2
else:
    # Local run: GPU offload is enabled only on Windows (os.name == "nt");
    # every other platform stays on CPU with 0 offloaded layers.
    GPU_LAYERS = -1 if os.name == "nt" else 0
    PORT = 7880
    # Local Windows/Linux uses half of available CPU cores (at least one)
    CPU_THREADS = max(1, (multiprocessing.cpu_count() or 4) // 2)

# Lazily-created Llama instance shared by all inference calls.
global_llm = None

# Default contents for the test-bench "System Prompt" and "User Payload" boxes.
SANDBOX_SYS_PROMPT = (
    'You are "DOD-UNO-BOT", an AI game agent playing a software engineering themed UNO game.\n'
    "Analyze the active card, hand, and server metrics (Resolution and Panic) to decide your next strategic move."
)

# Sample game state, kept as a JSON string because it is shown and sent as text.
SANDBOX_USER_PAYLOAD = (
    '{"active_card": {"stack": "red"}, '
    '"metrics": {"resolution": 40, "panic": 20}, '
    '"hand": [{"index": 0, "stack": "red", "playable": true}]}'
)

# --- SECURE GPU RUNNER METHOD ---
def _get_llm():
    """Return the shared Llama engine, constructing it on first use."""
    global global_llm

    if global_llm is None:
        print(f"Loading DOD LLM Engine: {MODEL_PATH} (GPU Layers: {GPU_LAYERS}, Threads: {CPU_THREADS})", flush=True)
        global_llm = Llama(
            model_path=MODEL_PATH,
            n_gpu_layers=GPU_LAYERS,
            verbose=True,

            # Context and batch tuning
            n_ctx=3072,
            n_batch=512,

            # Thread counts follow the environment detection done at import time
            n_threads=CPU_THREADS,
            n_threads_batch=CPU_THREADS,

            # Memory settings: no page locking, memory-mapped model file
            use_mlock=False,
            use_mmap=True,
            flash_attn=True,

            # KV cache quantization: 8 represents GGML_TYPE_Q8_0 (8-bit)
            type_k=8,
            type_v=8,
        )
    return global_llm


@spaces.GPU(duration=60)
def gpu_inference_runner(system_prompt, user_payload, temperature, max_tokens, grammar_schema=None):
    """Run one chat completion on the shared Llama engine.

    The engine is created lazily on the first call and reused afterwards.

    Args:
        system_prompt: Text sent as the ``system`` message.
        user_payload: Text sent as the ``user`` message.
        temperature: Sampling temperature; converted with ``float()``.
        max_tokens: Generation limit; converted with ``int()``.
        grammar_schema: Optional schema. When truthy, the request asks for a
            ``json_object`` response constrained by this schema.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        RuntimeError: If building the request or running the completion
            fails. The original exception is attached as ``__cause__``.
    """
    llm = _get_llm()

    try:
        kwargs = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_payload},
            ],
            "temperature": float(temperature),
            "max_tokens": int(max_tokens),
        }
        if grammar_schema:
            kwargs["response_format"] = {
                "type": "json_object",
                "schema": grammar_schema,
            }

        response = llm.create_chat_completion(**kwargs)
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        # Chain explicitly so the underlying failure stays visible in tracebacks.
        raise RuntimeError(f"Llama engine crash: {e}") from e

# --- MANUAL TEST BENCH INTERFACES ---
def ui_test_inference(api_key, system_prompt, user_payload, temperature, grammar_schema=None):
    """Gradio handler: check the supplied API key, then run one inference.

    Args:
        api_key: Token entered by the caller. Compared against the
            ``LLM_API_KEY`` environment variable; when that variable is unset
            or empty the check is skipped.
        system_prompt: System message forwarded to the model.
        user_payload: User message forwarded to the model.
        temperature: Sampling temperature forwarded to the model.
        grammar_schema: Optional JSON schema, either as a JSON string or an
            already-parsed object. A string that fails to parse is ignored
            (with a logged warning) and the request runs unconstrained.

    Returns:
        The model output, or a human-readable error string beginning with
        "❌" when authorization or execution fails. This handler does not
        raise for those cases.
    """
    import hmac  # local import so this handler carries its own dependency

    expected_token = os.environ.get("LLM_API_KEY")
    if expected_token:
        # Constant-time comparison on bytes: avoids leaking how much of the
        # token matched, and bytes are required for non-ASCII input.
        supplied = api_key if isinstance(api_key, str) else ""
        if not hmac.compare_digest(supplied.encode("utf-8"), expected_token.encode("utf-8")):
            return "❌ Error: Unauthorized. The LLM_API_KEY token you entered is invalid!"

    parsed_schema = None
    if grammar_schema:
        if isinstance(grammar_schema, str):
            try:
                parsed_schema = json.loads(grammar_schema)
            except (ValueError, RecursionError) as e:
                # Best effort: keep serving the request, but say why the
                # schema was dropped instead of failing silently.
                print(f"Ignoring invalid grammar_schema JSON: {e}", flush=True)
        else:
            parsed_schema = grammar_schema

    try:
        return gpu_inference_runner(system_prompt, user_payload, temperature, 200, parsed_schema)
    except Exception as e:
        return f"❌ Execution Error: {str(e)}"

# Define the local UI elements
with gr.Blocks() as demo:
    # Page header
    gr.Markdown("# 🚀 DOD UNO - Dedicated GPU Inference Node")
    gr.Markdown("Secure, hardware-accelerated serverless API endpoint backing DOD UNO Game Server.")

    with gr.Tab("🔧 API Test Bench"):
        gr.Markdown("### Validate the GPU Model manually by entering the secret API key:")

        # Invisible field: it has no on-screen control but is still wired in
        # as the handler's fifth input (grammar_schema).
        grammar_input = gr.Textbox(visible=False, value="")

        # Row 1: secret token
        with gr.Row():
            api_key_input = gr.Textbox(label="LLM_API_KEY (Token)", type="password", placeholder="Paste your secret handshake key here...")

        # Row 2: prompts, pre-filled with the sandbox defaults
        with gr.Row():
            sys_prompt_input = gr.Textbox(label="System Prompt", value=SANDBOX_SYS_PROMPT, lines=4)
            user_payload_input = gr.Textbox(label="User Payload (JSON / Text)", value=SANDBOX_USER_PAYLOAD, lines=4)

        # Row 3: sampling control and trigger
        with gr.Row():
            temp_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature")
            test_btn = gr.Button("⚡ Run GPU Inference", variant="primary")

        output_box = gr.Textbox(label="Inference Result (JSON Output)", lines=6, placeholder="Result will appear here...")

        # Input order must match the positional parameters of ui_test_inference.
        handler_inputs = [api_key_input, sys_prompt_input, user_payload_input, temp_slider, grammar_input]
        test_btn.click(
            fn=ui_test_inference,
            inputs=handler_inputs,
            outputs=[output_box],
            api_name="generate_inference",
        )

# Start the server on all interfaces, on the port chosen during environment detection
demo.launch(server_name="0.0.0.0", server_port=PORT)