Spaces:
Running on Zero
Running on Zero
File size: 6,580 Bytes
# Dynamic import & mock system for the Hugging Face 'spaces' package
try:
    import spaces
    has_spaces = True
except ImportError:
    has_spaces = False

    class spaces:
        """No-op stand-in used when the real ``spaces`` package is not installed.

        Only the ``GPU`` decorator is provided; it returns the decorated
        function unchanged so the rest of the module can use ``@spaces.GPU``
        regardless of where it runs.
        """

        @staticmethod
        def GPU(*args, **kwargs):
            """Return the decorated function untouched.

            Supports both decorator spellings:

            * ``@spaces.GPU`` (bare) -- the function arrives as the first
              positional argument and is returned directly.
            * ``@spaces.GPU(duration=60)`` / ``@spaces.GPU(60)`` /
              ``@spaces.GPU()`` -- any options are accepted and ignored, and a
              pass-through decorator is returned.

            A callable passed by keyword (e.g. ``duration=some_function``) is
            treated as an option, not as the function being decorated.
            """
            # Bare usage: the only positional argument is the function itself.
            if args and callable(args[0]):
                return args[0]

            def decorator(f):
                return f

            return decorator
import os
import json
import multiprocessing
# Load .env locally if present
from dotenv import load_dotenv
load_dotenv()
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Download GGUF Model on startup from Hugging Face Hub
def download_nemotron_gguf(
    repo_id="nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF",
    filename="NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf",
    local_dir="./models",
):
    """Fetch a GGUF model file from the Hugging Face Hub into ``local_dir``.

    Args:
        repo_id: Hub repository that hosts the model file.
        filename: Name of the file inside that repository.
        local_dir: Directory to download into; created if it does not exist.

    Returns:
        Whatever ``hf_hub_download`` returns for the requested file (used
        below as the model path handed to ``Llama``).
    """
    os.makedirs(local_dir, exist_ok=True)
    # NOTE(review): `local_dir_use_symlinks` is kept exactly as before; newer
    # huggingface_hub releases may deprecate it -- confirm against the pinned version.
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        local_dir=local_dir,
        local_dir_use_symlinks=False,
    )


# Resolved once at import time, so the download runs before anything else in the module.
MODEL_PATH = download_nemotron_gguf()
# Environment detection: pick GPU offloading, port and thread count.
# SPACE_ID being set is used as the signal that we run inside a Hugging Face Space.
if os.environ.get("SPACE_ID"):
    GPU_LAYERS = -1
    PORT = 7860
    CPU_THREADS = 2
else:
    # Locally, GPU offloading is only enabled on Windows ("nt"); elsewhere run on CPU.
    GPU_LAYERS = -1 if os.name == "nt" else 0
    PORT = 7880
    # Local Windows/Linux uses half of the available CPU cores.
    # os.cpu_count() returns None when the count is unknown, so the `or 4`
    # fallback is actually reachable (multiprocessing.cpu_count() raises instead).
    CPU_THREADS = max(1, (os.cpu_count() or 4) // 2)

# Bind address; defined for both environments so it never depends on the branch taken.
IP_ADDRESS = "0.0.0.0"

# Lazily created Llama instance shared by all inference calls.
global_llm = None
# Default sandbox prompt templates
# System prompt pre-filled in the test bench (two lines joined by a newline).
SANDBOX_SYS_PROMPT = (
    'You are "DOD-UNO-BOT", an AI game agent playing a software engineering themed UNO game.\n'
    "Analyze the active card, hand, and server metrics (Resolution and Panic) to decide your next strategic move."
)

# Example user payload pre-filled in the test bench; a JSON document kept as a plain string.
SANDBOX_USER_PAYLOAD = (
    '{"active_card": {"stack": "red"}, '
    '"metrics": {"resolution": 40, "panic": 20}, '
    '"hand": [{"index": 0, "stack": "red", "playable": true}]}'
)
# --- SECURE GPU RUNNER METHOD ---
def _get_llm():
    """Return the shared ``Llama`` instance, loading the model on first use.

    The instance is cached in the module-level ``global_llm`` so that later
    calls reuse it instead of loading the model again.
    """
    global global_llm
    if global_llm is None:
        print(f"Loading DOD LLM Engine: {MODEL_PATH} (GPU Layers: {GPU_LAYERS}, Threads: {CPU_THREADS})", flush=True)
        global_llm = Llama(
            model_path=MODEL_PATH,
            n_gpu_layers=GPU_LAYERS,
            verbose=True,
            # Context and batch tuning.
            n_ctx=3072,
            n_batch=512,
            # Thread counts follow the environment detection done at module level.
            n_threads=CPU_THREADS,
            n_threads_batch=CPU_THREADS,
            # Memory settings. The original author notes mmap must stay enabled
            # on cloud filesystems to avoid heavy disk I/O.
            use_mlock=False,
            use_mmap=True,
            flash_attn=True,
            # KV cache quantization: per the original comment, 8 is
            # GGML_TYPE_Q8_0 (8-bit) for both the key and the value cache.
            type_k=8,
            type_v=8,
        )
    return global_llm


@spaces.GPU(duration=60)
def gpu_inference_runner(system_prompt, user_payload, temperature, max_tokens, grammar_schema=None):
    """Run one chat completion and return the generated text.

    Args:
        system_prompt: Content of the ``system`` message.
        user_payload: Content of the ``user`` message.
        temperature: Sampling temperature; converted with ``float()``.
        max_tokens: Generation cap; converted with ``int()``.
        grammar_schema: Optional schema. When truthy it is sent as
            ``response_format={"type": "json_object", "schema": ...}``.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        RuntimeError: If building the request or running the completion fails;
            the original exception is attached as ``__cause__``. Errors raised
            while loading the model are not wrapped.
    """
    llm = _get_llm()
    try:
        kwargs = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_payload},
            ],
            "temperature": float(temperature),
            "max_tokens": int(max_tokens),
        }
        if grammar_schema:
            kwargs["response_format"] = {
                "type": "json_object",
                "schema": grammar_schema,
            }
        response = llm.create_chat_completion(**kwargs)
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        # Chain explicitly so the underlying traceback stays attached.
        raise RuntimeError(f"Llama engine crash: {str(e)}") from e
# --- MANUAL TEST BENCH INTERFACES ---
def ui_test_inference(api_key, system_prompt, user_payload, temperature, grammar_schema=None, max_tokens=200):
    """Gradio handler to manually test the GPU model, verifying the secret key entered on the screen.

    Args:
        api_key: Token typed by the caller; checked against the ``LLM_API_KEY``
            environment variable. If that variable is unset or empty, no key
            check is performed.
        system_prompt: System message forwarded to the model.
        user_payload: User message forwarded to the model.
        temperature: Sampling temperature forwarded to the model.
        grammar_schema: Optional schema, either a JSON string or an already
            parsed object. A string that is not valid JSON is ignored (with a
            log line) and inference runs without a schema.
        max_tokens: Generation cap forwarded to the model (default 200).

    Returns:
        The model output, or a string starting with "❌" describing the failure.
        This handler never raises for inference errors.
    """
    import hmac  # local import: only needed for the constant-time key comparison

    expected_token = os.environ.get("LLM_API_KEY")
    if expected_token:
        # Anything that is not a string can never match the configured token.
        supplied = api_key if isinstance(api_key, str) else ""
        # Compare as bytes: compare_digest rejects non-ASCII str arguments.
        if not hmac.compare_digest(supplied.encode("utf-8"), expected_token.encode("utf-8")):
            return "❌ Error: Unauthorized. The LLM_API_KEY token you entered is invalid!"

    parsed_schema = None
    if grammar_schema:
        if isinstance(grammar_schema, str):
            try:
                parsed_schema = json.loads(grammar_schema)
            except (ValueError, RecursionError) as e:
                # Best effort, as before: fall back to unconstrained output,
                # but leave a trace instead of swallowing the problem silently.
                print(f"Ignoring grammar_schema: not valid JSON ({e})", flush=True)
        else:
            parsed_schema = grammar_schema

    try:
        return gpu_inference_runner(system_prompt, user_payload, temperature, max_tokens, parsed_schema)
    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        return f"❌ Execution Error: {str(e)}"
# Define the local UI elements
# NOTE(review): the nesting below is reconstructed (the original indentation was
# not available); confirm which components belong inside each Row.
with gr.Blocks() as demo:
    # Page header.
    gr.Markdown("# 🚀 DOD UNO - Dedicated GPU Inference Node")
    gr.Markdown("Secure, hardware-accelerated serverless API endpoint backing DOD UNO Game Server.")

    with gr.Tab("🔧 API Test Bench"):
        gr.Markdown("### Validate the GPU Model manually by entering the secret API key:")

        # Invisible textbox wired in as the handler's `grammar_schema` argument.
        grammar_input = gr.Textbox(visible=False, value="")

        # Secret token, masked while typing.
        with gr.Row():
            api_key_input = gr.Textbox(label="LLM_API_KEY (Token)", type="password", placeholder="Paste your secret handshake key here...")

        # Prompt inputs, pre-filled with the sandbox defaults.
        with gr.Row():
            sys_prompt_input = gr.Textbox(label="System Prompt", value=SANDBOX_SYS_PROMPT, lines=4)
            user_payload_input = gr.Textbox(label="User Payload (JSON / Text)", value=SANDBOX_USER_PAYLOAD, lines=4)

        # Sampling temperature.
        with gr.Row():
            temp_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature")

        test_btn = gr.Button("⚡ Run GPU Inference", variant="primary")
        output_box = gr.Textbox(label="Inference Result (JSON Output)", lines=6, placeholder="Result will appear here...")

        # Clicking the button calls the handler; the same event is registered
        # under the API name "generate_inference".
        test_btn.click(
            fn=ui_test_inference,
            inputs=[api_key_input, sys_prompt_input, user_payload_input, temp_slider, grammar_input],
            outputs=[output_box],
            api_name="generate_inference",
        )

# Launch instance: bind on all interfaces, on the port chosen by the environment detection.
demo.launch(server_name="0.0.0.0", server_port=PORT)
|