"""DOD UNO GPU inference node.

Downloads a GGUF model from the Hugging Face Hub at import time, lazily loads
it with llama-cpp-python on the first inference call, and exposes a small
Gradio test bench (also reachable through the ``generate_inference`` API
endpoint) that forwards prompts to the model.
"""

# Dynamic import & mock system for the Hugging Face 'spaces' package.
# Kept ahead of every other import, exactly as in the original file.
try:
    import spaces

    has_spaces = True
except ImportError:
    has_spaces = False

    class spaces:
        """No-op stand-in used when the real 'spaces' package is missing."""

        @staticmethod
        def GPU(duration=None):
            """Return a decorator that hands the function back unchanged."""

            def decorator(f):
                return f

            return decorator


import hmac
import json
import multiprocessing
import os

# Load .env locally if present (must run before the environment is read below).
from dotenv import load_dotenv

load_dotenv()

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download


def download_nemotron_gguf():
    """Download the GGUF model into ./models and return its local file path.

    The target directory is created if it does not exist yet.
    """
    repo_id = "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF"
    filename = "NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf"
    local_dir = "./models"
    os.makedirs(local_dir, exist_ok=True)
    # NOTE(review): `local_dir_use_symlinks` is reportedly deprecated in newer
    # huggingface_hub releases - confirm against the pinned version before
    # dropping it; it is kept here to preserve the original call.
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        local_dir=local_dir,
        local_dir_use_symlinks=False,
    )


# Download GGUF model on startup from the Hugging Face Hub.
MODEL_PATH = download_nemotron_gguf()


def _local_cpu_threads():
    """Return half of the detected CPU cores (at least 1) for local runs.

    multiprocessing.cpu_count() raises NotImplementedError instead of
    returning a falsy value when the count cannot be determined, so the
    fallback of 4 cores has to be applied in an except clause.
    """
    try:
        cores = multiprocessing.cpu_count()
    except NotImplementedError:
        cores = 4
    return max(1, cores // 2)


# Bind address for the Gradio server; defined for every environment so the
# launch call at the bottom of the file can rely on it.
IP_ADDRESS = "0.0.0.0"

# Environment detection: SPACE_ID being set selects the hosted configuration.
if os.environ.get("SPACE_ID"):
    GPU_LAYERS = -1
    PORT = 7860
    CPU_THREADS = 2
else:
    # Locally, GPU offload is only requested on Windows (os.name == "nt").
    GPU_LAYERS = -1 if os.name == "nt" else 0
    PORT = 7880
    CPU_THREADS = _local_cpu_threads()

# Lazily created Llama instance, shared by every inference call.
global_llm = None

# Token budget used by the manual test bench handler.
TEST_BENCH_MAX_TOKENS = 200

# Default sandbox prompt templates.
# The first fragment intentionally ends with "game. " followed by a newline.
SANDBOX_SYS_PROMPT = (
    'You are "DOD-UNO-BOT", an AI game agent playing a software engineering themed UNO game. \n'
    "Analyze the active card, hand, and server metrics (Resolution and Panic) to decide your next strategic move."
)
SANDBOX_USER_PAYLOAD = '{"active_card": {"stack": "red"}, "metrics": {"resolution": 40, "panic": 20}, "hand": [{"index": 0, "stack": "red", "playable": true}]}'


# --- SECURE GPU RUNNER METHOD ---
@spaces.GPU(duration=60)
def gpu_inference_runner(system_prompt, user_payload, temperature, max_tokens, grammar_schema=None):
    """Run one chat completion and return the generated message content.

    The model is loaded on the first call and cached in ``global_llm``.

    Args:
        system_prompt: Content of the system message.
        user_payload: Content of the user message.
        temperature: Sampling temperature; converted with ``float()``.
        max_tokens: Generation limit; converted with ``int()``.
        grammar_schema: Optional schema passed to llama-cpp as the
            ``schema`` of a ``json_object`` response format.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        RuntimeError: If building the request or the completion call fails;
            the original exception is chained as the cause. Errors raised
            while loading the model are not wrapped.
    """
    global global_llm
    if global_llm is None:
        print(f"Loading DOD LLM Engine: {MODEL_PATH} (GPU Layers: {GPU_LAYERS}, Threads: {CPU_THREADS})", flush=True)
        global_llm = Llama(
            model_path=MODEL_PATH,
            n_gpu_layers=GPU_LAYERS,
            verbose=True,
            # Context and batch tuning.
            n_ctx=3072,
            n_batch=512,
            # Thread counts follow the environment detection above.
            n_threads=CPU_THREADS,
            n_threads_batch=CPU_THREADS,
            # Memory safeguards. The original author notes that use_mmap must
            # stay True on cloud filesystems to avoid heavy disk I/O.
            use_mlock=False,
            use_mmap=True,
            flash_attn=True,
            # KV cache quantization: per the original author's note, 8 is
            # GGML_TYPE_Q8_0 (8-bit quantization for the key/value cache).
            type_k=8,
            type_v=8,
        )

    try:
        kwargs = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_payload},
            ],
            "temperature": float(temperature),
            "max_tokens": int(max_tokens),
        }
        if grammar_schema:
            kwargs["response_format"] = {
                "type": "json_object",
                "schema": grammar_schema,
            }
        response = global_llm.create_chat_completion(**kwargs)
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        # Chain the cause so the underlying traceback is not lost.
        raise RuntimeError(f"Llama engine crash: {str(e)}") from e


def _is_authorized(api_key):
    """Return True when api_key matches LLM_API_KEY, or when no key is set."""
    expected_token = os.environ.get("LLM_API_KEY")
    if not expected_token:
        # No token configured: the check is disabled, as in the original.
        return True
    if not isinstance(api_key, str):
        return False
    # Constant-time comparison; compare bytes because compare_digest rejects
    # non-ASCII str, and use surrogatepass so odd code points cannot raise.
    return hmac.compare_digest(
        api_key.encode("utf-8", "surrogatepass"),
        expected_token.encode("utf-8", "surrogatepass"),
    )


def _parse_grammar_schema(grammar_schema):
    """Return the schema as a parsed object, or None when absent or invalid.

    Strings are decoded as JSON; any other truthy value is passed through.
    Invalid JSON is logged and ignored so inference still runs unconstrained.
    """
    if not grammar_schema:
        return None
    if not isinstance(grammar_schema, str):
        return grammar_schema
    try:
        return json.loads(grammar_schema)
    except (ValueError, RecursionError) as exc:
        print(f"Ignoring invalid grammar schema: {exc}", flush=True)
        return None


# --- MANUAL TEST BENCH INTERFACES ---
def ui_test_inference(api_key, system_prompt, user_payload, temperature, grammar_schema=None):
    """Gradio handler to manually test the GPU model.

    Verifies the secret key entered on the screen, then runs one inference.

    Args:
        api_key: Token typed by the user; checked against LLM_API_KEY.
        system_prompt: Content of the system message.
        user_payload: Content of the user message.
        temperature: Sampling temperature.
        grammar_schema: Optional JSON schema, as a JSON string or a parsed
            object.

    Returns:
        The model output, or a human-readable error string; this handler
        reports failures as text instead of raising.
    """
    if not _is_authorized(api_key):
        return "❌ Error: Unauthorized. The LLM_API_KEY token you entered is invalid!"

    parsed_schema = _parse_grammar_schema(grammar_schema)

    try:
        return gpu_inference_runner(
            system_prompt, user_payload, temperature, TEST_BENCH_MAX_TOKENS, parsed_schema
        )
    except Exception as e:
        return f"❌ Execution Error: {str(e)}"


# Define the local UI elements.
with gr.Blocks() as demo:
    gr.Markdown("# 🚀 DOD UNO - Dedicated GPU Inference Node")
    gr.Markdown("Secure, hardware-accelerated serverless API endpoint backing DOD UNO Game Server.")

    with gr.Tab("🔧 API Test Bench"):
        gr.Markdown("### Validate the GPU Model manually by entering the secret API key:")

        # Hidden field: lets API callers supply a schema without showing it.
        grammar_input = gr.Textbox(visible=False, value="")

        with gr.Row():
            api_key_input = gr.Textbox(
                label="LLM_API_KEY (Token)",
                type="password",
                placeholder="Paste your secret handshake key here...",
            )

        with gr.Row():
            sys_prompt_input = gr.Textbox(
                label="System Prompt",
                value=SANDBOX_SYS_PROMPT,
                lines=4,
            )
            user_payload_input = gr.Textbox(
                label="User Payload (JSON / Text)",
                value=SANDBOX_USER_PAYLOAD,
                lines=4,
            )

        with gr.Row():
            temp_slider = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.1,
                step=0.1,
                label="Temperature",
            )

        test_btn = gr.Button("⚡ Run GPU Inference", variant="primary")
        output_box = gr.Textbox(
            label="Inference Result (JSON Output)",
            lines=6,
            placeholder="Result will appear here...",
        )

        test_btn.click(
            fn=ui_test_inference,
            inputs=[api_key_input, sys_prompt_input, user_payload_input, temp_slider, grammar_input],
            outputs=[output_box],
            api_name="generate_inference",
        )

# Launch instance.
demo.launch(server_name=IP_ADDRESS, server_port=PORT)