Spaces:
Running on Zero
Running on Zero
# Dynamic import & mock system for the Hugging Face 'spaces' package.
# When the real package cannot be imported (e.g. a local run), a no-op
# stand-in exposing the same `spaces.GPU` entry point is defined instead.
try:
    import spaces
    has_spaces = True
except ImportError:
    has_spaces = False

    class spaces:
        """Local stand-in whose ``GPU`` decorator leaves functions untouched."""

        @staticmethod
        def GPU(duration=None, **_options):
            """Pass-through replacement for the ``spaces.GPU`` decorator.

            Works both bare (``@spaces.GPU``) and parameterized
            (``@spaces.GPU(duration=60)``); any options are ignored.
            """
            # Bare usage: the decorated function itself arrives in the first
            # positional slot, so hand it straight back.
            if callable(duration):
                return duration

            def decorator(f):
                return f

            return decorator
| import os | |
| import json | |
| import multiprocessing | |
| # Load .env locally if present | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
| import gradio as gr | |
| from llama_cpp import Llama | |
| from huggingface_hub import hf_hub_download | |
# Fetch the GGUF model from the Hugging Face Hub at startup
def download_nemotron_gguf():
    """Download the quantized Nemotron GGUF file into ``./models``.

    Creates the target directory if needed and returns whatever
    ``hf_hub_download`` returns (the file stores it as the model path).
    """
    target_dir = "./models"
    os.makedirs(target_dir, exist_ok=True)

    request = {
        "repo_id": "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF",
        "filename": "NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf",
        "local_dir": target_dir,
        "local_dir_use_symlinks": False,
    }
    return hf_hub_download(**request)
# Resolved once at import time; the model is downloaded on startup.
MODEL_PATH = download_nemotron_gguf()

# Smart environment detection for GPU layer offloading.
# The bind address is identical in both environments, so it is defined
# unconditionally instead of only on the local branch.
IP_ADDRESS = "0.0.0.0"
if os.environ.get("SPACE_ID"):
    # SPACE_ID is set: treat the process as running inside a hosted Space.
    GPU_LAYERS = -1
    PORT = 7860
    CPU_THREADS = 2
else:
    GPU_LAYERS = -1 if os.name == "nt" else 0
    PORT = 7880
    # Local Windows/Linux uses half of the available CPU cores.
    # os.cpu_count() returns None when the count is unknown, which makes the
    # `or 4` fallback reachable (multiprocessing.cpu_count() raises instead).
    CPU_THREADS = max(1, (os.cpu_count() or 4) // 2)

# Lazily created Llama instance shared by all inference calls.
global_llm = None

# Default sandbox prompt templates
SANDBOX_SYS_PROMPT = """You are "DOD-UNO-BOT", an AI game agent playing a software engineering themed UNO game.
Analyze the active card, hand, and server metrics (Resolution and Panic) to decide your next strategic move."""
SANDBOX_USER_PAYLOAD = '{"active_card": {"stack": "red"}, "metrics": {"resolution": 40, "panic": 20}, "hand": [{"index": 0, "stack": "red", "playable": true}]}'
# --- SECURE GPU RUNNER METHOD ---
def gpu_inference_runner(system_prompt, user_payload, temperature, max_tokens, grammar_schema=None):
    """Run one chat completion on the lazily loaded, module-wide Llama engine.

    Args:
        system_prompt: Text sent as the ``system`` message.
        user_payload: Text sent as the ``user`` message.
        temperature: Sampling temperature; coerced with ``float()``.
        max_tokens: Completion token limit; coerced with ``int()``.
        grammar_schema: Optional schema. When truthy it is forwarded as a
            ``json_object`` ``response_format`` to constrain the output.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        RuntimeError: If building or running the completion fails. The
            original exception is attached as ``__cause__``.
    """
    global global_llm
    if global_llm is None:
        # First call: build the engine once and keep it for later requests.
        print(f"Loading DOD LLM Engine: {MODEL_PATH} (GPU Layers: {GPU_LAYERS}, Threads: {CPU_THREADS})", flush=True)
        global_llm = Llama(
            model_path=MODEL_PATH,
            n_gpu_layers=GPU_LAYERS,
            verbose=True,
            # Context and batch tuning
            n_ctx=3072,
            n_batch=512,
            # Thread mapping (derived from the detected environment)
            n_threads=CPU_THREADS,
            n_threads_batch=CPU_THREADS,
            # Memory safeguards
            use_mlock=False,
            use_mmap=True,  # Must stay True on cloud filesystems to avoid heavy disk I/O
            flash_attn=True,
            # KV cache quantization: 8 represents GGML_TYPE_Q8_0 (8-bit)
            type_k=8,
            type_v=8,
        )
    try:
        kwargs = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_payload}
            ],
            "temperature": float(temperature),
            "max_tokens": int(max_tokens)
        }
        if grammar_schema:
            kwargs["response_format"] = {
                "type": "json_object",
                "schema": grammar_schema
            }
        response = global_llm.create_chat_completion(**kwargs)
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        # Chain explicitly so the underlying failure stays in the traceback.
        raise RuntimeError(f"Llama engine crash: {str(e)}") from e
# --- MANUAL TEST BENCH INTERFACES ---
def ui_test_inference(api_key, system_prompt, user_payload, temperature, grammar_schema=None):
    """Gradio handler to manually test the GPU model, verifying the secret key entered on the screen.

    Args:
        api_key: Token typed by the caller. It is only checked when the
            ``LLM_API_KEY`` environment variable is set to a non-empty value.
        system_prompt: System message forwarded to the inference runner.
        user_payload: User message forwarded to the inference runner.
        temperature: Sampling temperature forwarded to the inference runner.
        grammar_schema: Optional schema, either a JSON string or an already
            parsed object. A string that is not valid JSON is ignored and the
            request runs unconstrained.

    Returns:
        The model output, or a human-readable error string starting with
        the cross-mark emoji. This handler never raises for runner failures.
    """
    import hmac  # local import: only this handler needs it

    expected_token = os.environ.get("LLM_API_KEY")
    if expected_token:
        supplied = api_key if isinstance(api_key, str) else ""
        # Compare as bytes in constant time: avoids leaking match length via
        # timing and accepts non-ASCII input (str mode would raise on it).
        if not hmac.compare_digest(
            supplied.encode("utf-8", "surrogatepass"),
            expected_token.encode("utf-8", "surrogatepass"),
        ):
            return "❌ Error: Unauthorized. The LLM_API_KEY token you entered is invalid!"

    parsed_schema = None
    if grammar_schema:
        if isinstance(grammar_schema, str):
            try:
                parsed_schema = json.loads(grammar_schema)
            except (ValueError, RecursionError) as e:
                # Best-effort by design: continue without a schema, but say so
                # instead of swallowing the problem silently.
                print(f"Ignoring grammar_schema: not valid JSON ({e})", flush=True)
        else:
            parsed_schema = grammar_schema

    try:
        # The test bench always caps the completion at 200 tokens.
        return gpu_inference_runner(system_prompt, user_payload, temperature, 200, parsed_schema)
    except Exception as e:
        return f"❌ Execution Error: {str(e)}"
# Build the Gradio test-bench UI.
# NOTE(review): container nesting was reconstructed from a listing whose
# indentation was lost — confirm the button/output placement inside the tab.
with gr.Blocks() as demo:
    gr.Markdown("# 🚀 DOD UNO - Dedicated GPU Inference Node")
    gr.Markdown("Secure, hardware-accelerated serverless API endpoint backing DOD UNO Game Server.")

    with gr.Tab("🔧 API Test Bench"):
        gr.Markdown("### Validate the GPU Model manually by entering the secret API key:")

        # Hidden field: supplies the handler's optional grammar_schema argument.
        grammar_input = gr.Textbox(value="", visible=False)

        with gr.Row():
            api_key_input = gr.Textbox(
                type="password",
                label="LLM_API_KEY (Token)",
                placeholder="Paste your secret handshake key here...",
            )

        with gr.Row():
            sys_prompt_input = gr.Textbox(label="System Prompt", lines=4, value=SANDBOX_SYS_PROMPT)
            user_payload_input = gr.Textbox(label="User Payload (JSON / Text)", lines=4, value=SANDBOX_USER_PAYLOAD)

        with gr.Row():
            temp_slider = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, step=0.1, value=0.1)

        test_btn = gr.Button("⚡ Run GPU Inference", variant="primary")
        output_box = gr.Textbox(
            label="Inference Result (JSON Output)",
            placeholder="Result will appear here...",
            lines=6,
        )

        # Input order mirrors ui_test_inference's positional parameters.
        test_btn.click(
            api_name="generate_inference",
            fn=ui_test_inference,
            inputs=[api_key_input, sys_prompt_input, user_payload_input, temp_slider, grammar_input],
            outputs=[output_box],
        )

# Start the server on all interfaces at the environment-specific port.
demo.launch(server_port=PORT, server_name="0.0.0.0")