File size: 6,580 Bytes
b022028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed804aa
b022028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed804aa
b022028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed804aa
b022028
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# Dynamic import & Mock system for Hugging Face 'spaces' package
try:
    import spaces
    has_spaces = True
except ImportError:
    has_spaces = False

    class spaces:
        """Minimal stand-in used when the ``spaces`` package is not installed.

        It only provides ``GPU`` so that functions decorated with
        ``spaces.GPU`` can still be defined and called unchanged.
        """

        @staticmethod
        def GPU(task=None, *, duration=None, **_options):
            """No-op replacement for the ``spaces.GPU`` decorator.

            Supports both decorator spellings:

            * ``@spaces.GPU`` -- the function arrives as ``task`` and is
              returned untouched.
            * ``@spaces.GPU(duration=60)`` -- a pass-through decorator is
              returned.

            ``duration`` and any other keyword options are accepted and
            ignored. A non-callable positional value (e.g. a duration passed
            positionally) is ignored as well.
            """
            if callable(task):
                # Bare-decorator usage: hand the function straight back.
                return task

            def decorator(f):
                return f

            return decorator
import os
import json
import multiprocessing

# Load .env locally if present
from dotenv import load_dotenv
load_dotenv()

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download GGUF Model on startup from Hugging Face Hub
def download_nemotron_gguf():
    """Download the quantized Nemotron GGUF file and return its local path.

    The target directory ``./models`` is created if it does not exist, then
    the file is requested from the Hugging Face Hub via ``hf_hub_download``.

    Returns:
        The path returned by ``hf_hub_download`` for the downloaded file.
    """
    repo_id = "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF"
    filename = "NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf"
    local_dir = "./models"
    os.makedirs(local_dir, exist_ok=True)
    # `local_dir_use_symlinks` is intentionally not passed: huggingface_hub
    # deprecated it and ignores it since 0.23 (downloads into `local_dir` no
    # longer rely on symlinks), so passing it only produces a warning there.
    # NOTE(review): newer releases may reject the argument outright - confirm
    # against the pinned huggingface_hub version.
    return hf_hub_download(repo_id=repo_id, filename=filename, local_dir=local_dir)


# Resolved once at import time; the model loader reads this path later.
MODEL_PATH = download_nemotron_gguf()

# Smart Environment Detection for GPU layers offloading
if os.environ.get("SPACE_ID"):
    # SPACE_ID is set (presumably by the Hugging Face Spaces runtime).
    GPU_LAYERS = -1
    PORT = 7860
    CPU_THREADS = 2
else:
    # Local run: offload to GPU only on Windows ("nt"), otherwise stay on CPU.
    GPU_LAYERS = -1 if os.name == "nt" else 0
    PORT = 7880
    # Local Windows/Linux uses half of available CPU cores.
    # os.cpu_count() returns None when the count cannot be determined, so the
    # `or 4` fallback is reachable (multiprocessing.cpu_count() raises
    # NotImplementedError in that case instead of returning a falsy value).
    CPU_THREADS = max(1, (os.cpu_count() or 4) // 2)

# Bind address; defined for every environment so it is never an undefined name.
IP_ADDRESS = "0.0.0.0"

# Lazily created model instance, shared by all inference calls.
global_llm = None

# Default sandbox prompt templates
# System prompt pre-filled in the test bench: two lines joined by a newline.
SANDBOX_SYS_PROMPT = (
    'You are "DOD-UNO-BOT", an AI game agent playing a software engineering themed UNO game.\n'
    "Analyze the active card, hand, and server metrics (Resolution and Panic) to decide your next strategic move."
)

# Example user message pre-filled in the test bench (a JSON document as text).
SANDBOX_USER_PAYLOAD = (
    '{"active_card": {"stack": "red"}, '
    '"metrics": {"resolution": 40, "panic": 20}, '
    '"hand": [{"index": 0, "stack": "red", "playable": true}]}'
)

# --- SECURE GPU RUNNER METHOD ---
@spaces.GPU(duration=60)
def gpu_inference_runner(system_prompt, user_payload, temperature, max_tokens, grammar_schema=None):
    """Run one chat completion on the shared llama.cpp model.

    The model is created on the first call and stored in the module-level
    ``global_llm`` so that later calls reuse the same instance.

    Args:
        system_prompt: Text sent as the ``system`` message.
        user_payload: Text sent as the ``user`` message.
        temperature: Sampling temperature; converted with ``float()``.
        max_tokens: Generation limit; converted with ``int()``.
        grammar_schema: Optional JSON schema. When truthy it is forwarded as
            ``response_format={"type": "json_object", "schema": ...}``.

    Returns:
        The ``content`` field of the first choice's message.

    Raises:
        RuntimeError: If building the request or generating the completion
            fails; the original exception is chained as ``__cause__``.
            Errors raised while loading the model are not wrapped.
    """
    global global_llm

    if global_llm is None:
        print(f"Loading DOD LLM Engine: {MODEL_PATH} (GPU Layers: {GPU_LAYERS}, Threads: {CPU_THREADS})", flush=True)
        global_llm = Llama(
            model_path=MODEL_PATH,
            n_gpu_layers=GPU_LAYERS,
            verbose=True,

            # Context and batch tuning
            n_ctx=3072,
            n_batch=512,

            # Thread mapping (derived from the detected environment)
            n_threads=CPU_THREADS,
            n_threads_batch=CPU_THREADS,

            # Memory safeguards
            use_mlock=False,
            use_mmap=True,          # FIX: Must be True on cloud filesystems to prevent heavy I/O disk bottlenecks!
            flash_attn=True,

            # KV cache quantization: 8 is GGML_TYPE_Q8_0 (8-bit Key/Value cache)
            type_k=8,
            type_v=8,
        )

    try:
        request = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_payload}
            ],
            "temperature": float(temperature),
            "max_tokens": int(max_tokens)
        }
        if grammar_schema:
            # Constrain the output to JSON matching the supplied schema.
            request["response_format"] = {
                "type": "json_object",
                "schema": grammar_schema
            }

        response = global_llm.create_chat_completion(**request)
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        # Chain explicitly so the underlying traceback stays attached.
        raise RuntimeError(f"Llama engine crash: {str(e)}") from e

# --- MANUAL TEST BENCH INTERFACES ---
def ui_test_inference(api_key, system_prompt, user_payload, temperature, grammar_schema=None):
    """Gradio handler: verify the API key, then run one manual inference.

    Args:
        api_key: Token entered by the caller. It is only checked when the
            ``LLM_API_KEY`` environment variable is set to a non-empty value;
            otherwise every request is accepted.
        system_prompt: Forwarded to ``gpu_inference_runner``.
        user_payload: Forwarded to ``gpu_inference_runner``.
        temperature: Forwarded to ``gpu_inference_runner``.
        grammar_schema: Optional JSON schema, either a JSON string or an
            already-parsed object. An unparsable string is ignored (with a
            logged warning) and inference runs without a schema.

    Returns:
        The model output, or a human-readable error string beginning with
        "❌" when the key is rejected or inference fails. ``max_tokens`` is
        fixed at 200 for this handler.
    """
    import hmac  # function-scope import: only this handler needs it

    expected_token = os.environ.get("LLM_API_KEY")
    if expected_token:
        # Compare in constant time so response timing does not leak how much
        # of the token matched. Compare bytes: compare_digest rejects
        # non-ASCII str, and a non-str api_key can never match.
        supplied = api_key if isinstance(api_key, str) else ""
        if not hmac.compare_digest(
            supplied.encode("utf-8", "surrogatepass"),
            expected_token.encode("utf-8", "surrogatepass"),
        ):
            return "❌ Error: Unauthorized. The LLM_API_KEY token you entered is invalid!"

    parsed_schema = None
    if grammar_schema:
        if isinstance(grammar_schema, str):
            try:
                parsed_schema = json.loads(grammar_schema)
            except (ValueError, RecursionError) as e:
                # Best-effort: fall back to unconstrained output, but leave a
                # trace instead of swallowing the problem silently.
                print(f"Ignoring invalid grammar_schema JSON: {e}", flush=True)
        else:
            parsed_schema = grammar_schema

    try:
        return gpu_inference_runner(system_prompt, user_payload, temperature, 200, parsed_schema)
    except Exception as e:
        return f"❌ Execution Error: {str(e)}"

# Define the local UI elements
with gr.Blocks() as demo:
    # Page header
    gr.Markdown("# 🚀 DOD UNO - Dedicated GPU Inference Node")
    gr.Markdown("Secure, hardware-accelerated serverless API endpoint backing DOD UNO Game Server.")

    with gr.Tab("🔧 API Test Bench"):
        gr.Markdown("### Validate the GPU Model manually by entering the secret API key:")

        # Invisible field that carries the optional grammar schema to the handler.
        grammar_input = gr.Textbox(value="", visible=False)

        # Row 1: secret token
        with gr.Row():
            api_key_input = gr.Textbox(label="LLM_API_KEY (Token)", type="password", placeholder="Paste your secret handshake key here...")

        # Row 2: prompts, pre-filled with the sandbox defaults
        with gr.Row():
            sys_prompt_input = gr.Textbox(label="System Prompt", value=SANDBOX_SYS_PROMPT, lines=4)
            user_payload_input = gr.Textbox(label="User Payload (JSON / Text)", value=SANDBOX_USER_PAYLOAD, lines=4)

        # Row 3: sampling control and the trigger button
        with gr.Row():
            temp_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature")
            test_btn = gr.Button("⚡ Run GPU Inference", variant="primary")

        output_box = gr.Textbox(label="Inference Result (JSON Output)", lines=6, placeholder="Result will appear here...")

        # Wire the button to the handler; inputs are passed in the handler's
        # positional order and the event is registered as "generate_inference".
        test_btn.click(
            fn=ui_test_inference,
            inputs=[api_key_input, sys_prompt_input, user_payload_input, temp_slider, grammar_input],
            outputs=[output_box],
            api_name="generate_inference",
        )

# Start the server on all interfaces using the port chosen for this environment.
demo.launch(server_name="0.0.0.0", server_port=PORT)