[*]
fit = off ; Disable automatic memory fitting
ngl = 999 ; Full GPU offload
ctk = q8_0 ; KV cache key quantization
ctv = q8_0 ; KV cache value quantization
fa = on ; Enable flash attention
mlock = on ; Lock model in RAM
np = 1 ; Parallel request batching
kvu = off ; Unified KV cache buffer
stop-timeout = 2 ; Force-kill child process after graceful shutdown timeout in seconds (default: 10)
; sleep-idle-seconds = 3600 ; Unload model weights after the child process has been idle this many seconds
b = 128 ; Logical maximum batch size (default: 2048)
ub = 512 ; Physical maximum batch size (default: 512)

[gpt-oss-20b-hf]
hf = ggml-org/gpt-oss-20b-GGUF
c = 131072 ; Context size in tokens for this model
chat-template-kwargs = {"reasoning_effort": "high"}

[gpt-oss-120b-hf]
hf = ggml-org/gpt-oss-120b-GGUF
c = 131072 ; Context size in tokens for this model
chat-template-kwargs = {"reasoning_effort": "high"}

[qwen3-coder-30b-hf]
hf = ggml-org/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
temp = 0.7
top-p = 0.8
top-k = 20
min-p = 0
c = 262144 ; Context size in tokens for this model