; preset.ini — llama-server model presets
; Author: Serveurperso (commit a47c3f9, verified)
[*]
fit = off ; Disable automatic memory fitting
ngl = 999 ; Full GPU offload
ctk = q8_0 ; KV cache key quantization
ctv = q8_0 ; KV cache value quantization
fa = on ; Enable flash attention
mlock = on ; Lock model in RAM
np = 1 ; Parallel request batching
kvu = off ; Disable unified KV cache buffer
stop-timeout = 2 ; Force-kill child process after graceful shutdown timeout in seconds (default: 10)
; sleep-idle-seconds = 3600 ; Unload model weights from the child process after this many idle seconds
b = 128 ; Logical maximum batch size (default: 2048)
ub = 512 ; Physical maximum batch size (default: 512)
[gpt-oss-20b-hf]
hf = ggml-org/gpt-oss-20b-GGUF
c = 131072 ; Context size in tokens for this model
chat-template-kwargs = {"reasoning_effort": "high"}
[gpt-oss-120b-hf]
hf = ggml-org/gpt-oss-120b-GGUF
c = 131072
chat-template-kwargs = {"reasoning_effort": "high"}
[qwen3-coder-30b-hf]
hf = ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
temp = 0.7
top-p = 0.8
top-k = 20
min-p = 0
c = 262144