[*]
fit = off ; Disable automatic memory fitting
ngl = 999 ; Full GPU offload
ctk = q8_0 ; KV cache key quantization
ctv = q8_0 ; KV cache value quantization
fa = on ; Enable flash attention
mlock = on ; Lock model in RAM
np = 1 ; Parallel request batching
kvu = off ; Unified KV cache buffer
stop-timeout = 2 ; Force-kill child process after graceful shutdown timeout in seconds (default: 10)
; sleep-idle-seconds = 3600 ; Unload model weights after the child process has been idle this many seconds
b = 128 ; Logical maximum batch size (default: 2048)
ub = 512 ; Physical maximum batch size (default: 512)

[gpt-oss-20b-hf]
hf = ggml-org/gpt-oss-20b-GGUF
c = 131072 ; Context size in tokens for this model
chat-template-kwargs = {"reasoning_effort": "high"}

[gpt-oss-120b-hf]
hf = ggml-org/gpt-oss-120b-GGUF
c = 131072 ; Context size in tokens for this model
chat-template-kwargs = {"reasoning_effort": "high"}

[qwen3-coder-30b-hf]
hf = ggml-org/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
temp = 0.7
top-p = 0.8
top-k = 20
min-p = 0
c = 262144 ; Context size in tokens for this model