ggerganov
/

presets

Model card Files Files and versions

xet

Community

ggerganov HF Staff committed on 14 days ago

Commit

eb3be97

verified ·

1 Parent(s): b66007a

presets : add 32GB

Browse files

Files changed (2) hide show

preset-128GB.ini +0 -32
preset-32GB.ini +99 -0

preset-128GB.ini CHANGED Viewed

@@ -86,38 +86,6 @@ top-p                = 0.95
 top-k                = 0
 temp                 = 1.0
-[qwen3.6-35b-a3b-hf]
-hf                   = ggml-org/Qwen3.6-35B-A3B-GGUF:Q8_0
-ctx-size             = 262144
-batch-size           = 2048
-ubatch-size          = 2048
-alias                = qwen3.6-35b-a3b-hf
-chat-template-kwargs = {"enable_thinking": false}
-[qwen3.6-27b-hf]
-hf                   = ggml-org/Qwen3.6-27B-GGUF:Q8_0
-ctx-size             = 262144
-batch-size           = 2048
-ubatch-size          = 2048
-alias                = qwen3.6-27b-hf
-chat-template-kwargs = {"enable_thinking": false}
-[qwen3.6-35b-a3b-hf-think]
-hf                   = ggml-org/Qwen3.6-35B-A3B-GGUF:Q8_0
-ctx-size             = 262144
-batch-size           = 2048
-ubatch-size          = 2048
-alias                = qwen3.6-35b-a3b-hf-think
-chat-template-kwargs = {"preserve_thinking": true}
-[qwen3.6-27b-hf-think]
-hf                   = ggml-org/Qwen3.6-27B-GGUF:Q8_0
-ctx-size             = 262144
-batch-size           = 2048
-ubatch-size          = 2048
-alias                = qwen3.6-27b-hf-think
-chat-template-kwargs = {"preserve_thinking": true}
 [qwen3.6-35b-a3b-hf-think-spec]
 hf                   = ggml-org/Qwen3.6-35B-A3B-GGUF:Q8_0
 spec-draft-hf        = ggml-org/Qwen3.5-0.8B-GGUF:Q8_0

 top-k                = 0
 temp                 = 1.0
 [qwen3.6-35b-a3b-hf-think-spec]
 hf                   = ggml-org/Qwen3.6-35B-A3B-GGUF:Q8_0
 spec-draft-hf        = ggml-org/Qwen3.5-0.8B-GGUF:Q8_0

preset-32GB.ini ADDED Viewed

	@@ -0,0 +1,99 @@

+# uses:
+# - llama.cpp webui
+# - llama.vim
+# - pi
+#
+# target machines with 32GB of VRAM or unified memory:
+# - Apple MacBook
+# - NVIDIA RTX 5090
+[*]
+ctx-size             = 0
+mmap                 = 1
+kv-unified           = 1
+parallel             = 4
+spec-default         = 1
+[gpt-oss-20b-hf]
+hf                   = ggml-org/gpt-oss-20b-GGUF
+ctx-size             = 262144
+batch-size           = 2048
+ubatch-size          = 2048
+top-p                = 1.0
+top-k                = 0
+min-p                = 0.01
+temp                 = 1.0
+chat-template-kwargs = {"reasoning_effort": "high"}
+[gpt-oss-20b-hf-low]
+hf                   = ggml-org/gpt-oss-20b-GGUF
+ctx-size             = 262144
+batch-size           = 2048
+ubatch-size          = 2048
+top-p                = 1.0
+top-k                = 0
+min-p                = 0.01
+temp                 = 1.0
+chat-template-kwargs = {"reasoning_effort": "low"}
+[gpt-oss-20b-hf-medium]
+hf                   = ggml-org/gpt-oss-20b-GGUF
+ctx-size             = 262144
+batch-size           = 2048
+ubatch-size          = 2048
+top-p                = 1.0
+top-k                = 0
+min-p                = 0.01
+temp                 = 1.0
+chat-template-kwargs = {"reasoning_effort": "medium"}
+[gpt-oss-20b-hf-high]
+hf                   = ggml-org/gpt-oss-20b-GGUF
+ctx-size             = 262144
+batch-size           = 2048
+ubatch-size          = 2048
+top-p                = 1.0
+top-k                = 0
+min-p                = 0.01
+temp                 = 1.0
+chat-template-kwargs = {"reasoning_effort": "high"}
+[qwen2.5-coder-7b-hf-fim]
+hf                   = ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
+ctx-size             = 65536
+cache-reuse          = 256
+batch-size           = 1024
+ubatch-size          = 1024
+alias                = fim,infill
+# the preset below uses a dedicated, pure 4-bit (Q4_0) MTP draft model (see spec-draft-model)
+[qwen3.6-35b-a3b-hf-think-mtp]
+hf                   = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:Q4_K_M
+spec-type            = draft-mtp
+spec-draft-n-max     = 3
+spec-draft-hf        = ggml-org/Qwen3.6-35B-A3B-GGUF
+spec-draft-model     = mtp-Qwen3.6-35B-A3B-Q4_0.gguf
+temp                 = 0.6
+min-p                = 0.05
+ctx-size             = 262144
+batch-size           = 2048
+ubatch-size          = 512
+alias                = qwen3.6-35b-a3b-hf-think-mtp,inst
+chat-template-kwargs = {"preserve_thinking": true}
+# the preset below uses a dedicated, pure 4-bit (Q4_0) MTP draft model (see spec-draft-model)
+[qwen3.6-27b-hf-think-mtp]
+hf                   = unsloth/Qwen3.6-27B-MTP-GGUF:Q4_K_M
+spec-type            = draft-mtp
+spec-draft-n-max     = 3
+spec-draft-hf        = ggml-org/Qwen3.6-27B-GGUF
+spec-draft-model     = mtp-Qwen3.6-27B-Q4_0.gguf
+temp                 = 0.6
+min-p                = 0.05
+ctx-size             = 262144
+batch-size           = 2048
+ubatch-size          = 2048
+alias                = qwen3.6-27b-hf-think-mtp,pi
+reasoning-budget     = 4096
+reasoning-budget-message = ... I am thinking for too long and cannot make a decision. I will now explain the problem to the user and ask them for advice.
+chat-template-kwargs = {"preserve_thinking": true}