ggerganov HF Staff committed on
Commit
eb3be97
·
verified ·
1 Parent(s): b66007a

presets : add 32GB

Browse files
Files changed (2) hide show
  1. preset-128GB.ini +0 -32
  2. preset-32GB.ini +99 -0
preset-128GB.ini CHANGED
@@ -86,38 +86,6 @@ top-p = 0.95
86
  top-k = 0
87
  temp = 1.0
88
 
89
- [qwen3.6-35b-a3b-hf]
90
- hf = ggml-org/Qwen3.6-35B-A3B-GGUF:Q8_0
91
- ctx-size = 262144
92
- batch-size = 2048
93
- ubatch-size = 2048
94
- alias = qwen3.6-35b-a3b-hf
95
- chat-template-kwargs = {"enable_thinking": false}
96
-
97
- [qwen3.6-27b-hf]
98
- hf = ggml-org/Qwen3.6-27B-GGUF:Q8_0
99
- ctx-size = 262144
100
- batch-size = 2048
101
- ubatch-size = 2048
102
- alias = qwen3.6-27b-hf
103
- chat-template-kwargs = {"enable_thinking": false}
104
-
105
- [qwen3.6-35b-a3b-hf-think]
106
- hf = ggml-org/Qwen3.6-35B-A3B-GGUF:Q8_0
107
- ctx-size = 262144
108
- batch-size = 2048
109
- ubatch-size = 2048
110
- alias = qwen3.6-35b-a3b-hf-think
111
- chat-template-kwargs = {"preserve_thinking": true}
112
-
113
- [qwen3.6-27b-hf-think]
114
- hf = ggml-org/Qwen3.6-27B-GGUF:Q8_0
115
- ctx-size = 262144
116
- batch-size = 2048
117
- ubatch-size = 2048
118
- alias = qwen3.6-27b-hf-think
119
- chat-template-kwargs = {"preserve_thinking": true}
120
-
121
  [qwen3.6-35b-a3b-hf-think-spec]
122
  hf = ggml-org/Qwen3.6-35B-A3B-GGUF:Q8_0
123
  spec-draft-hf = ggml-org/Qwen3.5-0.8B-GGUF:Q8_0
 
86
  top-k = 0
87
  temp = 1.0
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  [qwen3.6-35b-a3b-hf-think-spec]
90
  hf = ggml-org/Qwen3.6-35B-A3B-GGUF:Q8_0
91
  spec-draft-hf = ggml-org/Qwen3.5-0.8B-GGUF:Q8_0
preset-32GB.ini ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # uses:
2
+ # - llama.cpp webui
3
+ # - llama.vim
4
+ # - pi
5
+ #
6
+ # target machines with 32GB of VRAM (or unified memory):
7
+ # - Apple MacBook
8
+ # - NVIDIA RTX 5090
9
+
10
+ [*]
11
+ ctx-size = 0
12
+ mmap = 1
13
+ kv-unified = 1
14
+ parallel = 4
15
+ spec-default = 1
16
+
17
+ [gpt-oss-20b-hf]
18
+ hf = ggml-org/gpt-oss-20b-GGUF
19
+ ctx-size = 262144
20
+ batch-size = 2048
21
+ ubatch-size = 2048
22
+ top-p = 1.0
23
+ top-k = 0
24
+ min-p = 0.01
25
+ temp = 1.0
26
+ chat-template-kwargs = {"reasoning_effort": "high"}
27
+
28
+ [gpt-oss-20b-hf-low]
29
+ hf = ggml-org/gpt-oss-20b-GGUF
30
+ ctx-size = 262144
31
+ batch-size = 2048
32
+ ubatch-size = 2048
33
+ top-p = 1.0
34
+ top-k = 0
35
+ min-p = 0.01
36
+ temp = 1.0
37
+ chat-template-kwargs = {"reasoning_effort": "low"}
38
+
39
+ [gpt-oss-20b-hf-medium]
40
+ hf = ggml-org/gpt-oss-20b-GGUF
41
+ ctx-size = 262144
42
+ batch-size = 2048
43
+ ubatch-size = 2048
44
+ top-p = 1.0
45
+ top-k = 0
46
+ min-p = 0.01
47
+ temp = 1.0
48
+ chat-template-kwargs = {"reasoning_effort": "medium"}
49
+
50
+ [gpt-oss-20b-hf-high]
51
+ hf = ggml-org/gpt-oss-20b-GGUF
52
+ ctx-size = 262144
53
+ batch-size = 2048
54
+ ubatch-size = 2048
55
+ top-p = 1.0
56
+ top-k = 0
57
+ min-p = 0.01
58
+ temp = 1.0
59
+ chat-template-kwargs = {"reasoning_effort": "high"}
60
+
61
+ [qwen2.5-coder-7b-hf-fim]
62
+ hf = ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
63
+ ctx-size = 65536
64
+ cache-reuse = 256
65
+ batch-size = 1024
66
+ ubatch-size = 1024
67
+ alias = fim,infill
68
+
69
+ # uses a dedicated, pure 4-bit drafter
70
+ [qwen3.6-35b-a3b-hf-think-mtp]
71
+ hf = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:Q4_K_M
72
+ spec-type = draft-mtp
73
+ spec-draft-n-max = 3
74
+ spec-draft-hf = ggml-org/Qwen3.6-35B-A3B-GGUF
75
+ spec-draft-model = mtp-Qwen3.6-35B-A3B-Q4_0.gguf
76
+ temp = 0.6
77
+ min-p = 0.05
78
+ ctx-size = 262144
79
+ batch-size = 2048
80
+ ubatch-size = 512
81
+ alias = qwen3.6-35b-a3b-hf-think-mtp,inst
82
+ chat-template-kwargs = {"preserve_thinking": true}
83
+
84
+ # uses a dedicated, pure 4-bit drafter
85
+ [qwen3.6-27b-hf-think-mtp]
86
+ hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q4_K_M
87
+ spec-type = draft-mtp
88
+ spec-draft-n-max = 3
89
+ spec-draft-hf = ggml-org/Qwen3.6-27B-GGUF
90
+ spec-draft-model = mtp-Qwen3.6-27B-Q4_0.gguf
91
+ temp = 0.6
92
+ min-p = 0.05
93
+ ctx-size = 262144
94
+ batch-size = 2048
95
+ ubatch-size = 2048
96
+ alias = qwen3.6-27b-hf-think-mtp,pi
97
+ reasoning-budget = 4096
98
+ reasoning-budget-message = ... I am thinking for too long and cannot make a decision. I will now explain the problem to the user and ask them for advice.
99
+ chat-template-kwargs = {"preserve_thinking": true}