jedisct1 committed on
Commit
84ae63d
·
verified ·
1 Parent(s): 16de59a

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.gguf filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MiMo-V2.5-coder-Q2-00001-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2896b618eeb9d0e27bca57a3cf5ecd8520e19f4df296464c5db2fcf61b1a5adb
3
+ size 8831584256
MiMo-V2.5-coder-Q2-00002-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d609955e1801579c62d6299cb49725696b8e6555f988aee10e9342c5d4c488f
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00003-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a255ea8fce9fe2cde49203b938027927c96d3d0044ba9beb348f2792fc9c9a17
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00004-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:404c6404f2b99d67a8b1a611003032e36e6047d25c1f9f93c73212afb92e638c
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00005-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1623ed43c88610124cdb0e731356f0ff2f47d26e5d2a6d3793c7d93f0b2fcbb
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00006-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abc369764edb625ace3ad95ad522e8d2bfa0a5f3a4b6fc4759812be79e543011
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00007-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7678e83adedf0903ae584173bb2b01e36a24e0faa6082246626a7bfdd315d8fb
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00008-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d85c683831f2b9942b817409b3967805a78e20d70598d43a642d6b12d0cccf2
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00009-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d8493b982984c35a887f2a69839d5be3454a5574a96579b662f0cb1f94d5f58
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00010-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89febfedaaaf5c5a2b576b9575594f6f732efb6e16895d025f16c3c3ac3bb877
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00011-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59fbcc70d59edad135e92f310c70055bd09224b1369a119a084121c47f7426ee
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00012-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8411d5242700a9acc72fb8eed3c1f7b9efe3d6cdf90091a7df7d6453f61786b
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00013-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d817b70895822965a6f929d9313376f5a46031d3165e88dccb637ec1c25faef8
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00014-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a328343a4e68c901f2fe31d993df408680b38bcabb4d114a327b92c14027c074
3
+ size 6996099840
MiMo-V2.5-coder-Q2-00015-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:394281ac7b020ef0acd253a535b222aac02b4d488f979e864d63e12fdc2d7bee
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00016-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0de886b54240c049d747cad7264425f213b2e94e954cac2be18457808d7a98d1
3
+ size 6996099872
README.md ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ base_model: XiaomiMiMo/MiMo-V2.5
4
+ language:
5
+ - en
6
+ library_name: llama.cpp
7
+ tags:
8
+ - gguf
9
+ - llama.cpp
10
+ - text-generation
11
+ - code
12
+ - coding
13
+ - tool-calling
14
+ - agent
15
+ - mixture-of-experts
16
+ - long-context
17
+ pipeline_tag: text-generation
18
+ ---
19
+
20
+ # MiMo-V2.5 Coder Q2 v2 GGUF
21
+
22
+ This is a text-only GGUF build of [XiaomiMiMo/MiMo-V2.5](https://huggingface.co/XiaomiMiMo/MiMo-V2.5), tuned for coding and OpenAI-compatible tool calling on high-memory local machines.
23
+
24
+ The target system for this build is a 128 GB Apple Silicon machine. The default serving profile uses a 100,000-token context and asks llama.cpp to fit as much of the model as possible onto Metal while leaving enough headroom for KV cache and runtime buffers. Smaller-memory machines will likely need a smaller context, more CPU offload, or a smaller quant.
25
+
26
+ This is not a multimodal build. MiMo-V2.5 is an omnimodal checkpoint, but this GGUF contains the text model only. The vision and audio encoders are not included. MiMo's multi-token prediction blocks were also omitted because the current llama.cpp MiMo2 generation path does not use those blocks for normal inference.
27
+
28
+ ## Why this build exists
29
+
30
+ Public low-bit quants of very large MoE models can be surprisingly fragile: tool calls may become malformed, code may fail on small API details, and long answers can drift into repeated reasoning loops. This build was made to spend the limited Q2-class quality budget on the workloads where MiMo-V2.5 is most useful locally:
31
+
32
+ - coding in common systems and scripting languages
33
+ - web UI/component generation
34
+ - OpenAI-compatible tool calling
35
+ - Swival-style agent loops over real files and commands
36
+ - long English technical prompts
37
+
38
+ Chinese-language quality and multimodal behavior were not optimization targets.
39
+
40
+ ## How it was built
41
+
42
+ The source was the original `XiaomiMiMo/MiMo-V2.5` checkpoint, converted to GGUF with llama.cpp's native MiMo2 support. Conversion was text-only and omitted runtime-inactive MTP/NextN blocks, so memory is not spent on tensors that current llama.cpp MiMo2 inference does not execute.
43
+
44
+ The final artifact is a split `Q2_K_S` GGUF with an importance matrix built from English coding, debugging, tool-calling, shell, and agent prompts. The calibration mix was designed to make the quantizer preserve behavior that matters for developer workflows rather than generic chat breadth.
45
+
46
+ The build was iterative:
47
+
48
+ 1. Convert the original checkpoint to split BF16 GGUF.
49
+ 2. Produce a first low-bit coding/tool-use candidate.
50
+ 3. Test that candidate on executable coding tasks and Swival-style tool calls.
51
+ 4. Add calibration coverage for the failures that showed up in real tests.
52
+ 5. Rebuild the importance matrix from the expanded coding/tool-use prompt mix.
53
+ 6. Re-quantize with the final `Q2_K_S` recipe.
54
+
55
+ The calibration text is not required to use the model. It was a build-time tool for telling the quantizer which activations mattered most: code generation, code repair, shell-style work, JSON/tool-call formatting, and agent workflows over real files.
56
+
57
+ Quantization details:
58
+
59
+ - Quant type: `Q2_K_S`
60
+ - Importance matrix: coding and tool-calling focused
61
+ - Embeddings and output tensors kept at higher precision
62
+ - Attention and dense first-FFN tensors protected at higher precision
63
+ - MoE down-expert tensors kept at `Q3_K`
64
+ - Reported size: about 108,496.76 MiB, 2.95 BPW
65
+ - Split files: 16 GGUF shards
66
+
67
+ One tokenizer metadata fix is included: the base-vocabulary `</s>` token, which llama.cpp flags as control-looking, is marked as a control token so llama.cpp does not warn at load time. MiMo's real EOS token remains `<|im_end|>`.
68
+
69
+ ## Why this recipe was chosen
70
+
71
+ The recipe is a compromise between quality and a hard practical limit: this model has to run locally on a 128 GB unified-memory machine. Higher-bit GGUFs of a model this large can exceed the useful memory envelope once KV cache, batching, Metal buffers, and the operating system are included.
72
+
73
+ The first plain `Q2_K`-family candidate was small enough, but it was not reliable enough for tool calling. It produced some malformed Swival-style arguments and missed several conditional tools. The v2 recipe is larger, but it spends the extra space where it helped most:
74
+
75
+ - embeddings and output tensors stay higher precision because they are important for token identity and exact syntax
76
+ - attention tensors are protected because tool-call and code prompts are structure-heavy
77
+ - the dense first FFN is protected because early-layer representation quality matters disproportionately after heavy quantization
78
+ - MoE down-expert tensors use `Q3_K`, which was a better quality/memory tradeoff than pushing all expert down-projections lower
79
+
80
+ That is why this is still a Q2-class build, but not the smallest possible Q2 build.
81
+
82
+ ## Why it is good at coding
83
+
84
+ This quant was not chosen just because it fits in memory. It was iterated against executable tasks and then rebuilt with a stronger coding/tool-use importance matrix after early failures were identified.
85
+
86
+ The first low-bit pass exposed the kinds of issues that matter in practice: malformed tool-call arguments, brittle JavaScript Markdown parsing, incorrect Zig checked-addition APIs, and small C/C++/Go harness problems. Those failures were used to improve the calibration distribution and to validate that the final model can solve the tasks when the problem statement contains the same constraints a developer would normally give.
87
+
88
+ The final v2 artifact passed the local coding and web-design harness across:
89
+
90
+ - Swift
91
+ - JavaScript
92
+ - TypeScript through Deno
93
+ - Rust
94
+ - C
95
+ - C++
96
+ - Zig
97
+ - Python
98
+ - Perl
99
+ - Go
100
+ - static HTML/CSS
101
+
102
+ That harness writes complete model-generated files into isolated directories and validates them with local compilers, runtimes, or test runners. The current v2 run passed 11/11. The checks are intentionally practical rather than benchmark-like: they catch whether the generated code compiles, runs, and handles edge cases from the prompt.
103
+
104
+ It was also tested on framework-style frontend tasks. React, Vue, and Solid components were rendered server-side with Deno/npm tooling, including props, filtering behavior, accessible form markup, and summary text checks. The current v2 run passed 3/3.
105
+
106
+ The important point is not that these small harnesses prove universal coding ability. They prove that the quantization process did not destroy the details that low-bit models often lose first: exact exported names, balanced parsing logic, checked arithmetic APIs, command/tool argument shapes, and framework-specific rendering conventions.
107
+
108
+ ## Tool-calling validation
109
+
110
+ Tool calling was tested with Swival because Swival exercises OpenAI-style tools in realistic agent loops instead of only checking toy single-call examples.
111
+
112
+ Validation included:
113
+
114
+ - a broad synthetic selector suite covering the current Swival tool surface
115
+ - real one-shot Swival tasks over files, grep, command execution, fetches, image input, skills, metaskills, snapshots, todos, and subagents
116
+ - a real `/goal` run that required the model to complete work and call `complete_goal`
117
+
118
+ The current v2 results were:
119
+
120
+ - Swival all-tools selector: 22/22
121
+ - real Swival one-shot suite: 10/10 with zero failed tool calls
122
+ - real Swival goal-mode `complete_goal`: passed with exactly one successful `complete_goal` call
123
+
124
+ A separate repetition-loop guard was also run on long coding and web prompts. The current v2 artifact passed 4/4, with no repeated-tail failures.
125
+
126
+ These are local validation results, not public benchmark scores. They are included so users know what this quant was optimized for and what kinds of regressions were actively checked.
127
+
128
+ Compared with the earlier local candidate, the v2 build fixed the key practical failures: the broad Swival selector went from 18/22 to 22/22, the coding/web suite reached 11/11 after task prompts were aligned with the validators, and the real Swival task suite completed with zero failed tool calls. This is why the package is labeled `v2`.
129
+
130
+ ## Serving with llama.cpp
131
+
132
+ Recent llama.cpp builds should be able to load the repo directly:
133
+
134
+ ```sh
135
+ llama-server \
136
+ -hf jedisct1/MiMo-V2.5-coder-Q2-v2 \
137
+ --host 127.0.0.1 \
138
+ --port 8080 \
139
+ --ctx-size 100000 \
140
+ --parallel 1 \
141
+ --batch-size 512 \
142
+ --ubatch-size 128 \
143
+ --threads 12 \
144
+ --threads-batch 18 \
145
+ --prio 0 \
146
+ --poll 80 \
147
+ --flash-attn on \
148
+ --jinja \
149
+ --fit on \
150
+ --fit-target 4096 \
151
+ --fit-ctx 100000 \
152
+ --gpu-layers auto \
153
+ --cache-type-k f16 \
154
+ --cache-type-v f16 \
155
+ --reasoning off
156
+ ```
157
+
158
+ If your llama.cpp build does not auto-select the split GGUF set, pass the first shard explicitly:
159
+
160
+ ```sh
161
+ llama-server \
162
+ -hf jedisct1/MiMo-V2.5-coder-Q2-v2 \
163
+ --hf-file MiMo-V2.5-coder-Q2-00001-of-00016.gguf \
164
+ --ctx-size 100000 \
165
+ --flash-attn on \
166
+ --jinja \
167
+ --reasoning off
168
+ ```
169
+
170
+ If you cloned or downloaded the repository locally, you can use the helper script:
171
+
172
+ ```sh
173
+ ./run-server.sh
174
+ ```
175
+
176
+ The helper script loads the first GGUF shard next to it and uses the same default serving profile.
177
+
178
+ Default settings:
179
+
180
+ ```sh
181
+ MIMO_CTX=100000
182
+ MIMO_FIT_CTX=100000
183
+ MIMO_FIT_TARGET=4096
184
+ MIMO_BATCH=512
185
+ MIMO_UBATCH=128
186
+ MIMO_REASONING=off
187
+ MIMO_CPU_MOE=0
188
+ ```
189
+
190
+ For more memory headroom, use CPU-MoE mode:
191
+
192
+ ```sh
193
+ MIMO_CPU_MOE=1 MIMO_FIT_TARGET=32768 MIMO_BATCH=128 MIMO_UBATCH=64 ./run-server.sh
194
+ ```
195
+
196
+ That mode is slower, especially during long prompt prefill, but it leaves more Metal memory available.
197
+
198
+ You can point the script at a specific server binary:
199
+
200
+ ```sh
201
+ LLAMA_SERVER=/path/to/llama-server ./run-server.sh
202
+ ```
203
+
204
+ ## Tool-calling tips
205
+
206
+ - Disable reasoning output with `--reasoning off` or `MIMO_REASONING=off`.
207
+ - Send tool schemas from the client rather than enabling llama.cpp built-in tools.
208
+ - Set `parallel_tool_calls` to `false` if your client supports it.
209
+ - Avoid forcing `tool_choice: required`; in testing, that made malformed calls more likely.
210
+ - Use a client that supports OpenAI-compatible tool calls cleanly. Swival was the main validation client for this build.
211
+
212
+ ## License
213
+
214
+ The upstream `XiaomiMiMo/MiMo-V2.5` model card declares the MIT license. This derived GGUF is provided with the same license metadata.
run-server.sh ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
5
+ LLAMA_SERVER=${LLAMA_SERVER:-llama-server}
6
+
7
+ if ! command -v "$LLAMA_SERVER" >/dev/null 2>&1; then
8
+ echo "llama-server was not found. Install llama.cpp or set LLAMA_SERVER=/path/to/llama-server." >&2
9
+ exit 1
10
+ fi
11
+
12
+ if [[ -n "${MIMO_MODEL:-}" ]]; then
13
+ MODEL=$MIMO_MODEL
14
+ else
15
+ shopt -s nullglob
16
+ CANDIDATES=("$SCRIPT_DIR"/MiMo-V2.5-coder-Q2-00001-of-*.gguf)
17
+ shopt -u nullglob
18
+
19
+ if [[ ${#CANDIDATES[@]} -eq 0 ]]; then
20
+ echo "No first GGUF shard found next to run-server.sh." >&2
21
+ exit 1
22
+ fi
23
+
24
+ MODEL=${CANDIDATES[0]}
25
+ fi
26
+
27
+ ARGS=(
28
+ --model "$MODEL"
29
+ --host "${MIMO_HOST:-127.0.0.1}"
30
+ --port "${MIMO_PORT:-8080}"
31
+ --ctx-size "${MIMO_CTX:-100000}"
32
+ --parallel "${MIMO_PARALLEL:-1}"
33
+ --batch-size "${MIMO_BATCH:-512}"
34
+ --ubatch-size "${MIMO_UBATCH:-128}"
35
+ --threads "${MIMO_THREADS:-12}"
36
+ --threads-batch "${MIMO_THREADS_BATCH:-18}"
37
+ --prio "${MIMO_PRIO:-0}"
38
+ --poll "${MIMO_POLL:-80}"
39
+ --flash-attn on
40
+ --jinja
41
+ --fit "${MIMO_FIT:-on}"
42
+ --fit-target "${MIMO_FIT_TARGET:-4096}"
43
+ --fit-ctx "${MIMO_FIT_CTX:-100000}"
44
+ --gpu-layers "${MIMO_GPU_LAYERS:-auto}"
45
+ --cache-type-k "${MIMO_CACHE_K:-f16}"
46
+ --cache-type-v "${MIMO_CACHE_V:-f16}"
47
+ --reasoning "${MIMO_REASONING:-off}"
48
+ )
49
+
50
+ if [[ "${MIMO_CPU_MOE:-0}" == "1" ]]; then
51
+ ARGS+=(--cpu-moe)
52
+ fi
53
+
54
+ if [[ -n "${MIMO_DEVICE:-}" ]]; then
55
+ ARGS+=(--device "$MIMO_DEVICE")
56
+ fi
57
+
58
+ if [[ -n "${MIMO_TOOLS:-}" ]]; then
59
+ ARGS+=(--tools "$MIMO_TOOLS")
60
+ fi
61
+
62
+ exec "$LLAMA_SERVER" "${ARGS[@]}" "$@"