jedisct1 committed on
Commit
f7e8204
·
verified ·
1 Parent(s): 0be611b

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -1,35 +1 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.gguf filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MiMo-V2.5-coder-Q2-00001-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:400d2928a0863f8e338717bdfc9d826cae7049ff968a79ac7b1220ebb93552f8
3
+ size 8831584256
MiMo-V2.5-coder-Q2-00002-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3820d01f8aac9c26b0430d4c3feef68fb5d02b8304418f5a994de6359b9ba94
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00003-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1043df380f9859dd1f42eea9862fbfd5987c737a679dc57941c8e6b52535a8bd
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00004-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6834f524a2f2de1153a11ba5f9509bd6268a240f891bf2dc4d311c52ee3f3e56
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00005-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f50e2e0a565dc9dd67e45b6f1a6709bfe60e7f9918ef4e2e6f0ef300e43a98c
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00006-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88743a58cb0f6db6c96c26e8fdfc6973b2f73c4250d7b79b1cb11f4ff3f93999
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00007-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:472170a584ab7edbdf31bc8ccc8f998214de9aaf18355708c8df35b999c5f371
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00008-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22650e6f26ea15f6d76d467e910fe31ba46cc7423d83aaf73922d2599bac33a4
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00009-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df50089f1d5263d77e63ff6a6cc9856bf19b4d195cb6fc849eb109a3bbe867f5
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00010-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6affc7d53669b35aca07ae7b057ddb3508120f5676115fca4022ead75899002
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00011-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef1be676dcfea5791849cf67086e14b2d3b756d4d40abcdac179ce7316f5297b
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00012-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a14699f5394d85069574f49a93b6cc26643c3d3f6e7673e32bf756291ff02df2
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00013-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b45d32e0266cb277a6fff36e19a8174d4118975a0ae7c5233fef9efdf0cbb97d
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00014-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b24c23aa55751b4a095ffa5e90165e6bd868a155885ed838878eea0963fa640
3
+ size 6996099840
MiMo-V2.5-coder-Q2-00015-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c22efbfcbd2c4287944b2cdc9dd3473430ce6ed247ad83118d8fb32c6ddf7ad3
3
+ size 6996099872
MiMo-V2.5-coder-Q2-00016-of-00016.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35051041037ab14d47c7e72502d8b19bf52832134c164acbb72b9e96bfba219e
3
+ size 6996099872
README.md ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ base_model: XiaomiMiMo/MiMo-V2.5
4
+ language:
5
+ - en
6
+ library_name: llama.cpp
7
+ tags:
8
+ - gguf
9
+ - llama.cpp
10
+ - text-generation
11
+ - code
12
+ - tool-calling
13
+ - agent
14
+ - mixture-of-experts
15
+ - long-context
16
+ pipeline_tag: text-generation
17
+ ---
18
+
19
+ # MiMo-V2.5 Coder Q2 GGUF
20
+
21
+ This is a local, self-quantized GGUF build of [XiaomiMiMo/MiMo-V2.5](https://huggingface.co/XiaomiMiMo/MiMo-V2.5), tuned for coding and tool-calling on a 128 GB Apple Silicon M5 machine.
22
+
23
+ This quant was optimized for systems with 128 GB of memory. The default serving profile targets a 128 GB Apple Silicon machine and tries to keep the model practical at a 100,000-token context. Smaller-memory systems will likely need more aggressive CPU offload, a smaller context, or a different quant.
24
+
25
+ It is a text-only llama.cpp conversion of the MiMo language backbone. The original MiMo-V2.5 checkpoint is omnimodal, but this GGUF does not include the vision or audio encoders. The MiMo multi-token prediction blocks were also omitted during conversion because normal llama.cpp generation does not currently execute them for this model.
26
+
27
+ ## Files
28
+
29
+ The model is split into 16 GGUF shards:
30
+
31
+ ```text
32
+ MiMo-V2.5-coder-Q2-00001-of-00016.gguf
33
+ ...
34
+ MiMo-V2.5-coder-Q2-00016-of-00016.gguf
35
+ ```
36
+
37
+ Load the first shard. llama.cpp will find the remaining shards automatically.
38
+
39
+ ## Quantization
40
+
41
+ This artifact was quantized from the original XiaomiMiMo checkpoint, not from a third-party GGUF.
42
+
43
+ High-level summary:
44
+
45
+ - Quant type: `Q2_K_S`
46
+ - Importance matrix: coding and tool-calling focused
47
+ - Preserved higher precision for embeddings, output, attention, and the dense first FFN
48
+ - MoE down-expert tensors: `Q3_K`
49
+ - Reported quantized size: about 108,496.76 MiB at 2.95 BPW
50
+
51
+ One tokenizer metadata fix is included so llama.cpp does not warn about the base-vocab `</s>` token at load time. MiMo's actual EOS token remains `<|im_end|>`.
52
+
53
+ ## Importance Matrix
54
+
55
+ The importance matrix is what makes this quant more targeted than a generic low-bit conversion.
56
+
57
+ It was built from English coding and agent-style prompts: reading files, searching code, running shell commands, editing workflows, short code review tasks, and OpenAI-compatible tool calls. The tool-call calibration included realistic argument shapes such as bounded file reads, tail reads, grep context lines, and command arrays.
58
+
59
+ That means the quantization tries to spend its limited precision budget on weights that matter for coding and structured tool use. It is not a general-purpose multilingual calibration set, and it was not designed to preserve Chinese or multimodal quality.
60
+
61
+ ## Why This Quant Exists
62
+
63
+ The goal is not to preserve every capability of the original model equally. This build deliberately prioritizes:
64
+
65
+ - reliable OpenAI-compatible tool calls
66
+ - coding and shell-oriented agent use
67
+ - English prompts and codebase work
68
+ - practical inference on a 128 GB Apple Silicon system
69
+
70
+ Chinese-language quality and multimodal use were not optimization targets.
71
+
72
+ ## Serving
73
+
74
+ Install or build llama.cpp with `llama-server` available in `PATH`. Once the model is on Hugging Face, the usual way to run it is directly from the Hub:
75
+
76
+ ```sh
77
+ llama-server \
78
+ -hf jedisct1/MiMo-V2.5-coder-Q2 \
79
+ --host 127.0.0.1 \
80
+ --port 8080 \
81
+ --ctx-size 100000 \
82
+ --parallel 1 \
83
+ --batch-size 512 \
84
+ --ubatch-size 128 \
85
+ --threads 12 \
86
+ --threads-batch 18 \
87
+ --prio 0 \
88
+ --poll 80 \
89
+ --flash-attn on \
90
+ --jinja \
91
+ --fit on \
92
+ --fit-target 4096 \
93
+ --fit-ctx 100000 \
94
+ --gpu-layers auto \
95
+ --cache-type-k f16 \
96
+ --cache-type-v f16 \
97
+ --reasoning off
98
+ ```
99
+
100
+ This starts an OpenAI-compatible server on `127.0.0.1:8080`. The repository contains one GGUF split set, so recent llama.cpp builds should select the first shard automatically.
101
+
102
+ If your llama.cpp build does not auto-select the split file, point it at the first shard explicitly:
103
+
104
+ ```sh
105
+ llama-server \
106
+ -hf jedisct1/MiMo-V2.5-coder-Q2 \
107
+ --hf-file MiMo-V2.5-coder-Q2-00001-of-00016.gguf
108
+ ```
109
+
110
+ If you cloned or downloaded the repository locally, you can use the included helper script instead:
111
+
112
+ ```sh
113
+ ./run-server.sh
114
+ ```
115
+
116
+ The script uses the same defaults and loads the first GGUF shard from the repository directory.
117
+
118
+ Default serving settings:
119
+
120
+ ```sh
121
+ MIMO_CTX=100000
122
+ MIMO_FIT_CTX=100000
123
+ MIMO_FIT_TARGET=4096
124
+ MIMO_BATCH=512
125
+ MIMO_UBATCH=128
126
+ MIMO_REASONING=off
127
+ MIMO_CPU_MOE=0
128
+ ```
129
+
130
+ These defaults are tuned for an Apple M5 Max with 128 GB unified memory. They keep reasoning output disabled, use the model's Jinja chat template, use Flash Attention, and ask llama.cpp to fit as much of the model as possible onto Metal.
131
+
132
+ If you hit memory pressure, use the safer CPU-MoE mode:
133
+
134
+ ```sh
135
+ MIMO_CPU_MOE=1 MIMO_FIT_TARGET=32768 MIMO_BATCH=128 MIMO_UBATCH=64 ./run-server.sh
136
+ ```
137
+
138
+ That mode is slower, especially on long prompt prefill, but it leaves much more Metal memory headroom.
139
+
140
+ You can point directly at a different `llama-server` binary with:
141
+
142
+ ```sh
143
+ LLAMA_SERVER=/path/to/llama-server ./run-server.sh
144
+ ```
145
+
146
+ You can also run `llama-server` directly against local files without the helper script:
147
+
148
+ ```sh
149
+ llama-server \
150
+ --model MiMo-V2.5-coder-Q2-00001-of-00016.gguf \
151
+ --host 127.0.0.1 \
152
+ --port 8080 \
153
+ --ctx-size 100000 \
154
+ --parallel 1 \
155
+ --batch-size 512 \
156
+ --ubatch-size 128 \
157
+ --threads 12 \
158
+ --threads-batch 18 \
159
+ --prio 0 \
160
+ --poll 80 \
161
+ --flash-attn on \
162
+ --jinja \
163
+ --fit on \
164
+ --fit-target 4096 \
165
+ --fit-ctx 100000 \
166
+ --gpu-layers auto \
167
+ --cache-type-k f16 \
168
+ --cache-type-v f16 \
169
+ --reasoning off
170
+ ```
171
+
172
+ For the safer CPU-MoE fallback, add `--cpu-moe` and use a larger fit margin:
173
+
174
+ ```sh
175
+ llama-server \
176
+ --model MiMo-V2.5-coder-Q2-00001-of-00016.gguf \
177
+ --ctx-size 100000 \
178
+ --fit on \
179
+ --fit-target 32768 \
180
+ --fit-ctx 100000 \
181
+ --batch-size 128 \
182
+ --ubatch-size 64 \
183
+ --flash-attn on \
184
+ --jinja \
185
+ --gpu-layers auto \
186
+ --cache-type-k f16 \
187
+ --cache-type-v f16 \
188
+ --reasoning off \
189
+ --cpu-moe
190
+ ```
191
+
192
+ ## Tool-Calling Notes
193
+
194
+ For best tool-calling results:
195
+
196
+ - Use OpenAI-compatible request-provided tool schemas.
197
+ - Keep llama.cpp built-in tools disabled unless you are specifically testing them.
198
+ - Disable model reasoning output with `--reasoning off` or `MIMO_REASONING=off`.
199
+ - Set `parallel_tool_calls` to `false` if your client supports it.
200
+ - Avoid forcing `tool_choice: required`; in testing it made malformed calls more likely.
201
+
202
+ This build was tested with Swival-style tool schemas for `read_file`, `grep`, `outline`, `run_command`, and `todo`.
203
+
204
+ ## Local Test Results
205
+
206
+ On the local 128 GB Apple Silicon M5 setup:
207
+
208
+ - The first local `Q2_K` artifact passed only 5/7 in its best Swival-shaped serving configuration.
209
+ - This imatrix-backed `Q2_K_S` artifact passed 21/21 across three Swival-shaped repeat runs with fast M5 serving defaults.
210
+ - A Swival-style `Hello` request rendered to 4,411 prompt tokens because the client included a system prompt and tool schemas. With fast M5 serving, llama.cpp processed that payload at about 239 prompt tokens/sec. With all MoE tensors on CPU, the same class of prompt processed at about 13 prompt tokens/sec.
211
+
212
+ These are local smoke and agent-harness results, not a public benchmark suite.
213
+
214
+ ## Limitations
215
+
216
+ - Text-only GGUF: no vision, video, or audio encoders.
217
+ - MTP blocks are omitted.
218
+ - The quantization is very low bit. It is intended to fit and run locally, not to match the full BF16 checkpoint.
219
+ - The default 100,000-token context is much smaller than MiMo-V2.5's advertised 1M training context, but much more practical on this hardware.
220
+ - Quality should be validated on your own coding and tool-calling workloads before relying on it.
221
+
222
+ ## License
223
+
224
+ The upstream model card for `XiaomiMiMo/MiMo-V2.5` declares the MIT license. This derived GGUF is provided under the same license metadata.
run-server.sh ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
#
# run-server.sh - start llama-server with the MiMo-V2.5 coder Q2 GGUF.
#
# Usage:
#   ./run-server.sh [extra llama-server arguments...]
#
# Any extra arguments are appended after the generated ones and passed to
# llama-server unchanged.
#
# Environment overrides (default in parentheses):
#   LLAMA_SERVER        llama-server binary name or path (llama-server)
#   MIMO_MODEL          model file to load; when unset, the first shard
#                       (MiMo-V2.5-coder-Q2-00001-of-*.gguf) located next to
#                       this script is used
#   MIMO_HOST           --host           (127.0.0.1)
#   MIMO_PORT           --port           (8080)
#   MIMO_CTX            --ctx-size       (100000)
#   MIMO_PARALLEL       --parallel       (1)
#   MIMO_BATCH          --batch-size     (512)
#   MIMO_UBATCH         --ubatch-size    (128)
#   MIMO_THREADS        --threads        (12)
#   MIMO_THREADS_BATCH  --threads-batch  (18)
#   MIMO_PRIO           --prio           (0)
#   MIMO_POLL           --poll           (80)
#   MIMO_FIT            --fit            (on)
#   MIMO_FIT_TARGET     --fit-target     (4096)
#   MIMO_FIT_CTX        --fit-ctx        (100000)
#   MIMO_GPU_LAYERS     --gpu-layers     (auto)
#   MIMO_CACHE_K        --cache-type-k   (f16)
#   MIMO_CACHE_V        --cache-type-v   (f16)
#   MIMO_REASONING      --reasoning      (off)
#   MIMO_CPU_MOE        set to 1 to add --cpu-moe (0)
#   MIMO_DEVICE         when non-empty, passed as --device
#   MIMO_TOOLS          when non-empty, passed as --tools
#
# --flash-attn on and --jinja are always passed.
set -euo pipefail

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Directory containing this script; used to locate the GGUF shards.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
LLAMA_SERVER=${LLAMA_SERVER:-llama-server}

# Report the binary that was actually looked up, so a mistyped LLAMA_SERVER
# override is visible in the error instead of a generic "llama-server".
if ! command -v "$LLAMA_SERVER" >/dev/null 2>&1; then
  die "$LLAMA_SERVER was not found. Install llama.cpp or set LLAMA_SERVER=/path/to/llama-server."
fi

if [[ -n "${MIMO_MODEL:-}" ]]; then
  MODEL=$MIMO_MODEL
else
  # nullglob makes an unmatched pattern expand to an empty array rather than
  # to the literal pattern; it is switched back off right after the lookup.
  shopt -s nullglob
  CANDIDATES=("$SCRIPT_DIR"/MiMo-V2.5-coder-Q2-00001-of-*.gguf)
  shopt -u nullglob

  if (( ${#CANDIDATES[@]} == 0 )); then
    die "No first GGUF shard found next to run-server.sh."
  fi

  MODEL=${CANDIDATES[0]}
fi

# Build the command line as an array so values containing spaces survive.
ARGS=(
  --model "$MODEL"
  --host "${MIMO_HOST:-127.0.0.1}"
  --port "${MIMO_PORT:-8080}"
  --ctx-size "${MIMO_CTX:-100000}"
  --parallel "${MIMO_PARALLEL:-1}"
  --batch-size "${MIMO_BATCH:-512}"
  --ubatch-size "${MIMO_UBATCH:-128}"
  --threads "${MIMO_THREADS:-12}"
  --threads-batch "${MIMO_THREADS_BATCH:-18}"
  --prio "${MIMO_PRIO:-0}"
  --poll "${MIMO_POLL:-80}"
  --flash-attn on
  --jinja
  --fit "${MIMO_FIT:-on}"
  --fit-target "${MIMO_FIT_TARGET:-4096}"
  --fit-ctx "${MIMO_FIT_CTX:-100000}"
  --gpu-layers "${MIMO_GPU_LAYERS:-auto}"
  --cache-type-k "${MIMO_CACHE_K:-f16}"
  --cache-type-v "${MIMO_CACHE_V:-f16}"
  --reasoning "${MIMO_REASONING:-off}"
)

# Optional flags: only added when explicitly requested via the environment.
if [[ "${MIMO_CPU_MOE:-0}" == "1" ]]; then
  ARGS+=(--cpu-moe)
fi

if [[ -n "${MIMO_DEVICE:-}" ]]; then
  ARGS+=(--device "$MIMO_DEVICE")
fi

if [[ -n "${MIMO_TOOLS:-}" ]]; then
  ARGS+=(--tools "$MIMO_TOOLS")
fi

# Replace this shell with llama-server; caller-supplied arguments go last.
exec "$LLAMA_SERVER" "${ARGS[@]}" "$@"