give a very rough full offload example
Browse files
README.md
CHANGED
|
@@ -230,10 +230,27 @@ $ cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON
|
|
| 230 |
$ cmake --build build --config Release -j $(nproc)
|
| 231 |
|
| 232 |
# Hybrid CPU and Single GPU
|
| 233 |
-
echo TODO or look at my Step-3.5-Flash for rough example for now
|
| 234 |
-
|
| 235 |
-
# Hybrid CPU and Multi GPU
|
| 236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
# CPU-Only
|
| 239 |
numactl -N "$SOCKET" -m "$SOCKET" \
|
|
|
|
| 230 |
$ cmake --build build --config Release -j $(nproc)
|
| 231 |
|
| 232 |
# Hybrid CPU and Single GPU
|
| 233 |
+
echo TODO or look at my Step-3.5-Flash for a rough example for now, using --cpu-moe or --n-cpu-moe XX etc
|
| 234 |
+
|
| 235 |
+
# Hybrid CPU and Multi GPU (example below: full GPU offload, 128k context in 96GB VRAM)
|
| 236 |
+
model=MiniMax-M2.5-IQ2_KS-00001-of-00003.gguf
|
| 237 |
+
_GLIBCXX_REGEX_STATE_LIMIT=1000000 \
|
| 238 |
+
CUDA_VISIBLE_DEVICES="0,1" \
|
| 239 |
+
./build/bin/llama-sweep-bench \
|
| 240 |
+
--model "$model" \
|
| 241 |
+
--alias ubergarm/MiniMax-M2.5 \
|
| 242 |
+
-khad -ctk q6_0 -ctv q8_0 \
|
| 243 |
+
-c 131072 \
|
| 244 |
+
-ger \
|
| 245 |
+
-sm graph \
|
| 246 |
+
-ngl 99 \
|
| 247 |
+
-ub 4096 -b 4096 \
|
| 248 |
+
-ts 47,48 \
|
| 249 |
+
--threads 1 \
|
| 250 |
+
--host 127.0.0.1 \
|
| 251 |
+
--port 8080 \
|
| 252 |
+
--no-mmap \
|
| 253 |
+
--jinja
|
| 254 |
|
| 255 |
# CPU-Only
|
| 256 |
numactl -N "$SOCKET" -m "$SOCKET" \
|