add tensor parallel example for 2x48 GB GPUs
README.md CHANGED

```diff
@@ -266,11 +266,26 @@ $ cd ik_llama.cpp
 $ cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON
 $ cmake --build build --config Release -j $(nproc)
 
+# Run full offload on 2+ GPUs with `-sm graph` Graph Parallel
 ## https://github.com/ikawrakow/ik_llama.cpp/pull/1236
 ## https://github.com/ikawrakow/ik_llama.cpp/pull/1231
 ## https://github.com/ikawrakow/ik_llama.cpp/pull/1239
 ## https://github.com/ikawrakow/ik_llama.cpp/pull/1240
-
+CUDA_VISIBLE_DEVICES="0,1" \
+./build/bin/llama-server \
+    --model "$model" \
+    --alias ubergarm/Step-Fun-3.5-Flash \
+    -c 65536 \
+    -ger \
+    -sm graph \
+    -ngl 99 \
+    -ub 4096 -b 4096 \
+    -ts 47,48 \
+    --threads 1 \
+    --host 127.0.0.1 \
+    --port 8080 \
+    --jinja \
+    --no-mmap
 
 # CPU-only Mainline llama.cpp Example
 numactl -N "$SOCKET" -m "$SOCKET" \
```
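Once the new launch command is running, a quick way to verify the endpoint is an OpenAI-compatible chat request. This is a minimal sketch, not part of the commit: it assumes the flags shown above (`--host 127.0.0.1 --port 8080`, `--alias ubergarm/Step-Fun-3.5-Flash`) and that ik_llama.cpp's `llama-server` serves the same `/v1/chat/completions` route as mainline llama.cpp.

```bash
# Smoke-test the server started above (assumes --host 127.0.0.1 --port 8080).
# The "model" field matches the --alias given at launch.
curl -s http://127.0.0.1:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "ubergarm/Step-Fun-3.5-Flash",
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 32
  }'
```

On the split itself: `-ts 47,48` distributes tensors across the two visible devices in a 47:48 ratio, presumably leaving a little extra headroom on device 0 for compute buffers on a 2x48 GB setup, while `-sm graph` selects the graph-parallel split mode introduced in the PRs linked above.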