ubergarm committed on
Commit
0d7d6c9
·
1 Parent(s): 6fb2415

add tensor parallel example for 2x48 GB GPUs

Browse files
Files changed (1) hide show
  1. README.md +16 -1
README.md CHANGED
@@ -266,11 +266,26 @@ $ cd ik_llama.cpp
266
  $ cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON
267
  $ cmake --build build --config Release -j $(nproc)
268
 
 
269
  ## https://github.com/ikawrakow/ik_llama.cpp/pull/1236
270
  ## https://github.com/ikawrakow/ik_llama.cpp/pull/1231
271
  ## https://github.com/ikawrakow/ik_llama.cpp/pull/1239
272
  ## https://github.com/ikawrakow/ik_llama.cpp/pull/1240
273
- echo TODO
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
  # CPU-only Mainline llama.cpp Example
276
  numactl -N "$SOCKET" -m "$SOCKET" \
 
266
  $ cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON
267
  $ cmake --build build --config Release -j $(nproc)
268
 
269
+ # Run full offload on 2+ GPUs with `-sm graph` Graph Parallel
270
  ## https://github.com/ikawrakow/ik_llama.cpp/pull/1236
271
  ## https://github.com/ikawrakow/ik_llama.cpp/pull/1231
272
  ## https://github.com/ikawrakow/ik_llama.cpp/pull/1239
273
  ## https://github.com/ikawrakow/ik_llama.cpp/pull/1240
274
+ CUDA_VISIBLE_DEVICES="0,1" \
275
+ ./build/bin/llama-server \
276
+ --model "$model" \
277
+ --alias ubergarm/Step-Fun-3.5-Flash \
278
+ -c 65536 \
279
+ -ger \
280
+ -sm graph \
281
+ -ngl 99 \
282
+ -ub 4096 -b 4096 \
283
+ -ts 47,48 \
284
+ --threads 1 \
285
+ --host 127.0.0.1 \
286
+ --port 8080 \
287
+ --jinja \
288
+ --no-mmap
289
 
290
  # CPU-only Mainline llama.cpp Example
291
  numactl -N "$SOCKET" -m "$SOCKET" \