add tensor parallel example for 2x48 GB GPUs
README.md CHANGED

```diff
@@ -266,11 +266,26 @@ $ cd ik_llama.cpp
 $ cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON
 $ cmake --build build --config Release -j $(nproc)
 
+# Run full offload on 2+ GPUs with `-sm graph` Graph Parallel
 ## https://github.com/ikawrakow/ik_llama.cpp/pull/1236
 ## https://github.com/ikawrakow/ik_llama.cpp/pull/1231
 ## https://github.com/ikawrakow/ik_llama.cpp/pull/1239
 ## https://github.com/ikawrakow/ik_llama.cpp/pull/1240
-
+CUDA_VISIBLE_DEVICES="0,1" \
+./build/bin/llama-server \
+    --model "$model" \
+    --alias ubergarm/Step-Fun-3.5-Flash \
+    -c 65536 \
+    -ger \
+    -sm graph \
+    -ngl 99 \
+    -ub 4096 -b 4096 \
+    -ts 47,48 \
+    --threads 1 \
+    --host 127.0.0.1 \
+    --port 8080 \
+    --jinja \
+    --no-mmap
 
 # CPU-only Mainline llama.cpp Example
 numactl -N "$SOCKET" -m "$SOCKET" \
```
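Once the new launch command is running, a quick way to verify the endpoint is an OpenAI-compatible chat request. This is a minimal sketch, not part of the commit: it assumes the flags shown above (`--host 127.0.0.1 --port 8080`, `--alias ubergarm/Step-Fun-3.5-Flash`) and that ik_llama.cpp's `llama-server` serves the same `/v1/chat/completions` route as mainline llama.cpp.

```bash
# Smoke-test the server started above (assumes --host 127.0.0.1 --port 8080).
# The "model" field matches the --alias given at launch.
curl -s http://127.0.0.1:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "ubergarm/Step-Fun-3.5-Flash",
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 32
  }'
```

On the split itself: `-ts 47,48` distributes tensors across the two visible devices in a 47:48 ratio, presumably leaving a little extra headroom on device 0 for compute buffers on a 2x48 GB setup, while `-sm graph` selects the graph-parallel split mode introduced in the PRs linked above.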