## DeepSeek examples

### Running DeepSeek-V3

#### Running DeepSeek in PD mixed mode on 1 x Atlas 800I A3.

W4A8 model weights can be found [here](https://modelers.cn/models/Modelers_Park/DeepSeek-R1-0528-w4a8).

```shell
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export STREAMS_PER_DEVICE=32

# DeepEP communication settings
export DEEP_NORMAL_MODE_USE_INT8_QUANT=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=32
export HCCL_BUFFSIZE=1600

# spec overlap
export SGLANG_ENABLE_SPEC_V2=1
export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1

# NPU acceleration operator
export SGLANG_NPU_USE_MLAPO=1
export SGLANG_USE_FIA_NZ=1

python3 -m sglang.launch_server \
    --model-path ${MODEL_PATH} \
    --tp 16 \
    --trust-remote-code \
    --attention-backend ascend \
    --device npu \
    --quantization modelslim \
    --watchdog-timeout 9000 \
    --cuda-graph-bs 8 16 24 28 32 \
    --mem-fraction-static 0.68 \
    --max-running-requests 128 \
    --context-length 8188 \
    --disable-radix-cache \
    --chunked-prefill-size -1 \
    --max-prefill-tokens 16384 \
    --moe-a2a-backend deepep \
    --deepep-mode auto \
    --enable-dp-attention \
    --dp-size 4 \
    --enable-dp-lm-head \
    --speculative-algorithm NEXTN \
    --speculative-num-steps 3 \
    --speculative-eagle-topk 1 \
    --speculative-num-draft-tokens 4 \
    --dtype bfloat16
```

#### Running DeepSeek with PD disaggregation mode on 2 x Atlas 800I A3.

W4A8 model weights can be found [here](https://modelers.cn/models/Modelers_Park/DeepSeek-R1-0528-w4a8).

1.
   Prefill:

```shell
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export STREAMS_PER_DEVICE=32

# memfabric config store
export ASCEND_MF_STORE_URL="tcp://<PREFILL_HOST_IP>:<PORT>"

# DeepEP communication settings
export DEEP_NORMAL_MODE_USE_INT8_QUANT=1
export HCCL_BUFFSIZE=1536

# NPU acceleration operator
export SGLANG_NPU_USE_MLAPO=1
export SGLANG_USE_FIA_NZ=1
export TASK_QUEUE_ENABLE=2

python -m sglang.launch_server \
    --model-path ${MODEL_PATH} \
    --host $PREFILL_HOST_IP \
    --port 8000 \
    --disaggregation-mode prefill \
    --disaggregation-bootstrap-port 8996 \
    --disaggregation-transfer-backend ascend \
    --trust-remote-code \
    --nnodes 1 \
    --node-rank 0 \
    --tp-size 16 \
    --mem-fraction-static 0.6 \
    --attention-backend ascend \
    --device npu \
    --quantization modelslim \
    --load-balance-method round_robin \
    --max-running-requests 8 \
    --context-length 8192 \
    --disable-radix-cache \
    --chunked-prefill-size -1 \
    --max-prefill-tokens 28680 \
    --moe-a2a-backend deepep \
    --deepep-mode normal \
    --speculative-algorithm NEXTN \
    --speculative-num-steps 3 \
    --speculative-eagle-topk 1 \
    --speculative-num-draft-tokens 4 \
    --dp-size 2 \
    --enable-dp-attention \
    --disable-shared-experts-fusion \
    --dtype bfloat16
```

2. Decode:

```shell
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export STREAMS_PER_DEVICE=32

# memfabric config store
export ASCEND_MF_STORE_URL="tcp://<PREFILL_HOST_IP>:<PORT>"

# DeepEP communication settings
export HCCL_BUFFSIZE=720
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=88

# spec overlap
export SGLANG_ENABLE_SPEC_V2=1
export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1

# NPU acceleration operator
unset TASK_QUEUE_ENABLE
export SGLANG_NPU_USE_MLAPO=1
export SGLANG_USE_FIA_NZ=1

# Keep max-running-requests <= max-cuda-graph-bs * dp-size (here 22 * 16 = 352);
# performance degrades significantly once that limit is exceeded.
python -m sglang.launch_server \
    --model-path ${MODEL_PATH} \
    --disaggregation-mode decode \
    --host $DECODE_HOST_IP \
    --port 8001 \
    --trust-remote-code \
    --nnodes 1 \
    --node-rank 0 \
    --tp-size 16 \
    --dp-size 16 \
    --mem-fraction-static 0.8 \
    --max-running-requests 352 \
    --attention-backend ascend \
    --device npu \
    --quantization modelslim \
    --prefill-round-robin-balance \
    --moe-a2a-backend deepep \
    --enable-dp-attention \
    --deepep-mode low_latency \
    --enable-dp-lm-head \
    --cuda-graph-bs 8 10 12 14 16 18 20 22 \
    --disaggregation-transfer-backend ascend \
    --watchdog-timeout 9000 \
    --context-length 8192 \
    --speculative-algorithm NEXTN \
    --speculative-num-steps 3 \
    --speculative-eagle-topk 1 \
    --speculative-num-draft-tokens 4 \
    --disable-shared-experts-fusion \
    --dtype bfloat16 \
    --tokenizer-worker-num 4
```

3. SGLang Model Gateway (formerly Router):

```shell
python -m sglang_router.launch_router \
    --pd-disaggregation \
    --policy cache_aware \
    --prefill http://<PREFILL_HOST_IP>:8000 8996 \
    --decode http://<DECODE_HOST_IP>:8001 \
    --host 127.0.0.1 \
    --port 6688
```

#### Running DeepSeek with PD disaggregation on 4 x Atlas 800I A3.

W8A8 model weights can be found [here](https://modelers.cn/models/State_Cloud/Deepseek-R1-bf16-hfd-w8a8).

1.
   Prefill & Decode:

```shell
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000

export SGLANG_SET_CPU_AFFINITY=1
unset ASCEND_LAUNCH_BLOCKING
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export PATH=/usr/local/Ascend/8.5.0/compiler/bishengir/bin:$PATH
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export STREAMS_PER_DEVICE=32
export ASCEND_MF_STORE_URL="tcp://your prefill ip1:24669"

P_IP=('your prefill ip1' 'your prefill ip2')
D_IP=('your decode ip1' 'your decode ip2')
MODEL_PATH=xxx

export SGLANG_NPU_USE_MLAPO=1
export SGLANG_USE_FIA_NZ=1

LOCAL_HOST1=`hostname -I|awk -F " " '{print$1}'`
LOCAL_HOST2=`hostname -I|awk -F " " '{print$2}'`
echo "${LOCAL_HOST1}"
echo "${LOCAL_HOST2}"

# prefill
for i in "${!P_IP[@]}"; do
    if [[ "$LOCAL_HOST1" == "${P_IP[$i]}" || "$LOCAL_HOST2" == "${P_IP[$i]}" ]]; then
        echo "${P_IP[$i]}"
        export HCCL_BUFFSIZE=1536
        export DEEP_NORMAL_MODE_USE_INT8_QUANT=1
        export TASK_QUEUE_ENABLE=2
        export HCCL_SOCKET_IFNAME=lo
        export GLOO_SOCKET_IFNAME=lo
        python -m sglang.launch_server --model-path ${MODEL_PATH} --disaggregation-mode prefill --host ${P_IP[$i]} \
            --port 8000 --disaggregation-bootstrap-port $((8998+$i)) --trust-remote-code --nnodes 1 --node-rank 0 \
            --tp-size 16 --mem-fraction-static 0.81 --attention-backend ascend --device npu --quantization modelslim \
            --disaggregation-transfer-backend ascend --max-running-requests 8 --context-length 8192 --disable-radix-cache \
            --chunked-prefill-size -1 --max-prefill-tokens 28680 --moe-a2a-backend deepep --deepep-mode normal \
            --speculative-algorithm NEXTN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 \
            --dp-size 2 --enable-dp-attention --disable-shared-experts-fusion --dtype bfloat16 --enable-attn-tp-input-scattered
        NODE_RANK=$i
        break
    fi
done

# decode
for i in "${!D_IP[@]}"; do
    if [[ "$LOCAL_HOST1" == "${D_IP[$i]}" || "$LOCAL_HOST2" == "${D_IP[$i]}" ]]; then
        echo "${D_IP[$i]}"
        export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
        export SGLANG_ENABLE_SPEC_V2=1
        export HCCL_BUFFSIZE=650
        export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=78
        export TASK_QUEUE_ENABLE=1
        export SGLANG_SCHEDULER_SKIP_ALL_GATHER=1
        export HCCL_SOCKET_IFNAME=xxx
        export GLOO_SOCKET_IFNAME=xxx
        python -m sglang.launch_server --model-path ${MODEL_PATH} --disaggregation-mode decode --host ${D_IP[$i]} \
            --port 8001 --trust-remote-code --dist-init-addr ${D_IP[0]}:5000 --nnodes 2 --node-rank $i --tp-size 32 --dp-size 32 \
            --mem-fraction-static 0.815 --max-running-requests 832 --attention-backend ascend --device npu --quantization modelslim \
            --moe-a2a-backend deepep --enable-dp-attention --deepep-mode low_latency --enable-dp-lm-head --moe-dense-tp 1 \
            --cuda-graph-bs 12 14 16 18 20 22 24 26 --disaggregation-transfer-backend ascend --watchdog-timeout 9000 --context-length 8192 \
            --speculative-algorithm NEXTN --speculative-num-steps 2 --speculative-eagle-topk 1 --speculative-num-draft-tokens 3 \
            --tokenizer-worker-num 4 --prefill-round-robin-balance --disable-shared-experts-fusion --dtype bfloat16 \
            --load-balance-method decode_round_robin
        NODE_RANK=$i
        break
    fi
done
```

2. SGLang Model Gateway (formerly Router):

```shell
export SGLANG_DP_ROUND_ROBIN=1
python -m sglang_router.launch_router \
    --pd-disaggregation \
    --policy cache_aware \
    --prefill http://P_IP:8000 8998 \
    --prefill http://P_IP:8000 8999 \
    --decode http://D_IP:8001 \
    --host 127.0.0.1 \
    --port 6688 \
    --mini-lb
```

#### Test GSM8K

```python
from types import SimpleNamespace
from sglang.test.few_shot_gsm8k import run_eval

def gsm8k():
    args = SimpleNamespace(
        num_shots=5,
        data_path=None,
        num_questions=200,
        max_new_tokens=512,
        parallel=32,
        host=f"http://127.0.0.1",
        port=6688,
    )
    metrics = run_eval(args)
    print(f"{metrics=}")
    print(f"{metrics['accuracy']=}")

if __name__ == "__main__":
    gsm8k()
```