Hanrui / sglang /docs /platforms /ascend_npu_deepseek_example.md

Add files using upload-large-folder tool

a227c91 verified about 2 months ago

9.33 kB

	## DeepSeek examples

	### Running DeepSeek-V3

	#### Running DeepSeek in PD mixed mode on 1 x Atlas 800I A3.

	W4A8 model weights can be found [here](https://modelers.cn/models/Modelers_Park/DeepSeek-R1-0528-w4a8).

	```shell
	# PD mixed mode: one server instance launched on a single node (no --disaggregation-mode).
	# MODEL_PATH is not set in this snippet; set it to the model weights directory before running.
	export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
	export STREAMS_PER_DEVICE=32

	# DeepEP communication settings
	export DEEP_NORMAL_MODE_USE_INT8_QUANT=1
	export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=32
	export HCCL_BUFFSIZE=1600

	# Speculative-decoding overlap settings
	export SGLANG_ENABLE_SPEC_V2=1
	export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1

	# NPU acceleration operators
	export SGLANG_NPU_USE_MLAPO=1
	export SGLANG_USE_FIA_NZ=1

	# Launch the server: TP 16, DP attention with dp-size 4, DeepEP MoE backend, NEXTN speculative decoding.
	python3 -m sglang.launch_server \
	--model-path ${MODEL_PATH} \
	--tp 16 \
	--trust-remote-code \
	--attention-backend ascend \
	--device npu \
	--quantization modelslim \
	--watchdog-timeout 9000 \
	--cuda-graph-bs 8 16 24 28 32 \
	--mem-fraction-static 0.68 \
	--max-running-requests 128 \
	--context-length 8188 \
	--disable-radix-cache \
	--chunked-prefill-size -1 \
	--max-prefill-tokens 16384 \
	--moe-a2a-backend deepep \
	--deepep-mode auto \
	--enable-dp-attention \
	--dp-size 4 \
	--enable-dp-lm-head \
	--speculative-algorithm NEXTN \
	--speculative-num-steps 3 \
	--speculative-eagle-topk 1 \
	--speculative-num-draft-tokens 4 \
	--dtype bfloat16
	```

	#### Running DeepSeek with PD disaggregation mode on 2 x Atlas 800I A3.

	W4A8 model weights can be found [here](https://modelers.cn/models/Modelers_Park/DeepSeek-R1-0528-w4a8).

	1. Prefill:

	```shell
	# Prefill server of the PD-disaggregated deployment (--disaggregation-mode prefill).
	# MODEL_PATH and PREFILL_HOST_IP are not set in this snippet; set them before running.
	export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
	export STREAMS_PER_DEVICE=32

	# MemFabric config store: replace <PREFILL_HOST_IP> and <PORT> with real values.
	export ASCEND_MF_STORE_URL="tcp://<PREFILL_HOST_IP>:<PORT>"

	# DeepEP communication settings
	export DEEP_NORMAL_MODE_USE_INT8_QUANT=1
	export HCCL_BUFFSIZE=1536

	# NPU acceleration operators
	export SGLANG_NPU_USE_MLAPO=1
	export SGLANG_USE_FIA_NZ=1
	export TASK_QUEUE_ENABLE=2

	# Launch the prefill server on port 8000; the bootstrap port (8996) is the one passed to the router.
	python -m sglang.launch_server \
	--model-path ${MODEL_PATH} \
	--host $PREFILL_HOST_IP \
	--port 8000 \
	--disaggregation-mode prefill \
	--disaggregation-bootstrap-port 8996 \
	--disaggregation-transfer-backend ascend \
	--trust-remote-code \
	--nnodes 1 \
	--node-rank 0 \
	--tp-size 16 \
	--mem-fraction-static 0.6 \
	--attention-backend ascend \
	--device npu \
	--quantization modelslim \
	--load-balance-method round_robin \
	--max-running-requests 8 \
	--context-length 8192 \
	--disable-radix-cache \
	--chunked-prefill-size -1 \
	--max-prefill-tokens 28680 \
	--moe-a2a-backend deepep \
	--deepep-mode normal \
	--speculative-algorithm NEXTN \
	--speculative-num-steps 3 \
	--speculative-eagle-topk 1 \
	--speculative-num-draft-tokens 4 \
	--dp-size 2 \
	--enable-dp-attention \
	--disable-shared-experts-fusion \
	--dtype bfloat16
	```

	2. Decode:

	```shell
	# Decode server of the PD-disaggregated deployment (--disaggregation-mode decode).
	# MODEL_PATH and DECODE_HOST_IP are not set in this snippet; set them before running.
	export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
	export STREAMS_PER_DEVICE=32

	# MemFabric config store: replace <PREFILL_HOST_IP> and <PORT> with real values.
	export ASCEND_MF_STORE_URL="tcp://<PREFILL_HOST_IP>:<PORT>"

	# DeepEP communication settings
	export HCCL_BUFFSIZE=720
	export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=88

	# Speculative-decoding overlap settings
	export SGLANG_ENABLE_SPEC_V2=1
	export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1

	# NPU acceleration operators
	# TASK_QUEUE_ENABLE is exported by the prefill snippet; make sure it is not inherited here.
	unset TASK_QUEUE_ENABLE
	export SGLANG_NPU_USE_MLAPO=1
	export SGLANG_USE_FIA_NZ=1

	# Keep --max-running-requests <= (largest --cuda-graph-bs value) * dp-size;
	# performance degrades significantly beyond that limit. Here: 22 * 16 = 352.
	python -m sglang.launch_server \
	--model-path ${MODEL_PATH} \
	--disaggregation-mode decode \
	--host $DECODE_HOST_IP \
	--port 8001 \
	--trust-remote-code \
	--nnodes 1 \
	--node-rank 0 \
	--tp-size 16 \
	--dp-size 16 \
	--mem-fraction-static 0.8 \
	--max-running-requests 352 \
	--attention-backend ascend \
	--device npu \
	--quantization modelslim \
	--prefill-round-robin-balance \
	--moe-a2a-backend deepep \
	--enable-dp-attention \
	--deepep-mode low_latency \
	--enable-dp-lm-head \
	--cuda-graph-bs 8 10 12 14 16 18 20 22 \
	--disaggregation-transfer-backend ascend \
	--watchdog-timeout 9000 \
	--context-length 8192 \
	--speculative-algorithm NEXTN \
	--speculative-num-steps 3 \
	--speculative-eagle-topk 1 \
	--speculative-num-draft-tokens 4 \
	--disable-shared-experts-fusion \
	--dtype bfloat16 \
	--tokenizer-worker-num 4
	```

	3. SGLang Model Gateway (formerly Router):

	```shell
	# Replace <PREFILL_HOST_IP> and <DECODE_HOST_IP> with the real addresses, removing the
	# angle brackets: left unquoted, the shell would interpret them as redirections.
	# The number after the prefill URL (8996) is the prefill server's --disaggregation-bootstrap-port.
	python -m sglang_router.launch_router \
	--pd-disaggregation \
	--policy cache_aware \
	--prefill http://<PREFILL_HOST_IP>:8000 8996 \
	--decode http://<DECODE_HOST_IP>:8001 \
	--host 127.0.0.1 \
	--port 6688
	```

	#### Running DeepSeek with PD disaggregation on 4 x Atlas 800I A3.

	W8A8 model weights can be found [here](https://modelers.cn/models/State_Cloud/Deepseek-R1-bf16-hfd-w8a8).

	1. Prefill & Decode:

	```shell
	# Launch script for the 4-node PD-disaggregated deployment. Run the same script on every
	# node: it compares this host's addresses with P_IP / D_IP and starts the matching role.
	# Fill in the placeholders (IP lists, MODEL_PATH, socket interface names) before running.

	# Host tuning. These commands write to /sys and change kernel parameters via sysctl,
	# so they normally need root privileges.
	echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
	sysctl -w vm.swappiness=0
	sysctl -w kernel.numa_balancing=0
	sysctl -w kernel.sched_migration_cost_ns=50000
	export SGLANG_SET_CPU_AFFINITY=1
	unset ASCEND_LAUNCH_BLOCKING
	source /usr/local/Ascend/ascend-toolkit/set_env.sh
	source /usr/local/Ascend/nnal/atb/set_env.sh
	export PATH=/usr/local/Ascend/8.5.0/compiler/bishengir/bin:$PATH

	export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
	export STREAMS_PER_DEVICE=32

	# MemFabric config store: replace the placeholder with the first prefill node's IP.
	export ASCEND_MF_STORE_URL="tcp://your prefill ip1:24669"

	P_IP=('your prefill ip1' 'your prefill ip2')

	D_IP=('your decode ip1' 'your decode ip2')

	MODEL_PATH=xxx

	export SGLANG_NPU_USE_MLAPO=1
	export SGLANG_USE_FIA_NZ=1

	# First two addresses reported by `hostname -I`; used to detect this node's role.
	LOCAL_HOST1=$(hostname -I | awk '{print $1}')
	LOCAL_HOST2=$(hostname -I | awk '{print $2}')
	echo "${LOCAL_HOST1}"
	echo "${LOCAL_HOST2}"

	# prefill: start a prefill server if one of this host's addresses is listed in P_IP.
	# Each prefill node gets its own bootstrap port: 8998 + its index in P_IP.
	for i in "${!P_IP[@]}"; do
	  if [[ "$LOCAL_HOST1" == "${P_IP[$i]}" || "$LOCAL_HOST2" == "${P_IP[$i]}" ]]; then
	    echo "${P_IP[$i]}"
	    export HCCL_BUFFSIZE=1536
	    export DEEP_NORMAL_MODE_USE_INT8_QUANT=1
	    export TASK_QUEUE_ENABLE=2

	    export HCCL_SOCKET_IFNAME=lo
	    export GLOO_SOCKET_IFNAME=lo
	    python -m sglang.launch_server --model-path "${MODEL_PATH}" --disaggregation-mode prefill --host "${P_IP[$i]}" \
	      --port 8000 --disaggregation-bootstrap-port $((8998 + i)) --trust-remote-code --nnodes 1 --node-rank 0 \
	      --tp-size 16 --mem-fraction-static 0.81 --attention-backend ascend --device npu --quantization modelslim \
	      --disaggregation-transfer-backend ascend --max-running-requests 8 --context-length 8192 --disable-radix-cache \
	      --chunked-prefill-size -1 --max-prefill-tokens 28680 --moe-a2a-backend deepep --deepep-mode normal \
	      --speculative-algorithm NEXTN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 \
	      --dp-size 2 --enable-dp-attention --disable-shared-experts-fusion --dtype bfloat16 --enable-attn-tp-input-scattered
	    NODE_RANK=$i
	    break
	  fi
	done

	# decode: start a decode server if one of this host's addresses is listed in D_IP.
	# The decode nodes form one 2-node group (--nnodes 2) rendezvousing at D_IP[0]:5000;
	# the index in D_IP is used as --node-rank.
	# max-running-requests (832) = largest --cuda-graph-bs value (26) * dp-size (32).
	for i in "${!D_IP[@]}"; do
	  if [[ "$LOCAL_HOST1" == "${D_IP[$i]}" || "$LOCAL_HOST2" == "${D_IP[$i]}" ]]; then
	    echo "${D_IP[$i]}"
	    export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
	    export SGLANG_ENABLE_SPEC_V2=1
	    export HCCL_BUFFSIZE=650
	    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=78
	    export TASK_QUEUE_ENABLE=1
	    export SGLANG_SCHEDULER_SKIP_ALL_GATHER=1
	    # Placeholders: replace xxx with the network interface name to use.
	    export HCCL_SOCKET_IFNAME=xxx
	    export GLOO_SOCKET_IFNAME=xxx
	    python -m sglang.launch_server --model-path "${MODEL_PATH}" --disaggregation-mode decode --host "${D_IP[$i]}" \
	      --port 8001 --trust-remote-code --dist-init-addr "${D_IP[0]}:5000" --nnodes 2 --node-rank "$i" --tp-size 32 --dp-size 32 \
	      --mem-fraction-static 0.815 --max-running-requests 832 --attention-backend ascend --device npu --quantization modelslim \
	      --moe-a2a-backend deepep --enable-dp-attention --deepep-mode low_latency --enable-dp-lm-head --moe-dense-tp 1 \
	      --cuda-graph-bs 12 14 16 18 20 22 24 26 --disaggregation-transfer-backend ascend --watchdog-timeout 9000 --context-length 8192 \
	      --speculative-algorithm NEXTN --speculative-num-steps 2 --speculative-eagle-topk 1 --speculative-num-draft-tokens 3 \
	      --tokenizer-worker-num 4 --prefill-round-robin-balance --disable-shared-experts-fusion --dtype bfloat16 \
	      --load-balance-method decode_round_robin
	    NODE_RANK=$i
	    break
	  fi
	done
	```

	2. SGLang Model Gateway (formerly Router):

	```shell
	# P_IP and D_IP are placeholders for real addresses. Pass one --prefill entry per prefill
	# node; the number after each URL is that node's --disaggregation-bootstrap-port, which the
	# prefill launch script sets to 8998 + the node's index in its P_IP list (8998, 8999).
	export SGLANG_DP_ROUND_ROBIN=1
	python -m sglang_router.launch_router \
	--pd-disaggregation \
	--policy cache_aware \
	--prefill http://P_IP:8000 8998 \
	--prefill http://P_IP:8000 8999 \
	--decode http://D_IP:8001 \
	--host 127.0.0.1 \
	--port 6688 \
	--mini-lb
	```

	#### test gsm8k

	```python
	from types import SimpleNamespace

	from sglang.test.few_shot_gsm8k import run_eval


	def gsm8k():
	    """Run the few-shot GSM8K evaluation and print the resulting metrics.

	    The request target is http://127.0.0.1:6688, the host and port the
	    router examples listen on.
	    """
	    args = SimpleNamespace(
	        num_shots=5,
	        data_path=None,
	        num_questions=200,
	        max_new_tokens=512,
	        parallel=32,
	        host="http://127.0.0.1",
	        port=6688,
	    )
	    metrics = run_eval(args)
	    print(f"{metrics=}")
	    print(f"{metrics['accuracy']=}")


	if __name__ == "__main__":
	    gsm8k()
	```