Upload examples/start_serving_openpangu_r_72b_2512.sh with huggingface_hub
Browse files
examples/start_serving_openpangu_r_72b_2512.sh
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env bash
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
#
# Serving launch script for openPangu-R-72B (2512) on Ascend NPUs.
# The exports below configure device visibility, vLLM runtime behavior,
# and HCCL collective-communication settings before the servers start.

# Use the first four Ascend NPUs on this host (matches --tp 4 below).
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3

# vLLM engine selection and worker process model.
export VLLM_USE_V1=1
export VLLM_WORKER_MULTIPROC_METHOD=fork
export VLLM_ENABLE_MC2=0
export USING_LCCL_COM=0

# Pangu-specific toggles.
export OMNI_USE_PANGU=1
export ENABLE_PREFILL_TND=1

export HCCL_OP_EXPANSION_MODE="AIV"
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
# Allow the NPU allocator to grow segments instead of failing on fragmentation.
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True

export HCCL_RDMA_TIMEOUT=5
export HCCL_DETERMINISTIC=False
# 3 = ERROR-level Ascend logs only; keeps startup output readable.
export ASCEND_GLOBAL_LOG_LEVEL=3
export CPU_AFFINITY_CONF=2
export VLLM_LOGGING_LEVEL=INFO

# HCCL buffer sizing (MB) and generous timeouts (s) for large-model startup.
export HCCL_BUFFSIZE=1000
export HCCL_CONNECT_TIMEOUT=1800
export HCCL_EXEC_TIMEOUT=1800
export HCCL_INTRA_ROCE_ENABLE=1
export HCCL_INTRA_PCIE_ENABLE=0
export FORCE_ENABLE_CHUNK_PREFILL=1
# Feature toggles: callers may pre-set USE_REASONING / USE_TOOL (1 = enabled).
# Both default to enabled when unset or empty.
export USE_REASONING=${USE_REASONING:-1}
export USE_TOOL=${USE_TOOL:-1}

# Initialize to empty strings so the later unquoted expansions are safe
# no-ops when a feature is disabled (and safe under `set -u`).
reasoning=""
tools=""
if [ "$USE_REASONING" = "1" ]; then
  reasoning="--reasoning-parser pangu"
fi
if [ "$USE_TOOL" = "1" ]; then
  tools="--enable-auto-tool-choice --tool-call-parser pangu"
fi
# Put the omniinfer checkout on the Python module search path.
# NOTE: /path/to/omniinfer/ is a placeholder — edit before running.
export PYTHONPATH=/path/to/omniinfer/:$PYTHONPATH

# Remove any stale torchair graph-compile cache from a previous run so the
# graph is rebuilt against the current configuration.
rm -rf -- .torchair_cache/
# Launch one OpenAI-compatible API server: TP=4, DP=1, expert parallelism 4.
# NOTE: /path/to/model/ is a placeholder — edit before running.
# $reasoning and $tools are intentionally unquoted: each holds zero or more
# whitespace-separated CLI flags that must word-split into separate argv
# entries (empty values expand to nothing).
# NOTE(review): the launch is backgrounded with `&` and the script then
# exits; presumably a supervisor keeps the process alive — confirm, or add
# a trailing `wait` if this script is expected to stay in the foreground.
python start_api_servers.py \
  --num-servers 1 \
  --model-path /path/to/model/ \
  --master-ip 0.0.0.0 \
  --tp 4 \
  --num-dp 1 \
  --master-port 3512 \
  --served-model-name openpangu_r_72b_2512 \
  --log-dir apiserverlog_pangu72B_hybrid_chunk \
  --extra-args "--max-num-batched-tokens 2048 --enforce-eager --no-enable-prefix-caching --enable-expert-parallel --max-num-seqs 32 --long-prefill-token-threshold 1024" \
  --base-api-port 8000 \
  --gpu-util 0.90 \
  --no-enable-prefix-caching \
  --max-model-len 131072 \
  $reasoning \
  $tools \
  --additional-config '{"graph_model_compile_config":{"level":1, "use_ge_graph_cached":true, "decode_gear_list": [32]}, "enable_hybrid_graph_mode": false, "expert_parallel_size": 4, "expert_tensor_parallel_size": 1}' &