# Intern-S1-Pro Deployment Guide
The Intern-S1-Pro release is a 1T-parameter model stored in FP8 format. Deployment requires at least two 8-GPU H200 nodes (16 GPUs in total), using either of the following parallelism configurations:
- Tensor Parallelism (TP)
- Data Parallelism (DP) + Expert Parallelism (EP)
> NOTE: The deployment examples in this guide are provided for reference only and may not represent the latest or most optimized configurations. Inference frameworks are under active development — always consult the official documentation from each framework’s maintainers to ensure peak performance and compatibility.
## LMDeploy
Requires `lmdeploy>=0.12.0`.
- Tensor Parallelism
```bash
# start Ray on node 0 and node 1 first (see the sketch after this block)
# node 0
lmdeploy serve api_server internlm/Intern-S1-Pro --backend pytorch --tp 16
```
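The `start Ray` comment above refers to bringing up a Ray cluster across both machines before launching the server. A minimal sketch, assuming Ray's default port `6379` and that node 1 can reach `${node0_ip}`:
```bash
# node 0: start the Ray head node
ray start --head --port=6379
# node 1: join the cluster by pointing at the head node
ray start --address=${node0_ip}:6379
# optional: confirm both nodes (16 GPUs) are registered before serving
ray status
```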
- Data Parallelism + Expert Parallelism
```bash
# node 0, proxy server
lmdeploy serve proxy --server-name ${proxy_server_ip} --server-port ${proxy_server_port} --routing-strategy 'min_expected_latency' --serving-strategy Hybrid
# node 0
export LMDEPLOY_DP_MASTER_ADDR=${node0_ip}
export LMDEPLOY_DP_MASTER_PORT=29555
lmdeploy serve api_server \
internlm/Intern-S1-Pro \
--backend pytorch \
--tp 1 \
--dp 16 \
--ep 16 \
--proxy-url http://${proxy_server_ip}:${proxy_server_port} \
--nnodes 2 \
--node-rank 0 \
--reasoning-parser intern-s1 \
--tool-call-parser qwen3
# node 1
export LMDEPLOY_DP_MASTER_ADDR=${node0_ip}
export LMDEPLOY_DP_MASTER_PORT=29555
lmdeploy serve api_server \
internlm/Intern-S1-Pro \
--backend pytorch \
--tp 1 \
--dp 16 \
--ep 16 \
--proxy-url http://${proxy_server_ip}:${proxy_server_port} \
--nnodes 2 \
--node-rank 1 \
--reasoning-parser intern-s1 \
--tool-call-parser qwen3
```
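Once both `api_server` instances have registered with the proxy, clients talk to the proxy's OpenAI-compatible endpoint. A minimal smoke test with `curl`, assuming the proxy address from the commands above (query `GET /v1/models` first if you are unsure of the served model name):
```bash
curl http://${proxy_server_ip}:${proxy_server_port}/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "internlm/Intern-S1-Pro",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 64
      }'
```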
## vLLM
- Tensor Parallelism + Expert Parallelism
```bash
# start ray on node 0 and node 1 (same Ray setup as shown in the LMDeploy section)
# node 0
export VLLM_ENGINE_READY_TIMEOUT_S=10000
vllm serve internlm/Intern-S1-Pro \
--tensor-parallel-size 16 \
--enable-expert-parallel \
--distributed-executor-backend ray \
--max-model-len 65536 \
--trust-remote-code \
--reasoning-parser deepseek_r1 \
--enable-auto-tool-choice \
--tool-call-parser hermes
```
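Since the server is launched with `--enable-auto-tool-choice` and `--tool-call-parser hermes`, it accepts OpenAI-style tool definitions. A sketch against vLLM's default serving port `8000`, where `get_weather` is a hypothetical function used only for illustration:
```bash
curl http://${node0_ip}:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "internlm/Intern-S1-Pro",
        "messages": [{"role": "user", "content": "What is the weather in Shanghai?"}],
        "tools": [{
          "type": "function",
          "function": {
            "name": "get_weather",
            "description": "Hypothetical weather lookup for a city",
            "parameters": {
              "type": "object",
              "properties": {"city": {"type": "string"}},
              "required": ["city"]
            }
          }
        }]
      }'
```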
- Data Parallelism + Expert Parallelism
```bash
# node 0
export VLLM_ENGINE_READY_TIMEOUT_S=10000
vllm serve internlm/Intern-S1-Pro \
--all2all-backend deepep_low_latency \
--tensor-parallel-size 1 \
--enable-expert-parallel \
--data-parallel-size 16 \
--data-parallel-size-local 8 \
--data-parallel-address ${node0_ip} \
--data-parallel-rpc-port 13345 \
--gpu-memory-utilization 0.8 \
--mm-processor-cache-gb 0 \
--media-io-kwargs '{"video": {"num_frames": 768, "fps": 2}}' \
--max-model-len 65536 \
--trust-remote-code \
--api-server-count 8 \
--reasoning-parser deepseek_r1 \
--enable-auto-tool-choice \
--tool-call-parser hermes
# node 1
export VLLM_ENGINE_READY_TIMEOUT_S=10000
vllm serve internlm/Intern-S1-Pro \
--all2all-backend deepep_low_latency \
--tensor-parallel-size 1 \
--enable-expert-parallel \
--data-parallel-size 16 \
--data-parallel-size-local 8 \
--data-parallel-start-rank 8 \
--data-parallel-address ${node0_ip} \
--data-parallel-rpc-port 13345 \
--gpu-memory-utilization 0.8 \
--mm-processor-cache-gb 0 \
--media-io-kwargs '{"video": {"num_frames": 768, "fps": 2}}' \
--max-model-len 65536 \
--trust-remote-code \
--headless \
--reasoning-parser deepseek_r1 \
--enable-auto-tool-choice \
--tool-call-parser hermes
```
> NOTE: To prevent out-of-memory (OOM) errors, we limit the context length using `--max-model-len 65536`. For datasets requiring longer responses, you may increase this value as needed. Additionally, video inference can consume substantial memory in vLLM API server processes; we therefore recommend setting `--media-io-kwargs '{"video": {"num_frames": 768, "fps": 2}}'` to constrain preprocessing memory usage during video benchmarking.
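For reference, a video request against the configuration above might look like the sketch below, assuming vLLM's default serving port `8000`; the `video_url` content part is what the `--media-io-kwargs` preprocessing limits apply to, and the URL is a placeholder:
```bash
curl http://${node0_ip}:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "internlm/Intern-S1-Pro",
        "messages": [{
          "role": "user",
          "content": [
            {"type": "video_url", "video_url": {"url": "https://example.com/sample.mp4"}},
            {"type": "text", "text": "Describe this video."}
          ]
        }]
      }'
```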
## SGLang
- Tensor Parallelism + Expert Parallelism
```bash
# export on both node 0 and node 1
export DIST_ADDR=${master_node_ip}:${master_node_port}
# node 0
python3 -m sglang.launch_server \
--model-path internlm/Intern-S1-Pro \
--tp 16 \
--ep 16 \
--mem-fraction-static 0.85 \
--trust-remote-code \
--dist-init-addr ${DIST_ADDR} \
--nnodes 2 \
--attention-backend fa3 \
--mm-attention-backend fa3 \
--keep-mm-feature-on-device \
--node-rank 0 \
--reasoning-parser qwen3 \
--tool-call-parser qwen
# node 1
python3 -m sglang.launch_server \
--model-path internlm/Intern-S1-Pro \
--tp 16 \
--ep 16 \
--mem-fraction-static 0.85 \
--trust-remote-code \
--dist-init-addr ${DIST_ADDR} \
--nnodes 2 \
--attention-backend fa3 \
--mm-attention-backend fa3 \
--keep-mm-feature-on-device \
--node-rank 1 \
--reasoning-parser qwen3 \
--tool-call-parser qwen
```
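`sglang.launch_server` exposes an OpenAI-compatible HTTP server on the rank-0 node, by default on port `30000`. A minimal check, assuming defaults:
```bash
# confirm the server is up
curl http://${master_node_ip}:30000/health
# send a chat completion request
curl http://${master_node_ip}:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "internlm/Intern-S1-Pro",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 64
      }'
```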