Update README.md
Browse files
README.md
CHANGED
|
@@ -10,7 +10,7 @@ Not tested extensively, use at your own risk.
|
|
| 10 |
|
| 11 |
Tested on 4x RTX Pro 6000 Blackwell via:
|
| 12 |
```
|
| 13 |
-
|
| 14 |
image: vllm/vllm-openai:nightly
|
| 15 |
container_name: inference
|
| 16 |
ports:
|
|
@@ -31,17 +31,26 @@ Tested on 4x RTX Pro 6000 Blackwell via:
|
|
| 31 |
- NCCL_SHM_DISABLE=0
|
| 32 |
- VLLM_USE_V1=1
|
| 33 |
- VLLM_USE_FLASHINFER_MOE_FP4=1
|
| 34 |
-
-
|
| 35 |
volumes:
|
| 36 |
-
- /models/:/
|
| 37 |
command:
|
| 38 |
-
- /
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
- --tensor-parallel-size
|
| 40 |
- "4"
|
| 41 |
- --gpu-memory-utilization
|
| 42 |
-
- "0.
|
|
|
|
|
|
|
| 43 |
- --dtype
|
| 44 |
-
- auto
|
|
|
|
|
|
|
| 45 |
- --kv-cache-dtype
|
| 46 |
- fp8
|
| 47 |
- --enable-auto-tool-choice
|
|
@@ -50,5 +59,4 @@ Tested on 4x RTX Pro 6000 Blackwell via:
|
|
| 50 |
- --host
|
| 51 |
- "0.0.0.0"
|
| 52 |
- --port
|
| 53 |
-
- "8000"
|
| 54 |
-
```
|
|
|
|
| 10 |
|
| 11 |
Tested on 4x RTX Pro 6000 Blackwell via:
|
| 12 |
```
|
| 13 |
+
inference:
|
| 14 |
image: vllm/vllm-openai:nightly
|
| 15 |
container_name: inference
|
| 16 |
ports:
|
|
|
|
| 31 |
- NCCL_SHM_DISABLE=0
|
| 32 |
- VLLM_USE_V1=1
|
| 33 |
- VLLM_USE_FLASHINFER_MOE_FP4=1
|
| 34 |
+
- OMP_NUM_THREADS=8
|
| 35 |
volumes:
|
| 36 |
+
- /models/GLM-4.6-NVFP4-4:/GLM-4.6:ro
|
| 37 |
command:
|
| 38 |
+
- /GLM-4.6
|
| 39 |
+
- --enable-expert-parallel
|
| 40 |
+
- --enable-prefix-caching
|
| 41 |
+
- --enable-chunked-prefill
|
| 42 |
+
- --served-model-name
|
| 43 |
+
- "GLM-4.6"
|
| 44 |
- --tensor-parallel-size
|
| 45 |
- "4"
|
| 46 |
- --gpu-memory-utilization
|
| 47 |
+
- "0.95"
|
| 48 |
+
- --max-num-batched-tokens
|
| 49 |
+
- "16384"
|
| 50 |
- --dtype
|
| 51 |
+
- "auto"
|
| 52 |
+
- --max-num-seqs
|
| 53 |
+
- "8"
|
| 54 |
- --kv-cache-dtype
|
| 55 |
- fp8
|
| 56 |
- --enable-auto-tool-choice
|
|
|
|
| 59 |
- --host
|
| 60 |
- "0.0.0.0"
|
| 61 |
- --port
|
| 62 |
+
- "8000"
```
|
|
|