lukealonso committed
Commit 635d308 · verified · Parent: 8e88cbe

Update README.md

Files changed (1):
1. README.md +16 -8
README.md CHANGED
@@ -10,7 +10,7 @@ Not tested extensively, use at your own risk.
 
 Tested on 4x RTX Pro 6000 Blackwell via:
 ```
-inference:
+inference:
   image: vllm/vllm-openai:nightly
   container_name: inference
   ports:
@@ -31,17 +31,26 @@ Tested on 4x RTX Pro 6000 Blackwell via:
   - NCCL_SHM_DISABLE=0
   - VLLM_USE_V1=1
   - VLLM_USE_FLASHINFER_MOE_FP4=1
-  - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+  - OMP_NUM_THREADS=8
   volumes:
-  - /models/:/models/
+  - /models/GLM-4.6-NVFP4-4:/GLM-4.6:ro
   command:
-  - /models/GLM-4.6-NVFP4
+  - /GLM-4.6
+  - --enable-expert-parallel
+  - --enable-prefix-caching
+  - --enable-chunked-prefill
+  - --served-model-name
+  - "GLM-4.6"
   - --tensor-parallel-size
   - "4"
   - --gpu-memory-utilization
-  - "0.9"
+  - "0.95"
+  - --max-num-batched-tokens
+  - "16384"
   - --dtype
-  - auto
+  - "auto"
+  - --max-num-seqs
+  - "8"
   - --kv-cache-dtype
   - fp8
   - --enable-auto-tool-choice
@@ -50,5 +59,4 @@ Tested on 4x RTX Pro 6000 Blackwell via:
   - --host
   - "0.0.0.0"
   - --port
-  - "8000"
-```
+  - "8000"```