lukealonso committed
Commit 635d308 · verified · Parent: 8e88cbe

Update README.md

Files changed (1):
1. README.md +16 -8
README.md CHANGED
@@ -10,7 +10,7 @@ Not tested extensively, use at your own risk.
 
 Tested on 4x RTX Pro 6000 Blackwell via:
 ```
-inference:
+inference:
   image: vllm/vllm-openai:nightly
   container_name: inference
   ports:
@@ -31,17 +31,26 @@ Tested on 4x RTX Pro 6000 Blackwell via:
   - NCCL_SHM_DISABLE=0
   - VLLM_USE_V1=1
   - VLLM_USE_FLASHINFER_MOE_FP4=1
-  - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+  - OMP_NUM_THREADS=8
   volumes:
-  - /models/:/models/
+  - /models/GLM-4.6-NVFP4-4:/GLM-4.6:ro
   command:
-  - /models/GLM-4.6-NVFP4
+  - /GLM-4.6
+  - --enable-expert-parallel
+  - --enable-prefix-caching
+  - --enable-chunked-prefill
+  - --served-model-name
+  - "GLM-4.6"
   - --tensor-parallel-size
   - "4"
   - --gpu-memory-utilization
-  - "0.9"
+  - "0.95"
+  - --max-num-batched-tokens
+  - "16384"
   - --dtype
-  - auto
+  - "auto"
+  - --max-num-seqs
+  - "8"
   - --kv-cache-dtype
   - fp8
   - --enable-auto-tool-choice
@@ -50,5 +59,4 @@ Tested on 4x RTX Pro 6000 Blackwell via:
   - --host
   - "0.0.0.0"
   - --port
-  - "8000"
-```
+  - "8000"```