---
base_model:
- MiniMaxAI/MiniMax-M2
---
An NVIDIA TensorRT Model Optimizer (modelopt) NVFP4-quantized build of MiniMax-M2.
Tested (but not extensively validated) on *2x* RTX Pro 6000 Blackwell GPUs via the following docker-compose service definition:
```yaml
# docker-compose service running vLLM's OpenAI-compatible server for the
# NVFP4-quantized MiniMax-M2 model (tensor-parallel across 2 GPUs).
inference:
  # Pinned nightly image — NVFP4 MoE support requires a recent build.
  image: vllm/vllm-openai:nightly-96142f209453a381fcaf9d9d010bbf8711119a77
  container_name: inference
  ports:
    - "0.0.0.0:8000:8000"
  gpus: "all"
  # Large shared memory + host IPC for NCCL/tensor-parallel communication.
  shm_size: "32g"
  ipc: "host"
  ulimits:
    memlock: -1
    nofile: 1048576
  environment:
    # NCCL transport tuning: no InfiniBand/NVLS, keep P2P and SHM enabled.
    - NCCL_IB_DISABLE=1
    - NCCL_NVLS_ENABLE=0
    - NCCL_P2P_DISABLE=0
    - NCCL_SHM_DISABLE=0
    - VLLM_USE_V1=1
    # Use the FlashInfer FP4 MoE kernels for the NVFP4 checkpoint.
    - VLLM_USE_FLASHINFER_MOE_FP4=1
    - OMP_NUM_THREADS=8
    - SAFETENSORS_FAST_GPU=1
  volumes:
    - /dev/shm:/dev/shm
  command:
    # Positional arg: the model to serve.
    - lukealonso/MiniMax-M2-NVFP4
    - --enable-auto-tool-choice
    - --tool-call-parser
    - minimax_m2
    - --reasoning-parser
    - minimax_m2_append_think
    - --all2all-backend
    - pplx
    - --enable-expert-parallel
    - --enable-prefix-caching
    - --enable-chunked-prefill
    - --served-model-name
    - "MiniMax-M2"
    - --tensor-parallel-size
    - "2"
    - --gpu-memory-utilization
    - "0.95"
    - --max-num-batched-tokens
    - "16384"
    - --dtype
    - "auto"
    - --max-num-seqs
    - "8"
    - --kv-cache-dtype
    - fp8
    - --host
    - "0.0.0.0"
    - --port
    - "8000"
``` |