---
base_model:
- MiniMaxAI/MiniMax-M2
---

MiniMax-M2 quantized to NVFP4 with NVIDIA's TensorRT Model Optimizer (modelopt).
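
For context, a minimal sketch of what an NVFP4 post-training quantization pass with modelopt generally looks like. This is an assumption about the recipe, not the script used to produce this checkpoint; the calibration prompts are placeholders, and loading details (dtype, device map) will differ for a model this size.

```python
# Hedged sketch of NVFP4 PTQ with nvidia-modelopt; not the exact
# recipe behind this repo's weights.
import torch
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "MiniMaxAI/MiniMax-M2"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto",
    trust_remote_code=True,
)

# Placeholder calibration set; a real pass would use a few hundred samples.
calibration_texts = ["The quick brown fox jumps over the lazy dog."]

def forward_loop(m):
    # modelopt runs this to gather the activation statistics that
    # NVFP4 scale factors are derived from.
    with torch.no_grad():
        for text in calibration_texts:
            inputs = tokenizer(text, return_tensors="pt").to(m.device)
            m(**inputs)

# NVFP4_DEFAULT_CFG is modelopt's stock NVFP4 quantization config.
model = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop)
# Checkpoint export (to the safetensors layout vLLM consumes) is done
# with modelopt's export utilities, omitted here.
```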

Tested (but not extensively validated) on *2x* RTX Pro 6000 Blackwell via the following Docker Compose service:

```yaml
services:
  inference:
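    # Pinned vLLM nightly build; newer builds may also work but are untested here.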
    image: vllm/vllm-openai:nightly-96142f209453a381fcaf9d9d010bbf8711119a77
    container_name: inference
    ports:
      - "0.0.0.0:8000:8000"
    gpus: "all"
    shm_size: "32g"
    ipc: "host"
    ulimits:
      memlock: -1
      nofile: 1048576
    environment:
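      # NCCL tuned for a single node: InfiniBand and NVLink SHARP (NVLS)
      # off, P2P and shared-memory transports explicitly on.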
      - NCCL_IB_DISABLE=1
      - NCCL_NVLS_ENABLE=0
      - NCCL_P2P_DISABLE=0
      - NCCL_SHM_DISABLE=0
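      # vLLM V1 engine plus the FlashInfer NVFP4 MoE kernel path.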
      - VLLM_USE_V1=1
      - VLLM_USE_FLASHINFER_MOE_FP4=1
      - OMP_NUM_THREADS=8
      - SAFETENSORS_FAST_GPU=1
    volumes:
      - /dev/shm:/dev/shm
    command:
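      # Args are forwarded to the image's vLLM serve entrypoint; the
      # first is the model to pull and serve.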
      - lukealonso/MiniMax-M2-NVFP4
      - --enable-auto-tool-choice
      - --tool-call-parser
      - minimax_m2
      - --reasoning-parser
      - minimax_m2_append_think
      - --all2all-backend
      - pplx
      - --enable-expert-parallel
      - --enable-prefix-caching
      - --enable-chunked-prefill
      - --served-model-name
      - "MiniMax-M2"
      - --tensor-parallel-size
      - "2"
      - --gpu-memory-utilization
      - "0.95"
      - --max-num-batched-tokens
      - "16384"
      - --dtype
      - "auto"
      - --max-num-seqs
      - "8"
      - --kv-cache-dtype
      - fp8
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
```
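
Once the container is healthy, a quick smoke test with the OpenAI Python client (base URL, port, and model name follow from `--host`, `--port`, and `--served-model-name` above; the prompt is arbitrary):

```python
# Smoke test against the vLLM OpenAI-compatible endpoint defined above.
from openai import OpenAI

# vLLM does not check the API key unless one was configured at startup.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

resp = client.chat.completions.create(
    model="MiniMax-M2",  # matches --served-model-name
    messages=[{"role": "user", "content": "Briefly introduce yourself."}],
    max_tokens=128,
)
print(resp.choices[0].message.content)
```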