lukealonso commited on
Commit
7fd9221
·
verified ·
1 Parent(s): 2acbc0f

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +60 -0
README.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model:
3
+ - MiniMaxAI/MiniMax-M2
4
+ ---
5
+
6
+ MiniMax-M2 quantized to NVFP4 using NVIDIA TensorRT Model Optimizer (modelopt).
7
+
8
+ Tested on 2× NVIDIA RTX PRO 6000 Blackwell GPUs.
9
+ Example `docker-compose` service definition for serving this model with vLLM:
10
+ inference:
11
+ image: vllm/vllm-openai:nightly
12
+ container_name: inference
13
+ ports:
14
+ - "0.0.0.0:8000:8000"
15
+ gpus: "all"
16
+ shm_size: "32g"
17
+ ipc: "host"
18
+ ulimits:
19
+ memlock: -1
20
+ nofile: 1048576
21
+ environment:
22
+ - NCCL_IB_DISABLE=1
23
+ - NCCL_NVLS_ENABLE=0
24
+ - NCCL_P2P_DISABLE=0
25
+ - NCCL_SHM_DISABLE=0
26
+ - VLLM_USE_V1=1
27
+ - VLLM_USE_FLASHINFER_MOE_FP4=1
28
+ - OMP_NUM_THREADS=8
29
+ - SAFETENSORS_FAST_GPU=1
30
+ volumes:
31
+ - /dev/shm:/dev/shm
32
+ command:
33
+ - lukealonso/MiniMax-M2-NVFP4
34
+ - --enable-auto-tool-choice
35
+ - --tool-call-parser
36
+ - minimax_m2
37
+ - --reasoning-parser
38
+ - minimax_m2_append_think
39
+ - --all2all-backend
40
+ - pplx
41
+ - --enable-prefix-caching
42
+ - --enable-chunked-prefill
43
+ - --served-model-name
44
+ - "MiniMax-M2"
45
+ - --tensor-parallel-size
46
+ - "2"
47
+ - --gpu-memory-utilization
48
+ - "0.95"
49
+ - --max-num-batched-tokens
50
+ - "16384"
51
+ - --dtype
52
+ - "auto"
53
+ - --max-num-seqs
54
+ - "8"
55
+ - --kv-cache-dtype
56
+ - fp8
57
+ - --host
58
+ - "0.0.0.0"
59
+ - --port
60
+ - "8000"