lukealonso commited on
Commit
7fd9221
·
verified ·
1 Parent(s): 2acbc0f

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +60 -0
README.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model:
3
+ - MiniMaxAI/MiniMax-M2
4
+ ---
5
+
6
+ MiniMax-M2 quantized to NVFP4 using NVIDIA TensorRT Model Optimizer (modelopt).
7
+
8
+ Tested on 2× NVIDIA RTX PRO 6000 Blackwell GPUs.
9
+ Example `docker-compose` service definition for serving this model with vLLM:
10
+ inference:
11
+ image: vllm/vllm-openai:nightly
12
+ container_name: inference
13
+ ports:
14
+ - "0.0.0.0:8000:8000"
15
+ gpus: "all"
16
+ shm_size: "32g"
17
+ ipc: "host"
18
+ ulimits:
19
+ memlock: -1
20
+ nofile: 1048576
21
+ environment:
22
+ - NCCL_IB_DISABLE=1
23
+ - NCCL_NVLS_ENABLE=0
24
+ - NCCL_P2P_DISABLE=0
25
+ - NCCL_SHM_DISABLE=0
26
+ - VLLM_USE_V1=1
27
+ - VLLM_USE_FLASHINFER_MOE_FP4=1
28
+ - OMP_NUM_THREADS=8
29
+ - SAFETENSORS_FAST_GPU=1
30
+ volumes:
31
+ - /dev/shm:/dev/shm
32
+ command:
33
+ - lukealonso/MiniMax-M2-NVFP4
34
+ - --enable-auto-tool-choice
35
+ - --tool-call-parser
36
+ - minimax_m2
37
+ - --reasoning-parser
38
+ - minimax_m2_append_think
39
+ - --all2all-backend
40
+ - pplx
41
+ - --enable-prefix-caching
42
+ - --enable-chunked-prefill
43
+ - --served-model-name
44
+ - "MiniMax-M2"
45
+ - --tensor-parallel-size
46
+ - "2"
47
+ - --gpu-memory-utilization
48
+ - "0.95"
49
+ - --max-num-batched-tokens
50
+ - "16384"
51
+ - --dtype
52
+ - "auto"
53
+ - --max-num-seqs
54
+ - "8"
55
+ - --kv-cache-dtype
56
+ - fp8
57
+ - --host
58
+ - "0.0.0.0"
59
+ - --port
60
+ - "8000"