vLLM error for KV weight scaling - workaround
Thank you so much for your work quantising the model. The latest vLLM nightlies (cu130-nightly) have a bug in the ModelOpt loader; my attempt to load the model produces many WARNING messages in the log:
```
(Worker_PP1 pid=90) WARNING 02-15 06:51:46 [weight_utils.py:1235] Found k_scale in the checkpoint (e.g. layers.8.self_attn.k_scale), but not found the expected name in the model (e.g. layers.8.self_attn.attn.k_scale). k_scale is not loaded.
(Worker_PP2 pid=91) WARNING 02-15 06:51:46 [weight_utils.py:1235] Found v_scale in the checkpoint (e.g. layers.8.self_attn.v_scale), but not found the expected name in the model (e.g. layers.8.self_attn.attn.v_scale). v_scale is not loaded.
(Worker_PP1 pid=90) WARNING 02-15 06:51:46 [weight_utils.py:1235] Found v_scale in the checkpoint (e.g. layers.8.self_attn.v_scale), but not found the expected name in the model (e.g. layers.8.self_attn.attn.v_scale). v_scale is not loaded.
(Worker_PP2 pid=91) WARNING 02-15 06:51:48 [weight_utils.py:1235] Found k_scale in the checkpoint (e.g. layers.9.self_attn.k_scale), but not found the expected name in the model (e.g. layers.9.self_attn.attn.k_scale). k_scale is not loaded.
(Worker_PP1 pid=90) WARNING 02-15 06:51:48 [weight_utils.py:1235] Found k_scale in the checkpoint (e.g. layers.9.self_attn.k_scale), but not found the expected name in the model (e.g. layers.9.self_attn.attn.k_scale). k_scale is not loaded.
... etc
```
With some help from Gemini CLI, it appears the problem was a couple of things:
- a bad regex in the scale-name remapping that also matched already-remapped keys and nested the `.attn` prefix on each lookup, fixed with a negative lookbehind (see the sketch after this list)
- the ModelOpt quantization config ignored the `kv_cache_scheme` key
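A minimal sketch of the regex bug, using a layer name from the warnings above:

```python
import re

# A name that has already been remapped once.
name = "layers.8.self_attn.attn.k_scale"

# The original pattern also matches already-remapped names, so a second
# lookup nests the ".attn" prefix again.
print(re.sub(r"\.([qkv])_scale$", r".attn.\1_scale", name))
# -> layers.8.self_attn.attn.attn.k_scale

# The negative lookbehind leaves remapped names alone...
print(re.sub(r"(?<!\.attn)\.([qkv])_scale$", r".attn.\1_scale", name))
# -> layers.8.self_attn.attn.k_scale (unchanged)

# ...while checkpoint names still get the ".attn" prefix inserted.
print(re.sub(r"(?<!\.attn)\.([qkv])_scale$", r".attn.\1_scale",
             "layers.8.self_attn.k_scale"))
# -> layers.8.self_attn.attn.k_scale
```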
I run this Python-based patch at Docker container start-up, and it seems to solve the problem for now:
```python
import os
import re

import vllm


def apply_nvfp4_patches():
    vllm_path = os.path.dirname(vllm.__file__)
    print(f"Targeting vLLM installation at: {vllm_path}")

    # --- Patch 1: weight_utils.py ---
    # Add a negative lookbehind so scale names already containing ".attn"
    # are not remapped (and nested) a second time.
    weight_utils_path = os.path.join(vllm_path, "model_executor/model_loader/weight_utils.py")
    if os.path.exists(weight_utils_path):
        with open(weight_utils_path, "r") as f:
            content = f.read()
        content = content.replace(
            r'(r"\.([qkv])_scale$", r".attn.\1_scale")',
            r'(r"(?<!\.attn)\.([qkv])_scale$", r".attn.\1_scale")',
        )
        content = content.replace(
            r'(r"\.([qkv])_zero_point$", r".attn.\1_zero_point")',
            r'(r"(?<!\.attn)\.([qkv])_zero_point$", r".attn.\1_zero_point")',
        )
        # Rewrite the remapping loop so a name is only rewritten when a
        # pattern actually matches.
        search_pattern = re.compile(
            r'for pattern, replacement in scale_mapping_patterns:.*?return remapped_name',
            re.DOTALL,
        )
        indent_match = re.search(r'(\s+)for pattern, replacement in scale_mapping_patterns:', content)
        indent = indent_match.group(1) if indent_match else "    "
        replacement_block = (
            f'for pattern, replacement in scale_mapping_patterns:'
            f'\n{indent}    if re.search(pattern, name):'
            f'\n{indent}        return re.sub(pattern, replacement, name)'
        )
        content = search_pattern.sub(replacement_block, content)
        with open(weight_utils_path, "w") as f:
            f.write(content)
        print(f"Applied fixes to {weight_utils_path}")

    # --- Patch 2: modelopt.py ---
    # Thread the checkpoint's kv_cache_scheme through the ModelOpt config classes.
    modelopt_path = os.path.join(vllm_path, "model_executor/layers/quantization/modelopt.py")
    if os.path.exists(modelopt_path):
        with open(modelopt_path, "r") as f:
            content = f.read()
        if "from typing import" in content and "Any" not in content:
            content = content.replace("from typing import", "from typing import Any,")
        # 1. Update Base Class __init__
        content = re.sub(
            r'def __init__\(\s*self,\s*exclude_modules:\s*list\[str\],?\s*\):',
            r'def __init__(\n        self,\n        exclude_modules: list[str],\n        kv_cache_scheme: dict[str, Any] | None = None,\n    ):',
            content,
        )
        content = re.sub(
            r'super\(\)\.__init__\(\)\s+self\.exclude_modules:\s*list\[str\]\s*=\s*exclude_modules',
            r'super().__init__()\n        self.exclude_modules: list[str] = exclude_modules\n        self.kv_cache_scheme = kv_cache_scheme',
            content,
        )
        # 2. Define kv_cache_scheme in from_config
        if 'kv_cache_scheme = config.get("kv_cache_scheme")' not in content:
            content = content.replace(
                'def from_config(cls, config: dict[str, Any]) -> "ModelOptQuantConfigBase":',
                'def from_config(cls, config: dict[str, Any]) -> "ModelOptQuantConfigBase":\n        kv_cache_scheme = config.get("kv_cache_scheme")',
            )
        # 3. Pass kv_cache_scheme to _from_config returns
        if 'kv_cache_scheme=kv_cache_scheme,' not in content:
            content = re.sub(
                r'return cls\._from_config\(\s*quant_method=quant_method,',
                r'return cls._from_config(\n            quant_method=quant_method,',
                content,
            )
            content = re.sub(
                r'exclude_modules=exclude_modules,\s*group_size=group_size,',
                r'exclude_modules=exclude_modules,\n            kv_cache_scheme=kv_cache_scheme,\n            group_size=group_size,',
                content,
            )
        # 4. Patch the specific Config classes (Fp8, NvFp4, MxFp8)
        classes = ["ModelOptFp8Config", "ModelOptNvFp4Config", "ModelOptMxFp8Config"]
        for cls in classes:
            # Update __init__ signature: we look for the __init__ that
            # follows the class definition.
            init_sig_pattern = rf'(class {cls}\(ModelOptQuantConfigBase\):.*?def __init__\s*\()(.*?)(exclude_modules:\s*list\[str\],?)'
            content = re.sub(init_sig_pattern, r'\1\2\3\n        kv_cache_scheme: dict[str, Any] | None = None,', content, flags=re.DOTALL)
            # Update super().__init__ call
            super_pattern = rf'(class {cls}.*?def __init__.*?super\(\)\.__init__\()exclude_modules\)'
            content = re.sub(super_pattern, r'\1exclude_modules, kv_cache_scheme)', content, flags=re.DOTALL)
            # Update _from_config signature
            from_sig_pattern = rf'(class {cls}.*?def _from_config.*?)(exclude_modules:\s*list\[str\],)'
            content = re.sub(from_sig_pattern, r'\1\2\n        kv_cache_scheme: dict[str, Any] | None,', content, flags=re.DOTALL)
            # Update the instantiation call inside _from_config
            # (ModelOptNvFp4Config also takes group_size)
            if cls == "ModelOptNvFp4Config":
                inst_pattern = rf'(class {cls}.*?def _from_config.*?return cls\(\s*.*?exclude_modules,)(\s*group_size,?\s*\))'
                content = re.sub(inst_pattern, r'\1\n            kv_cache_scheme,\2', content, flags=re.DOTALL)
            else:
                inst_pattern = rf'(class {cls}.*?def _from_config.*?return cls\(\s*.*?exclude_modules,)(\s*\))'
                content = re.sub(inst_pattern, r'\1\n            kv_cache_scheme,\2', content, flags=re.DOTALL)
        with open(modelopt_path, "w") as f:
            f.write(content)
        print(f"Applied fixes to {modelopt_path}")


if __name__ == "__main__":
    apply_nvfp4_patches()
```
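For context, here's a minimal sketch of how the patch gets wired in at container start. It stands in for the /patch_and_serve.sh entrypoint mounted in the config further down; the Python hand-off shown here is an illustrative assumption, not the actual script:

```python
# Hypothetical entrypoint: apply the patches, then replace this process with
# the server command that was passed to the container.
import os
import sys

from vllm_nvfp4_patch import apply_nvfp4_patches  # the patch script above

apply_nvfp4_patches()

# e.g. python3 -m vllm.entrypoints.openai.api_server ...
os.execvp(sys.argv[1], sys.argv[1:])
```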
Model outputs look great; however, tool calling is broken, so I'm trying your untested quant to see if it's KV-cache related. This quant (as of 17th Feb) seems to be skipping tokens in tool calls:
```
(APIServer pid=1) INFO 02-17 00:07:45 [logger.py:78] Generated response chatcmpl-8afade7af9fe196a (streaming complete): output: '\n\n\nminimax:tool_call\n<invoke namesearch_web">\nweather Adelaide today\n\n', output_token_ids: None, finish_reason: streaming_complete
```
Note the missing `="` in the `<invoke name...>` tag.
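For anyone comparing runs, a quick illustrative check for malformed tags (the sample string is copied from the log; the regex is my own assumption about the well-formed shape):

```python
import re

# Captured from the response text above; a well-formed tag would read
# <invoke name="search_web">.
sample = '<invoke namesearch_web">'

well_formed = re.compile(r'<invoke name="[^"]+">')
if "<invoke" in sample and not well_formed.search(sample):
    print("malformed tool-call tag:", sample)
```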
If others find this patch fixes their scale-parameter lookup recursion, feel free to submit a PR to the vLLM repo; I have zero git experience and feel out of my depth.
S
I think this is just an incompatibility between sglang and vLLM; I might just remove the KV scales.
My instructions omitted `--tool-call-parser minimax_m2`, which is required for tool calls to work properly.
The latest model omits those KV scales, so it should be easier to load on vLLM now.
Thanks Luke,
I was already using the tool-call parser (I essentially kept my m2.1 configuration from your previous upload). The patch above does seem to fix all the vLLM complaints about k and v scaling factors in the weights, but not the FP8 KV-cache scaling, which is what you put in the untested version from ~12 hours ago, right?
I tried the *-untested model upload. It loaded happily, and tool-call parsing started working some of the time.
Logs from vLLM suggest that quant is dropping characters from the XML formatting: it gave me `<invoke name "search_web">` the first time, then the correct form with the '=' present the second time, followed by a successful call. Sadly, Claude Code wasn't as successful. I've dropped back to m2.1 for the moment because I'm not sure how to troubleshoot it further.
When I have a moment I'll try your latest upload.
For anybody else following along, here are my vLLM parameters to get CUTLASS etc., all on cu130-nightly. They're copied from a llama-swap config, so beware the custom entrypoint:
```yaml
macros:
  # Common vLLM launcher for Blackwell cards (NB PP3 because of 5090 x2 +
  # RTX PRO 6000 mixed VRAM sizes; nvidia kernel patched for P2P)
  vllm_run_p1: |
    docker run --rm --name llm-runner-${PORT}
    --gpus '"device=1,3,0"'
    --network host --ipc host --shm-size=32gb
    --ulimit memlock=-1 --ulimit stack=67108864
    -v /home/shaun/vllm-docker/cache:/root/.cache
    -v /home/shaun/vllm-docker/triton-cache:/root/.triton
    -v /home/shaun/vllm-docker/patch_and_serve.sh:/patch_and_serve.sh
    -v /home/shaun/vllm-docker/vllm_nvfp4_patch.py:/vllm_nvfp4_patch.py
    -e CUDA_VISIBLE_DEVICES=1,2,0
    -e CUDA_DEVICE_ORDER=PCI_BUS_ID
    -e VLLM_SLEEP_WHEN_IDLE=1
    -e VLLM_SERVER_DEV_MODE=1
    -e VLLM_CACHE_ROOT=/root/.cache/vllm
    -e FLASHINFER_JIT_CACHE_DIR=/root/.cache/flashinfer
    -e TRITON_CACHE_DIR=/root/.triton/cache
    -e VLLM_FLASHINFER_FORCE_TENSOR_CORES=1
  vllm_run_p2: |
    -e TORCHINDUCTOR_CACHE_DIR=/root/.cache/torchinductor
    -e CUDA_CACHE_PATH=/root/.cache/cuda/ComputeCache
    -e CUDA_CACHE_MAXSIZE=2147483648
    -e TORCHINDUCTOR_FX_GRAPH_CACHE=1
    -e TORCHINDUCTOR_AUTOGRAD_CACHE=1
    -e NVIDIA_DRIVER_CAPABILITIES=compute,utility
    -e PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512
    -e VLLM_USE_FLASHINFER_MOE_FP4=1
    -e VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS=1
    -e VLLM_FLASHINFER_MOE_BACKEND=throughput
    -e NVIDIA_TF32_OVERRIDE=1
    -e NCCL_ALGO=Ring
    -e NCCL_PROTO=Simple
    -e NCCL_P2P_DISABLE=0
    -e NCCL_NVLS_ENABLE=0
    -e SAFETENSORS_FAST_GPU=1
    -e OMP_NUM_THREADS=3
    # --entrypoint python3
    --entrypoint /patch_and_serve.sh
  vllm_stop: |
    docker stop llm-runner-${PORT}

models:
  MiniMax-M2.5-NVFP4:
    cmd: |
      ${vllm_run_p1}
      ${vllm_run_p2}
      -v /mnt/llm-models/MiniMax-M2.5-NVFP4/:/models:ro
      -e VLLM_PP_LAYER_PARTITION="11,40,11"
      vllm/vllm-openai:cu130-nightly
      python3 -m vllm.entrypoints.openai.api_server
      ${vllm_common}
      --trust-remote-code
      --mm-encoder-tp-mode weights
      --gpu-memory-utilization 0.93
      --kv-cache-dtype fp8
      --all2all-backend pplx
      --max-model-len auto
      --max-num-batched-tokens 4096
      --max-num-seqs 4
      --enable-log-outputs --enable-log-requests
      --override-generation-config '{"temperature": 1, "top_p": 0.95, "top_k": 40}'
      --reasoning-parser minimax_m2_append_think --enable-auto-tool-choice --tool-call-parser minimax_m2
    useModelName: "/models"
    cmdStop: ${vllm_all_stop}
    proxy: http://127.0.0.1:${PORT}
    ttl: 0
```
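To test whether tool calls survive end to end, I probe the OpenAI-compatible endpoint. A minimal sketch, assuming llama-swap is listening on port 8080 and using the search_web tool shape from the logs above (both are assumptions; adjust to your setup):

```python
from openai import OpenAI

# llama-swap proxies to vLLM; the model name matches useModelName above.
client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="none")

tools = [{
    "type": "function",
    "function": {
        "name": "search_web",
        "description": "Search the web for a query.",
        "parameters": {
            "type": "object",
            "properties": {"query": {"type": "string"}},
            "required": ["query"],
        },
    },
}]

resp = client.chat.completions.create(
    model="/models",
    messages=[{"role": "user", "content": "weather Adelaide today"}],
    tools=tools,
)

msg = resp.choices[0].message
# If the parser drops characters from <invoke name="...">, tool_calls comes
# back empty and the raw tag text leaks into content instead.
print("tool_calls:", msg.tool_calls)
print("content:", msg.content)
```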