vLLM fails to load the model due to an unknown quantization config
Hi,
Mistral made some changes to vLLM in a recent release, and vLLM now fails to load this model's config because it contains an unknown quantization config. The same happens with the Mistral FP8 models, etc., but Voxtral needs more attention because it is multimodal.
I've vibe-patched it and it works for me, but maybe someone should take a look:
--- a/.venv/lib/python3.12/site-packages/vllm/transformers_utils/configs/mistral.py
+++ b/.venv/lib/python3.12/site-packages/vllm/transformers_utils/configs/mistral.py
@@ -174,8 +174,13 @@ def _remap_mistral_quantization_args(config: dict) -> dict:
"quant_method": "fp8",
"activation_scheme": "dynamic" if is_dynamic else "static",
}
else:
raise ValueError(f"Found unknown quantization='{quantization}' in config")
# Pass through non-Mistral quantization formats (e.g., compressed-tensors)
# to vLLM's standard quantization handling
logger.info(f"Found non-Mistral quantization format in config, passing through: "
f"{quantization.get('quant_method', 'unknown')}")
return config
config["quantization_config"] = quantization
@@ -217,8 +222,14 @@ def _remap_mistral_audio_args(config: dict) -> dict:
)
quant_config = config.get("quantization_config")
# Create text_config with quantization_config if present
- text_config_dict = dict(config)
- text_config = PretrainedConfig.from_dict(text_config_dict)
- if quant_config:
text_config.quantization_config = quant_config
config = {
"model_type": "voxtral",
"architectures": [architecture],
"text_config": PretrainedConfig.from_dict(config),
"text_config": text_config, "audio_config": WhisperConfig( num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"], window_size=encoder_args["audio_encoding_args"]["window_size"],
Here is the error from stock (unpatched) vLLM:
Initializing a V1 LLM engine (v0.14.0rc1.dev227+gb53b89fdb) with config: model='RedHatAI/Voxtral-Mini-3B-2507-FP8-dynamic', speculative_config=None, tokenizer='RedHatAI/Voxtral-Mini-3B-2507-FP8-dynamic', skip_tokenizer_init=False, tokenizer_mode=mistral, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=mistral, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=fp8, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False), seed=0, served_model_name=RedHatAI/Voxtral-Mini-3B-2507-FP8-dynamic, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [2048], 'inductor_compile_config': 
{'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False}, 'local_cache_dir': None}
.......................
pydantic_core._pydantic_core.ValidationError: 1 validation error for ModelConfig
Value error, Found unknown quantization='{'config_groups': {'group_0': {'input_activations': {'actorder': None, 'block_structure': None, 'dynamic': True, 'group_size': None, 'num_bits': 8, 'observer': None, 'observer_kwargs': {}, 'strategy': 'token', 'symmetric': True, 'type': 'float'}, 'output_activations': None, 'targets': ['Linear'], 'weights': {'actorder': None, 'block_structure': None, 'dynamic': False, 'group_size': None, 'num_bits': 8, 'observer': 'minmax', 'observer_kwargs': {}, 'strategy': 'channel', 'symmetric': True, 'type': 'float'}}}, 'format': 'float-quantized', 'global_compression_ratio': None, 'ignore': ['mm_whisper_embeddings.whisper_encoder.transformer.layers.0.attention.wk', ....................................................... 'mm_whisper_embeddings.whisper_encoder.transformer.layers.31.feed_forward.w2', 'output', 'mm_whisper_embeddings.audio_language_projection.0', 'mm_whisper_embeddings.audio_language_projection.2'], 'kv_cache_scheme': None, 'quant_method': 'compressed-tensors', 'quantization_status': 'compressed'}' in config [type=value_error, input_value=ArgsKwargs((), {'model': ...rocessor_plugin': None}), input_type=ArgsKwargs]