Qwen2.5-VL
Quantization script:
# Create a dedicated python env
python3 -m venv llmcompressor
source llmcompressor/bin/activate
# Install llm-compressor and the additional libraries needed
pip install llmcompressor qwen_vl_utils torchvision
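# Optional sanity check (assumption: calibration runs on a CUDA GPU; not part of the original steps)
python3 -c "import torch; assert torch.cuda.is_available(); print(torch.cuda.get_device_name(0))"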
# Download model in HF cache
hf download Qwen/Qwen2.5-VL-7B-Instruct
# Prepare quantization script
## Download the GPTQ (INT4) example, as it is the closest to what we need to achieve (it includes the calibration phase)
wget https://github.com/vllm-project/llm-compressor/raw/refs/tags/0.8.1/examples/multimodal_vision/qwen_2_5_vl_example.py -O qwen_2_5_vl_gptq.py
## Create the patch file for NVFP4
cat << 'EOF' > nvfp4.patch
--- qwen_2_5_vl_gptq.py 2025-10-20 13:34:15.446886854 +0200
+++ qwen_2_5_vl_fp4.py 2025-10-19 17:44:04.932080648 +0200
@@ -7,7 +7,7 @@
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.utils import dispatch_for_generation
 
 # Load model.
@@ -69,13 +69,11 @@
 
 
 # Recipe
-recipe = [
-    GPTQModifier(
-        targets="Linear",
-        scheme="W4A16",
-        ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
-    ),
-]
+recipe = QuantizationModifier(
+    targets="Linear",
+    scheme="NVFP4",
+    ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
+)
 
 # Perform oneshot
 oneshot(
@@ -122,6 +120,6 @@
 print("==========================================")
 
 # Save to disk compressed.
-SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-NVFP4"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)
EOF
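## Optional sanity check (not in the original steps): dry-run to confirm the patch applies cleanly before writing anything
patch --dry-run qwen_2_5_vl_gptq.py -i nvfp4.patch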
## Apply the patch
patch qwen_2_5_vl_gptq.py -i nvfp4.patch -o qwen_2_5_vl_nvfp4.py
# Start the quantization
python3 qwen_2_5_vl_nvfp4.py
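Once quantization finishes, the compressed checkpoint is saved to ./Qwen2.5-VL-7B-Instruct-NVFP4. A minimal serving sketch, assuming a recent vLLM build with compressed-tensors NVFP4 support (native FP4 execution needs Blackwell-class GPUs):
# Serve the quantized checkpoint with vLLM (optional, not part of the quantization itself)
pip install vllm
vllm serve ./Qwen2.5-VL-7B-Instruct-NVFP4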
Base model: Qwen/Qwen2.5-VL-7B-Instruct