Qwen2.5-VL
Quantization script:
# Create a dedicated python env
python3 -m venv llmcompressor
source llmcompressor/bin/activate
# Install llm-compressor and the additional libraries needed
pip install llmcompressor qwen_vl_utils torchvision
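# Optional sanity check (assumption: calibration runs on a CUDA GPU; not part of the original steps)
python3 -c "import torch; assert torch.cuda.is_available(); print(torch.cuda.get_device_name(0))"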
# Download model in HF cache
hf download Qwen/Qwen2.5-VL-7B-Instruct
# Prepare quantization script
## Download the GPTQ (INT4) example, as it is the closest to what we need to achieve (it includes the calibration phase)
wget https://github.com/vllm-project/llm-compressor/raw/refs/tags/0.8.1/examples/multimodal_vision/qwen_2_5_vl_example.py -O qwen_2_5_vl_gptq.py
## Create the patch file for NVFP4
cat << 'EOF' > nvfp4.patch
--- qwen_2_5_vl_gptq.py 2025-10-20 13:34:15.446886854 +0200
+++ qwen_2_5_vl_fp4.py 2025-10-19 17:44:04.932080648 +0200
@@ -7,7 +7,7 @@
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.utils import dispatch_for_generation
 
 # Load model.
@@ -69,13 +69,11 @@
 
 
 # Recipe
-recipe = [
-    GPTQModifier(
-        targets="Linear",
-        scheme="W4A16",
-        ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
-    ),
-]
+recipe = QuantizationModifier(
+    targets="Linear",
+    scheme="NVFP4",
+    ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
+)
 
 # Perform oneshot
 oneshot(
@@ -122,6 +120,6 @@
 print("==========================================")
 
 # Save to disk compressed.
-SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-NVFP4"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)
EOF
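## Optional sanity check (not in the original steps): dry-run to confirm the patch applies cleanly before writing anything
patch --dry-run qwen_2_5_vl_gptq.py -i nvfp4.patch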
## Apply the patch
patch qwen_2_5_vl_gptq.py -i nvfp4.patch -o qwen_2_5_vl_nvfp4.py
# Start the quantization
python3 qwen_2_5_vl_nvfp4.py
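Once quantization finishes, the compressed checkpoint is saved to ./Qwen2.5-VL-7B-Instruct-NVFP4. A minimal serving sketch, assuming a recent vLLM build with compressed-tensors NVFP4 support (native FP4 execution needs Blackwell-class GPUs):
# Serve the quantized checkpoint with vLLM (optional, not part of the quantization itself)
pip install vllm
vllm serve ./Qwen2.5-VL-7B-Instruct-NVFP4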
Base model: Qwen/Qwen2.5-VL-7B-Instruct