|
|
import argparse |
|
|
|
|
|
import torch |
|
|
from datasets import load_dataset |
|
|
from transformers import AutoProcessor, AutoModelForImageTextToText |
|
|
|
|
|
from llmcompressor import oneshot |
|
|
from llmcompressor.modifiers.quantization import QuantizationModifier |
|
|
from llmcompressor.utils import dispatch_for_generation |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_args(argv=None):
    """Parse command-line options for the quantization script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, so existing callers are
            unaffected; passing a list allows testing without patching argv.

    Returns:
        argparse.Namespace with ``model_id``, ``quant_type``,
        ``num_calibration_samples``, ``max_seq_length`` and ``output_dir``.
    """
    parser = argparse.ArgumentParser(description="Quantize Molmo2 model")
    parser.add_argument(
        "--model-id",
        type=str,
        default="allenai/Molmo2-4B",
        # Fixed: help previously advertised 8B while the default was 4B.
        help="HuggingFace model ID (default: allenai/Molmo2-4B)",
    )
    parser.add_argument(
        "--quant-type",
        type=str,
        choices=["nvfp4", "fp8"],
        default="nvfp4",
        help="Quantization type: nvfp4 or fp8 (default: nvfp4)",
    )
    parser.add_argument(
        "--num-calibration-samples",
        type=int,
        default=256,
        help="Number of calibration samples (default: 256)",
    )
    parser.add_argument(
        "--max-seq-length",
        type=int,
        default=8192,
        help="Maximum sequence length (default: 8192)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Output directory (default: auto-generated based on model and quant type)",
    )
    return parser.parse_args(argv)
|
|
|
|
|
|
|
|
def get_quantization_recipe(quant_type: str) -> QuantizationModifier:
    """Build the QuantizationModifier for the requested scheme.

    Linear layers are targeted; the LM head, the vision backbone, and
    modules matching ``mlp.gate`` are excluded from quantization.

    Raises:
        ValueError: if ``quant_type`` is neither ``"nvfp4"`` nor ``"fp8"``.
    """
    # Modules that are never quantized, regardless of scheme.
    excluded = [
        "re:.*lm_head",
        "re:.*vision_backbone.*",
        "re:.*mlp.gate$",
    ]

    # Map CLI quant-type names onto compressed-tensors scheme identifiers.
    scheme_by_type = {"nvfp4": "NVFP4", "fp8": "FP8"}
    scheme = scheme_by_type.get(quant_type)
    if scheme is None:
        raise ValueError(f"Unsupported quantization type: {quant_type}")

    return QuantizationModifier(
        targets="Linear",
        scheme=scheme,
        ignore=excluded,
    )
|
|
|
|
|
|
|
|
args = parse_args()

# Normalize CLI options into module-level configuration constants.
MODEL_ID = args.model_id
QUANT_TYPE = args.quant_type.upper()
NUM_CALIBRATION_SAMPLES = args.num_calibration_samples
MAX_SEQUENCE_LENGTH = args.max_seq_length

# Echo the effective configuration before any heavy work starts.
for label, value in (
    ("Model", MODEL_ID),
    ("Quantization", QUANT_TYPE),
    ("Calibration samples", NUM_CALIBRATION_SAMPLES),
    ("Max sequence length", MAX_SEQUENCE_LENGTH),
):
    print(f"{label}: {value}")
|
|
|
|
|
|
|
|
# Load model weights in their native dtype; trust_remote_code is needed since
# Molmo2 ships custom modeling/processing code on the Hub.
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, torch_dtype="auto", trust_remote_code=True)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Calibration dataset. NOTE(review): the "LLM" config appears to be text-only
# chat transcripts (see preprocess_function below, which handles text content
# only) — confirm this is intended for a vision-language model.
DATASET_ID = "neuralmagic/calibration"

# Take only the first NUM_CALIBRATION_SAMPLES rows of the train split.
ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
|
|
|
|
|
|
|
|
def preprocess_function(example):
    """Tokenize one calibration sample via the processor's chat template."""
    # Wrap each plain-text message in the structured content format that the
    # multimodal chat template expects.
    chat = [
        {
            "role": msg["role"],
            "content": [{"type": "text", "text": msg["content"]}],
        }
        for msg in example["messages"]
    ]

    return processor.apply_chat_template(
        chat,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )
|
|
|
|
|
|
|
|
# Tokenize samples one at a time and drop the raw columns so only the
# model-ready fields produced by the chat template remain.
ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
|
|
|
|
|
|
|
|
def data_collator(batch):
    """Turn a single preprocessed sample into model-ready tensors.

    Calibration runs with batch size 1, so exactly one sample is expected.
    ``pixel_values`` is cast to bfloat16 and has its leading singleton batch
    dimension removed; every other field becomes a plain tensor as-is.
    """
    assert len(batch) == 1
    sample = batch[0]

    collated = {}
    for key, value in sample.items():
        if key == "pixel_values":
            collated[key] = torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        else:
            collated[key] = torch.tensor(value)
    return collated
|
|
|
|
|
|
|
|
|
|
|
# Build the recipe from the raw (lowercase) CLI value, not QUANT_TYPE,
# since get_quantization_recipe keys on "nvfp4"/"fp8".
recipe = get_quantization_recipe(args.quant_type)

# Run one-shot post-training quantization: calibrates on `ds` (one sample per
# step via data_collator) and applies the quantization scheme in place.
oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    dataset=ds,
    data_collator=data_collator,
)
|
|
|
|
|
# Quick smoke test: generate a few tokens to confirm the quantized model
# still produces coherent output.
# NOTE(review): hard-codes "cuda" — this will fail on CPU-only hosts; confirm
# a GPU is guaranteed wherever this script runs.
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
print("==========================================")
|
|
|
|
|
|
|
|
|
|
|
# Resolve the output directory: an explicit --output-dir wins; otherwise
# derive "<model-name>-<QUANT_TYPE>" from the model ID.
if args.output_dir:
    SAVE_DIR = args.output_dir
else:
    model_name = MODEL_ID.rstrip("/").split("/")[-1]
    SAVE_DIR = f"{model_name}-{QUANT_TYPE}"

print(f"Saving to: {SAVE_DIR}")
model.save_pretrained(SAVE_DIR)
|
|
|
|
|
|
|
|
try:
    processor.save_pretrained(SAVE_DIR)
except AttributeError:
    # Some remote-code processors lack save_pretrained; fall back to saving
    # whichever known sub-components the processor exposes.
    for component_name in ("tokenizer", "image_processor"):
        if hasattr(processor, component_name):
            getattr(processor, component_name).save_pretrained(SAVE_DIR)

print("Done!")