"""Convert a local BF16 model into Marlin-supported quant formats via llm-compressor.""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import gc |
|
|
import os |
|
|
import sys |
|
|
from typing import Optional |
|
|
|
|
|
import torch |
|
|
from datasets import load_dataset |
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
|
|
|
|
|
|
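
# Make a local llm-compressor checkout (if present) importable ahead of any
# pip-installed copy.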
LLM_COMPRESSOR_SRC = "/home/quixi/marlin-cdna/llm-compressor/src"
if os.path.isdir(LLM_COMPRESSOR_SRC):
    sys.path.insert(0, LLM_COMPRESSOR_SRC)

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import (
    GPTQModifier,
    QuantizationModifier,
)

MODEL_PATH = "/home/quixi/models/Llama-3.2-1B"
OUTPUT_ROOT = "/home/quixi/models"

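# Calibration settings for the data-dependent recipes (GPTQ, AWQ).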
CALIB_DATASET_ID = "HuggingFaceH4/ultrachat_200k"
CALIB_DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQUENCE_LENGTH = 512


def _load_tokenized_dataset(tokenizer):
    """Load the calibration split, apply the chat template, and tokenize."""
    ds = load_dataset(
        CALIB_DATASET_ID,
        split=f"{CALIB_DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
    ).shuffle(seed=42)

    # Render each chat transcript to plain text with the model's chat template.
    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)

    # Tokenize without padding, truncating to the calibration sequence length.
    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    return ds.map(tokenize, remove_columns=ds.column_names)


def _load_model_and_tokenizer():
    # Each recipe starts from a fresh copy of the base model.
    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    if torch.cuda.is_available():
        model.to("cuda")
    return model, tokenizer


def _cleanup(model, tokenizer):
    # Drop references and flush the CUDA cache so recipes don't accumulate memory.
    del model
    del tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def _run_recipe(
    name: str,
    recipe,
    *,
    save_compressed: bool,
    use_calibration: bool,
) -> str:
    print(f"\n=== Quantizing {name} ===")
    model, tokenizer = _load_model_and_tokenizer()

    oneshot_kwargs = {"model": model, "recipe": recipe}
    if use_calibration:
        ds = _load_tokenized_dataset(tokenizer)
        oneshot_kwargs.update(
            dataset=ds,
            max_seq_length=MAX_SEQUENCE_LENGTH,
            num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        )

    # Apply the quantization recipe to the model in place.
    oneshot(**oneshot_kwargs)

    base_name = os.path.basename(MODEL_PATH.rstrip("/"))
    save_dir = os.path.join(OUTPUT_ROOT, f"{base_name}-{name}")
    os.makedirs(save_dir, exist_ok=True)

    # save_compressed=True stores the weights in the packed compressed-tensors format.
    if save_compressed:
        model.save_pretrained(save_dir, save_compressed=True)
    else:
        model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    _cleanup(model, tokenizer)
    return save_dir


def main():
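    # INT4 weights, 16-bit activations, via GPTQ error correction; needs calibration data.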
    _run_recipe(
        "W4A16-GPTQ",
        GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
        save_compressed=True,
        use_calibration=True,
    )

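    # INT4 weights via AWQ activation-aware scaling; needs calibration data.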
    _run_recipe(
        "W4A16-AWQ",
        AWQModifier(
            targets=["Linear"],
            scheme="W4A16_ASYM",
            ignore=["lm_head"],
            duo_scaling="both",  # non-default value; assumes the checked-out llm-compressor accepts it
        ),
        save_compressed=True,
        use_calibration=True,
    )

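    # INT8 weights, 16-bit activations, via GPTQ; needs calibration data.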
    _run_recipe(
        "W8A16-GPTQ",
        GPTQModifier(targets="Linear", scheme="W8A16", ignore=["lm_head"]),
        save_compressed=True,
        use_calibration=True,
    )

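    # FP8 weights with dynamic per-token activation scales; no calibration data needed.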
    _run_recipe(
        "FP8-Dynamic",
        QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]),
        save_compressed=False,
        use_calibration=False,
    )

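    # FP4 schemes, applied without calibration data.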
    _run_recipe(
        "NVFP4A16",
        QuantizationModifier(targets="Linear", scheme="NVFP4A16", ignore=["lm_head"]),
        save_compressed=True,
        use_calibration=False,
    )

    _run_recipe(
        "MXFP4",
        QuantizationModifier(targets="Linear", scheme="MXFP4", ignore=["lm_head"]),
        save_compressed=True,
        use_calibration=False,
    )


if __name__ == "__main__":
    main()