"""Convert a local BF16 model into Marlin-supported quant formats via llm-compressor.""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import gc |
|
|
import os |
|
|
import sys |
|
|
from typing import Optional |
|
|
|
|
|
import torch |
|
|
from datasets import load_dataset |
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
|
|
|
|
|
|
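
# Make a local llm-compressor checkout (if present) importable ahead of any
# pip-installed copy.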
LLM_COMPRESSOR_SRC = "/home/quixi/marlin-cdna/llm-compressor/src"
if os.path.isdir(LLM_COMPRESSOR_SRC):
    sys.path.insert(0, LLM_COMPRESSOR_SRC)

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import (
    GPTQModifier,
    QuantizationModifier,
)

MODEL_PATH = "/home/quixi/models/Llama-3.2-1B"
OUTPUT_ROOT = "/home/quixi/models"

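# Calibration settings for the data-dependent recipes (GPTQ, AWQ).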
CALIB_DATASET_ID = "HuggingFaceH4/ultrachat_200k"
CALIB_DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQUENCE_LENGTH = 512


def _load_tokenized_dataset(tokenizer):
    """Load the calibration split, apply the chat template, and tokenize."""
    ds = load_dataset(
        CALIB_DATASET_ID,
        split=f"{CALIB_DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
    ).shuffle(seed=42)

    # Render each chat transcript to plain text with the model's chat template.
    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)

    # Tokenize without padding, truncating to the calibration sequence length.
    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    return ds.map(tokenize, remove_columns=ds.column_names)


def _load_model_and_tokenizer():
    # Each recipe starts from a fresh copy of the base model.
    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    if torch.cuda.is_available():
        model.to("cuda")
    return model, tokenizer


def _cleanup(model, tokenizer):
    # Drop references and flush the CUDA cache so recipes don't accumulate memory.
    del model
    del tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def _run_recipe(
    name: str,
    recipe,
    *,
    save_compressed: bool,
    use_calibration: bool,
) -> str:
    print(f"\n=== Quantizing {name} ===")
    model, tokenizer = _load_model_and_tokenizer()

    oneshot_kwargs = {"model": model, "recipe": recipe}
    if use_calibration:
        ds = _load_tokenized_dataset(tokenizer)
        oneshot_kwargs.update(
            dataset=ds,
            max_seq_length=MAX_SEQUENCE_LENGTH,
            num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        )

    # Apply the quantization recipe to the model in place.
    oneshot(**oneshot_kwargs)

    base_name = os.path.basename(MODEL_PATH.rstrip("/"))
    save_dir = os.path.join(OUTPUT_ROOT, f"{base_name}-{name}")
    os.makedirs(save_dir, exist_ok=True)

    # save_compressed=True stores the weights in the packed compressed-tensors format.
    if save_compressed:
        model.save_pretrained(save_dir, save_compressed=True)
    else:
        model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    _cleanup(model, tokenizer)
    return save_dir


def main():
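    # INT4 weights, 16-bit activations, via GPTQ error correction; needs calibration data.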
    _run_recipe(
        "W4A16-GPTQ",
        GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
        save_compressed=True,
        use_calibration=True,
    )

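    # INT4 weights via AWQ activation-aware scaling; needs calibration data.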
    _run_recipe(
        "W4A16-AWQ",
        AWQModifier(
            targets=["Linear"],
            scheme="W4A16_ASYM",
            ignore=["lm_head"],
            duo_scaling="both",  # non-default value; assumes the checked-out llm-compressor accepts it
        ),
        save_compressed=True,
        use_calibration=True,
    )

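    # INT8 weights, 16-bit activations, via GPTQ; needs calibration data.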
    _run_recipe(
        "W8A16-GPTQ",
        GPTQModifier(targets="Linear", scheme="W8A16", ignore=["lm_head"]),
        save_compressed=True,
        use_calibration=True,
    )

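    # FP8 weights with dynamic per-token activation scales; no calibration data needed.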
    _run_recipe(
        "FP8-Dynamic",
        QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]),
        save_compressed=False,
        use_calibration=False,
    )

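    # FP4 schemes, applied without calibration data.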
    _run_recipe(
        "NVFP4A16",
        QuantizationModifier(targets="Linear", scheme="NVFP4A16", ignore=["lm_head"]),
        save_compressed=True,
        use_calibration=False,
    )

    _run_recipe(
        "MXFP4",
        QuantizationModifier(targets="Linear", scheme="MXFP4", ignore=["lm_head"]),
        save_compressed=True,
        use_calibration=False,
    )


if __name__ == "__main__":
    main()