File size: 1,951 Bytes
4e752e8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
from datasets import load_dataset
from transformers import AutoTokenizer
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import (
calculate_offload_device_map,
)
recipe = """
quant_stage:
quant_modifiers:
QuantizationModifier:
ignore: ["lm_head"]
config_groups:
group_0:
weights:
num_bits: 8
type: float
strategy: tensor
dynamic: false
symmetric: true
input_activations:
num_bits: 8
type: float
strategy: tensor
dynamic: false
symmetric: true
targets: ["Linear"]
"""
model_stub = "southfreebird/Qwen2.5-Coder-0.5B-Instruct"
model_name = model_stub.split("/")[-1]
device_map = calculate_offload_device_map(
model_stub, reserve_for_hessians=False, num_gpus=2, torch_dtype="auto"
)
model = SparseAutoModelForCausalLM.from_pretrained(
model_stub, torch_dtype="auto", device_map=device_map
)
tokenizer = AutoTokenizer.from_pretrained(model_stub)
output_dir = f"/{model_name}-FP8"
DATASET_ID = "neuralmagic/LLM_compression_calibration"
DATASET_SPLIT = "train"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 8192
def preprocess_fn(example):
return {"text": tokenizer.apply_chat_template(example["messages"], add_generation_prompt=False, tokenize=False)}
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle().select(range(NUM_CALIBRATION_SAMPLES))
ds = ds.map(preprocess_fn)
oneshot(
model=model,
output_dir=output_dir,
dataset=ds,
recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
save_compressed=True,
)
|