"""One-shot FP8 quantization of a causal LM with llm-compressor.

The script loads the model named by ``model_stub``, renders a calibration
dataset through the tokenizer's chat template, and runs ``oneshot`` with a
recipe that quantizes the weights and input activations of every ``Linear``
module (``lm_head`` excluded) to 8-bit float, using static, symmetric,
per-tensor scales. The result is written to ``output_dir``.
"""

from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import (
    calculate_offload_device_map,
)

# YAML recipe consumed by `oneshot`. The nesting is significant: `weights`,
# `input_activations` and `targets` all belong to `group_0`.
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    targets: ["Linear"]
"""

model_stub = "southfreebird/Qwen2.5-0.5B-Instruct"
model_name = model_stub.split("/")[-1]

# Number of GPUs the device map is computed for; adjust to the host.
NUM_GPUS = 2

device_map = calculate_offload_device_map(
    model_stub,
    reserve_for_hessians=False,
    num_gpus=NUM_GPUS,
    torch_dtype="auto",
)

model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub,
    torch_dtype="auto",
    device_map=device_map,
)
tokenizer = AutoTokenizer.from_pretrained(model_stub)

# NOTE(review): the leading "/" makes this an absolute path at the filesystem
# root. Confirm that is intended rather than a relative "./" path.
output_dir = f"/{model_name}-FP8"

DATASET_ID = "neuralmagic/LLM_compression_calibration"
DATASET_SPLIT = "train"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 8192
# Fixed seed so the same calibration subset (and therefore the same static
# activation scales) is selected on every run.
SHUFFLE_SEED = 42


def preprocess_fn(example):
    """Render one dataset row's chat messages into a single text string.

    Args:
        example: A dataset row; its ``"messages"`` entry is passed to the
            tokenizer's chat template.

    Returns:
        A dict with one key, ``"text"``, holding the templated conversation
        as an untokenized string with no generation prompt appended.
    """
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            add_generation_prompt=False,
            tokenize=False,
        )
    }


ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=SHUFFLE_SEED).select(range(NUM_CALIBRATION_SAMPLES))
ds = ds.map(preprocess_fn)

oneshot(
    model=model,
    output_dir=output_dir,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    save_compressed=True,
)