from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import (
    calculate_offload_device_map,
)

| recipe = """ | |
| quant_stage: | |
| quant_modifiers: | |
| QuantizationModifier: | |
| ignore: ["lm_head"] | |
| config_groups: | |
| group_0: | |
| weights: | |
| num_bits: 8 | |
| type: float | |
| strategy: tensor | |
| dynamic: false | |
| symmetric: true | |
| input_activations: | |
| num_bits: 8 | |
| type: float | |
| strategy: tensor | |
| dynamic: false | |
| symmetric: true | |
| targets: ["Linear"] | |
| """ | |
model_stub = "southfreebird/Qwen2.5-1.5B-Instruct"
model_name = model_stub.split("/")[-1]

# Shard the model across two GPUs, offloading whatever does not fit to CPU.
device_map = calculate_offload_device_map(
    model_stub, reserve_for_hessians=False, num_gpus=2, torch_dtype="auto"
)

model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub, torch_dtype="auto", device_map=device_map
)
tokenizer = AutoTokenizer.from_pretrained(model_stub)

# Save alongside the script rather than at the filesystem root.
output_dir = f"{model_name}-FP8"

# Calibration data used to compute the static activation scales.
DATASET_ID = "neuralmagic/LLM_compression_calibration"
DATASET_SPLIT = "train"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 8192

def preprocess_fn(example):
    # Render each chat transcript into a single prompt string.
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"], add_generation_prompt=False, tokenize=False
        )
    }

ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle().select(range(NUM_CALIBRATION_SAMPLES))
ds = ds.map(preprocess_fn)

# Apply the recipe in a single calibration pass and save the compressed model.
oneshot(
    model=model,
    output_dir=output_dir,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    save_compressed=True,
)
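
# Optional sanity check, an illustrative sketch rather than part of the
# original script (the prompt below is made up): generate a short completion
# with the quantized model to confirm it still produces coherent text.
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is FP8 quantization?"}],
    add_generation_prompt=True,
    tokenize=False,
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
sample = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(sample[0], skip_special_tokens=True))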