import gc

import torch
from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

# Free any cached GPU memory and enable TF32 matmuls for faster calibration on
# Ampere+ GPUs (this affects calibration speed only, not the FP8 quantization).
torch.cuda.empty_cache()
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
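# Quantization recipe: static (dynamic: false), symmetric, per-tensor FP8 for
# both weights and input activations of every Linear layer, with lm_head kept in
# full precision. The static activation scales are what the calibration data
# below is for; a dynamic activation scheme would need no calibration set.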
| recipe = """ | |
| quant_stage: | |
| quant_modifiers: | |
| QuantizationModifier: | |
| ignore: ["lm_head"] | |
| config_groups: | |
| group_0: | |
| weights: | |
| num_bits: 8 | |
| type: float | |
| strategy: tensor | |
| dynamic: false | |
| symmetric: true | |
| input_activations: | |
| num_bits: 8 | |
| type: float | |
| strategy: tensor | |
| dynamic: false | |
| symmetric: true | |
| targets: ["Linear"] | |
| """ | |
model_stub = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
model_name = model_stub.split("/")[-1]

# Compute a device map that splits the model between GPU 0 and CPU RAM.
# reserve_for_hessians=True leaves extra GPU headroom (sized for GPTQ Hessians),
# a safe margin even though plain FP8 calibration builds no Hessians. The helper
# measures free GPU/CPU memory itself, so no max_memory hint is needed.
device_map = calculate_offload_device_map(
    model_stub,
    reserve_for_hessians=True,
    num_gpus=1,
    torch_dtype=torch.bfloat16,
)
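# Optional sanity check (an addition, not in the original script): inspect how
# modules were assigned to GPU 0 vs. CPU before committing to the full load.
print(device_map)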
# Load the model across the computed device map; layers that do not fit on the
# GPU are offloaded to CPU RAM, and to "offload_folder" on disk if RAM also fills.
model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    offload_folder="offload_folder",
    offload_state_dict=True,
)
torch.cuda.empty_cache()
gc.collect()

tokenizer = AutoTokenizer.from_pretrained(model_stub)
output_dir = f"./{model_name}-FP8"

# Calibration budget: 256 chat samples truncated to 2048 tokens is a common
# choice for estimating per-tensor FP8 activation scales.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048

raw_dataset = load_dataset(
    "HuggingFaceH4/ultrachat_200k",
    split="train_sft",
)
raw_dataset = raw_dataset.select(range(min(NUM_CALIBRATION_SAMPLES, len(raw_dataset))))
# Render each conversation with the model's chat template, then tokenize.
# Calibration only runs forward passes, so no labels are needed, and padding is
# skipped so pad tokens do not skew the activation statistics.
def preprocess_function(examples):
    texts = [
        tokenizer.apply_chat_template(messages, tokenize=False)
        for messages in examples["messages"]
    ]
    return tokenizer(
        texts,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        padding=False,
        add_special_tokens=False,  # the chat template already inserts special tokens
    )
processed_dataset = raw_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_dataset.column_names,
    desc="Processing dataset",
)
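# Optional sanity check (an addition, not in the original script): decode one
# calibration sample to confirm the chat template and truncation look right.
print(tokenizer.decode(processed_dataset[0]["input_ids"][:200]))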
# One-shot post-training quantization: a single calibration pass over the
# dataset collects the static activation scales, the FP8 recipe is applied, and
# the compressed checkpoint is written to output_dir. No training loop runs, so
# no optimizer or epoch settings are needed.
oneshot(
    model=model,
    dataset=processed_dataset,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    output_dir=output_dir,
    save_compressed=True,
)
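# Save the tokenizer next to the compressed weights so the output directory is
# self-contained for downstream loading.
tokenizer.save_pretrained(output_dir)

# A minimal serving sketch (an assumption, not part of the original script):
# vLLM can load compressed-tensors FP8 checkpoints directly from output_dir,
# assuming a vLLM build with compressed-tensors/FP8 support and a GPU that can
# execute (or emulate) FP8. Run in a fresh process to avoid holding both the
# bf16 and FP8 copies of the model in memory:
#
# from vllm import LLM, SamplingParams
# llm = LLM(model="./DeepSeek-R1-Distill-Qwen-14B-FP8")
# out = llm.generate(["Explain FP8 quantization in one sentence."],
#                    SamplingParams(max_tokens=64))
# print(out[0].outputs[0].text)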