Update README.md
Browse files
README.md
CHANGED
|
@@ -91,15 +91,24 @@ from llmcompressor.utils import dispatch_for_generation
|
|
| 91 |
|
| 92 |
MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Instruct"
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
# Load model.
|
| 95 |
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
|
| 96 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 97 |
|
| 98 |
-
|
| 99 |
# Configure the quantization algorithm and scheme.
|
| 100 |
# In this case, we:
|
| 101 |
-
# * quantize the weights to
|
| 102 |
-
# * quantize the activations to
|
| 103 |
recipe = QuantizationModifier(
|
| 104 |
targets="Linear", scheme="W8A8", ignore=[
|
| 105 |
"lm_head",
|
|
@@ -109,8 +118,43 @@ recipe = QuantizationModifier(
|
|
| 109 |
],
|
| 110 |
)
|
| 111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
# Apply quantization.
|
| 113 |
-
oneshot(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
# Confirm generations of the quantized model look sane.
|
| 116 |
print("========== SAMPLE GENERATION ==============")
|
|
|
|
| 91 |
|
| 92 |
MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Instruct"
|
| 93 |
|
| 94 |
+
|
| 95 |
+
# Select calibration dataset.
|
| 96 |
+
DATASET_ID = "garage-bAInd/Open-Platypus"
|
| 97 |
+
DATASET_SPLIT = "train"
|
| 98 |
+
|
| 99 |
+
# Select the number of calibration samples. 512 is a good place to start; this example uses 1024.
|
| 100 |
+
# Increasing the number of samples can improve accuracy.
|
| 101 |
+
NUM_CALIBRATION_SAMPLES = 1024
|
| 102 |
+
MAX_SEQUENCE_LENGTH = 8192
|
| 103 |
+
|
| 104 |
# Load model.
|
| 105 |
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
|
| 106 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 107 |
|
|
|
|
| 108 |
# Configure the quantization algorithm and scheme.
|
| 109 |
# In this case, we:
|
| 110 |
+
# * quantize the weights to int8, per channel, via PTQ
|
| 111 |
+
# * quantize the activations to int8 with dynamic per-token quantization
|
| 112 |
recipe = QuantizationModifier(
|
| 113 |
targets="Linear", scheme="W8A8", ignore=[
|
| 114 |
"lm_head",
|
|
|
|
| 118 |
],
|
| 119 |
)
|
| 120 |
|
| 121 |
+
# Load calibration dataset.
|
| 122 |
+
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
|
| 123 |
+
ds = ds.shuffle(seed=42)
|
| 124 |
+
|
| 125 |
+
def preprocess(example):
|
| 126 |
+
messages = [
|
| 127 |
+
{"role": "user", "content": example["instruction"]},
|
| 128 |
+
{"role": "assistant", "content": example["output"]},
|
| 129 |
+
]
|
| 130 |
+
return {
|
| 131 |
+
"text": tokenizer.apply_chat_template(
|
| 132 |
+
messages,
|
| 133 |
+
tokenize=False,
|
| 134 |
+
)
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
ds = ds.map(preprocess)
|
| 138 |
+
|
| 139 |
+
def tokenize(sample):
|
| 140 |
+
return tokenizer(
|
| 141 |
+
sample["text"],
|
| 142 |
+
padding=False,
|
| 143 |
+
max_length=MAX_SEQUENCE_LENGTH,
|
| 144 |
+
truncation=True,
|
| 145 |
+
add_special_tokens=False,
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
ds = ds.map(tokenize, remove_columns=ds.column_names)
|
| 149 |
+
|
| 150 |
# Apply quantization.
|
| 151 |
+
oneshot(
|
| 152 |
+
model=model,
|
| 153 |
+
dataset=ds,
|
| 154 |
+
recipe=recipe,
|
| 155 |
+
max_seq_length=MAX_SEQUENCE_LENGTH,
|
| 156 |
+
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
| 157 |
+
)
|
| 158 |
|
| 159 |
# Confirm generations of the quantized model look sane.
|
| 160 |
print("========== SAMPLE GENERATION ==============")
|