alexmarques committed on
Commit
509f3bf
·
verified ·
1 Parent(s): e3a005c

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +48 -4
README.md CHANGED
@@ -91,15 +91,24 @@ from llmcompressor.utils import dispatch_for_generation
91
 
92
  MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Instruct"
93
 
 
 
 
 
 
 
 
 
 
 
94
  # Load model.
95
  model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
96
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
97
 
98
-
99
  # Configure the quantization algorithm and scheme.
100
  # In this case, we:
101
- # * quantize the weights to fp8 with per channel via ptq
102
- # * quantize the activations to fp8 with dynamic per token
103
  recipe = QuantizationModifier(
104
  targets="Linear", scheme="W8A8", ignore=[
105
  "lm_head",
@@ -109,8 +118,43 @@ recipe = QuantizationModifier(
109
  ],
110
  )
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  # Apply quantization.
113
- oneshot(model=model, recipe=recipe)
 
 
 
 
 
 
114
 
115
  # Confirm generations of the quantized model look sane.
116
  print("========== SAMPLE GENERATION ==============")
 
91
 
92
  MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Instruct"
93
 
94
+
95
+ # Select calibration dataset.
96
+ DATASET_ID = "garage-bAInd/Open-Platypus"
97
+ DATASET_SPLIT = "train"
98
+
99
+ # Select number of samples. 512 samples is a good place to start.
100
+ # Increasing the number of samples can improve accuracy.
101
+ NUM_CALIBRATION_SAMPLES = 1024
102
+ MAX_SEQUENCE_LENGTH = 8192
103
+
104
  # Load model.
105
  model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
106
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
107
 
 
108
  # Configure the quantization algorithm and scheme.
109
  # In this case, we:
110
+ # * quantize the weights to int8 with per channel via ptq
111
+ # * quantize the activations to int8 with dynamic per token
112
  recipe = QuantizationModifier(
113
  targets="Linear", scheme="W8A8", ignore=[
114
  "lm_head",
 
118
  ],
119
  )
120
 
121
+ # Load calibration dataset.
122
+ ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
123
+ ds = ds.shuffle(seed=42)
124
+
125
+ def preprocess(example):
126
+ messages = [
127
+ {"role": "user", "content": example["instruction"]},
128
+ {"role": "assistant", "content": example["output"]},
129
+ ]
130
+ return {
131
+ "text": tokenizer.apply_chat_template(
132
+ messages,
133
+ tokenize=False,
134
+ )
135
+ }
136
+
137
+ ds = ds.map(preprocess)
138
+
139
+ def tokenize(sample):
140
+ return tokenizer(
141
+ sample["text"],
142
+ padding=False,
143
+ max_length=MAX_SEQUENCE_LENGTH,
144
+ truncation=True,
145
+ add_special_tokens=False,
146
+ )
147
+
148
+ ds = ds.map(tokenize, remove_columns=ds.column_names)
149
+
150
  # Apply quantization.
151
+ oneshot(
152
+ model=model,
153
+ dataset=ds,
154
+ recipe=recipe,
155
+ max_seq_length=MAX_SEQUENCE_LENGTH,
156
+ num_calibration_samples=NUM_CALIBRATION_SAMPLES,
157
+ )
158
 
159
  # Confirm generations of the quantized model look sane.
160
  print("========== SAMPLE GENERATION ==============")