RedHatAI
/

Qwen3-Next-80B-A3B-Instruct-quantized.w8a8

Text Generation

compressed-tensors

8-bit precision

Model card Files Files and versions

alexmarques committed on Mar 3

Commit

509f3bf

·

verified ·

1 Parent(s): e3a005c

Update README.md

Files changed (1) hide show

README.md +48 -4

README.md CHANGED Viewed

@@ -91,15 +91,24 @@ from llmcompressor.utils import dispatch_for_generation
 MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Instruct"
 # Load model.
 model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 # Configure the quantization algorithm and scheme.
 # In this case, we:
-#   * quantize the weights to fp8 with per channel via ptq
-#   * quantize the activations to fp8 with dynamic per token
 recipe = QuantizationModifier(
     targets="Linear", scheme="W8A8", ignore=[
         "lm_head",
@@ -109,8 +118,43 @@ recipe = QuantizationModifier(
     ],
 )
 # Apply quantization.
-oneshot(model=model, recipe=recipe)
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")

 MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Instruct"
+# Select calibration dataset.
+DATASET_ID = "garage-bAInd/Open-Platypus"
+DATASET_SPLIT = "train"
+# Select number of samples. 512 samples is a good place to start;
+# increasing the number of samples (1024 are used here) can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 1024
+MAX_SEQUENCE_LENGTH = 8192
 # Load model.
 model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 # Configure the quantization algorithm and scheme.
 # In this case, we:
+#   * quantize the weights to int8, per channel, via PTQ
+#   * quantize the activations to int8, dynamically per token
 recipe = QuantizationModifier(
     targets="Linear", scheme="W8A8", ignore=[
         "lm_head",
     ],
 )
+# Load calibration dataset.
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+def preprocess(example):
+    messages = [
+        {"role": "user", "content": example["instruction"]},
+        {"role": "assistant", "content": example["output"]},
+    ]
+    return {
+        "text": tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+        )
+    }
+ds = ds.map(preprocess)
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+ds = ds.map(tokenize, remove_columns=ds.column_names)
 # Apply quantization.
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")