#!/usr/bin/env python3
"""
AutoRound Mixed-Bits Quantization Script (GPU Accelerated)
==========================================================
Generated by Ansible for: Qwen/Qwen3-Coder-Next
Strategy (Intel's MoE recipe):
- Default: 4-bit for all layers (including experts)
- Exceptions: 8-bit for attention, gate
- shared_expert_gate: 16-bit (shape not divisible by 32)
- lm_head: original precision (excluded by AutoRound)
GPU Acceleration:
- iters=50 enables gradient-based tuning on GPU
- For iters>0, lr is auto-set to 1.0/iters (or use lr=5e-3 for iters=50)
- low_gpu_mem_usage=True offloads intermediates to CPU
Reference: https://huggingface.co/Intel/Qwen3-Next-80B-A3B-Thinking-int4-mixed-AutoRound
"""
import sys
import time
from auto_round import AutoRound
# Configuration (injected by Ansible)
MODEL_NAME: str = "Qwen/Qwen3-Coder-Next"  # HF hub id; passed as a string so AutoRound loads the model itself
OUTPUT_DIR: str = "/var/lib/inference/models/Qwen3-Coder-Next-int4-mixed-AutoRound"  # where quantize_and_save() writes
FORMAT: str = "auto_round"  # export format handed to quantize_and_save()
EXPERT_BITS: int = 4  # default bit width for every layer not listed in layer_config (i.e. the experts)
NON_EXPERT_BITS: int = 8  # exception bit width (attention projections, mlp.gate router)
GROUP_SIZE: int = 128  # quantization group size
SYM: bool = True  # symmetric quantization
ITERS: int = 50  # >0 enables gradient-based tuning on GPU; 0 would fall back to RTN
LOW_GPU_MEM: bool = True  # offload intermediates to CPU (low_gpu_mem_usage)
DEVICE_MAP: str = "0,1,2"  # GPU ids forwarded to AutoRound's device_map
# Learning rate: use 5e-3 for iters=50, None (auto) for iters>=200
LR: float = 5e-3
# Qwen3-Next has 48 layers with mixed attention types:
# - Layers 3,7,11,15,19,23,27,31,35,39,43,47: self_attn (q_proj, k_proj, v_proj, o_proj)
# - Other layers: linear_attn (in_proj_qkvz, in_proj_ba, out_proj)
# All layers have: mlp.gate, mlp.shared_expert_gate, mlp.shared_experts.*, mlp.experts.*
SELF_ATTN_LAYERS: list[int] = [3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47]
NUM_LAYERS: int = 48
def build_layer_config(num_layers=None, self_attn_layers=None, attn_bits=None):
    """
    Build a sparse AutoRound layer_config containing only the exceptions to
    the 4-bit default.

    Intel's approach: the default passed to AutoRound(bits=...) covers all
    layers (experts included); only layers needing higher precision are listed
    here. This results in ~240 config entries instead of 74,000+.

    From Intel's quantization_config.json:
    - Attention layers (self_attn or linear_attn) -> 8-bit
    - mlp.gate (router) -> 8-bit
    - mlp.shared_expert_gate -> 16-bit (shape not divisible by 32)
    - Everything else (experts, shared_experts) -> 4-bit default

    Args:
        num_layers: total transformer layer count; defaults to NUM_LAYERS.
        self_attn_layers: indices of layers using standard self-attention;
            every other layer is treated as linear attention. Defaults to
            SELF_ATTN_LAYERS.
        attn_bits: bit width for the attention / router-gate exceptions;
            defaults to NON_EXPERT_BITS.

    Returns:
        dict mapping fully-qualified module names to {"bits": n} entries.
    """
    # Fall back to the Ansible-injected module constants (keeps the original
    # zero-argument call sites working unchanged).
    num_layers = NUM_LAYERS if num_layers is None else num_layers
    self_attn_layers = SELF_ATTN_LAYERS if self_attn_layers is None else self_attn_layers
    attn_bits = NON_EXPERT_BITS if attn_bits is None else attn_bits
    self_attn = frozenset(self_attn_layers)  # O(1) membership test per layer

    layer_config = {}
    for i in range(num_layers):
        prefix = f"model.layers.{i}"
        # Attention projections -> higher precision. The projection names
        # differ between the two attention flavours.
        if i in self_attn:
            attn_module = "self_attn"
            projections = ("q_proj", "k_proj", "v_proj", "o_proj")
        else:
            attn_module = "linear_attn"
            projections = ("in_proj_qkvz", "in_proj_ba", "out_proj")
        for proj in projections:
            layer_config[f"{prefix}.{attn_module}.{proj}"] = {"bits": attn_bits}
        # MLP gate (router) -> higher precision.
        layer_config[f"{prefix}.mlp.gate"] = {"bits": attn_bits}
        # shared_expert_gate -> 16-bit (shape not divisible by 32, will be
        # skipped by AutoRound).
        layer_config[f"{prefix}.mlp.shared_expert_gate"] = {"bits": 16}
    return layer_config
def _format_elapsed(elapsed):
    """Render a duration in seconds as 'Hh Mm Ss', omitting hours when zero."""
    hours = int(elapsed // 3600)
    minutes = int((elapsed % 3600) // 60)
    seconds = int(elapsed % 60)
    if hours > 0:
        return f"{hours}h {minutes}m {seconds}s"
    return f"{minutes}m {seconds}s"


def _print_run_header(mode):
    """Print the run banner with the full (Ansible-injected) configuration."""
    print("=" * 60)
    print(f"AutoRound Mixed-Bits Quantization ({mode})")
    print("=" * 60)
    print(f"Model: {MODEL_NAME}")
    print(f"Default bits: {EXPERT_BITS} (experts)")
    print(f"Exception bits: {NON_EXPERT_BITS} (attention, gate)")
    print(f"Group size: {GROUP_SIZE}")
    print(f"Symmetric: {SYM}")
    print(f"Iterations: {ITERS}")
    print(f"Learning rate: {LR}")
    print(f"Low GPU mem: {LOW_GPU_MEM}")
    print(f"Device map: {DEVICE_MAP}")
    print(f"Output: {OUTPUT_DIR}")
    print(f"Format: {FORMAT}")
    print("=" * 60)


def _summarize_layer_config(layer_config):
    """Print entry counts for the sparse layer_config, broken down by type."""
    print(f" Config entries: {len(layer_config)} (vs 74,000+ with dense approach)")
    attn_count = sum(1 for k in layer_config if "attn" in k)
    gate_count = sum(1 for k in layer_config if ".mlp.gate" in k)
    shared_gate_count = sum(1 for k in layer_config if "shared_expert_gate" in k)
    print(f" Attention layers ({NON_EXPERT_BITS}-bit): {attn_count}")
    print(f" MLP gate ({NON_EXPERT_BITS}-bit): {gate_count}")
    print(f" Shared expert gate (16-bit): {shared_gate_count}")
    print(f" Everything else: {EXPERT_BITS}-bit (experts, shared_experts, etc.)")


def _run_quantization(layer_config):
    """Instantiate AutoRound and write the quantized model to OUTPUT_DIR."""
    # Key: pass model NAME as string, not loaded model object.
    # AutoRound will load the model internally with proper device mapping.
    autoround = AutoRound(
        MODEL_NAME,  # String, not model object - Intel's approach
        bits=EXPERT_BITS,  # Default bits for all layers
        group_size=GROUP_SIZE,
        sym=SYM,
        iters=ITERS,
        lr=LR,  # Learning rate for tuning
        layer_config=layer_config,  # Sparse config with exceptions only
        # GPU settings - use device_map (not device, which is deprecated)
        device_map=DEVICE_MAP,
        low_gpu_mem_usage=LOW_GPU_MEM,
        # CPU memory optimization
        low_cpu_mem_usage=True,
    )
    autoround.quantize_and_save(format=FORMAT, output_dir=OUTPUT_DIR)


def main():
    """Drive the mixed-bits quantization: banner, config build, run, report."""
    start_time = time.time()
    mode = "GPU-accelerated tuning" if ITERS > 0 else "RTN (CPU-bound)"
    _print_run_header(mode)

    # Build sparse layer config (only exceptions)
    print("\n[1/2] Building sparse layer configuration...")
    layer_config = build_layer_config()
    _summarize_layer_config(layer_config)

    # Run quantization
    print(f"\n[2/2] Running AutoRound quantization (iters={ITERS})...")
    if ITERS > 0:
        print(" GPU-accelerated mode: will use GPU for gradient tuning")
    else:
        print(" RTN mode: CPU-bound optimized rounding")
    print(" Note: AutoRound will load the model internally")
    _run_quantization(layer_config)

    elapsed = time.time() - start_time
    print(f"\n{'=' * 60}")
    print("Quantization complete!")
    print(f" Output: {OUTPUT_DIR}")
    print(f" Time: {_format_elapsed(elapsed)}")
    print("=" * 60)


if __name__ == "__main__":
    main()