"""
AutoRound Mixed-Bits Quantization Script (GPU Accelerated)
==========================================================
Generated by Ansible for: Qwen/Qwen3-Coder-Next

Strategy (Intel's MoE recipe):
  - Default: 4-bit for all layers (including experts)
  - Exceptions: 8-bit for attention and the router gate
  - shared_expert_gate: 16-bit (shape not divisible by 32)
  - lm_head: original precision (excluded by AutoRound)

GPU acceleration:
  - iters=50 enables gradient-based tuning on the GPU
  - For iters>0, lr is auto-set to 1.0/iters (or use lr=5e-3 for iters=50)
  - low_gpu_mem_usage=True offloads intermediates to the CPU

Reference: https://huggingface.co/Intel/Qwen3-Next-80B-A3B-Thinking-int4-mixed-AutoRound
"""
| | import sys |
| | import time |
| | from auto_round import AutoRound |
| |
|
| | |
# --- Model / output ---------------------------------------------------------
MODEL_NAME = "Qwen/Qwen3-Coder-Next"
OUTPUT_DIR = "/var/lib/inference/models/Qwen3-Coder-Next-int4-mixed-AutoRound"
FORMAT = "auto_round"   # serialization format passed to quantize_and_save()

# --- Quantization recipe ----------------------------------------------------
EXPERT_BITS = 4         # default bit width (covers MoE experts)
NON_EXPERT_BITS = 8     # exception bit width (attention layers, mlp.gate)
GROUP_SIZE = 128        # quantization group size
SYM = True              # symmetric quantization
ITERS = 50              # >0 enables gradient-based tuning; 0 would be RTN
LOW_GPU_MEM = True      # low_gpu_mem_usage: offload intermediates to CPU
DEVICE_MAP = "0,1,2"    # device ids handed to AutoRound (presumably GPUs 0-2
                        # — confirm against the deployment host)

# Learning rate for the tuning loop; the module docstring recommends 5e-3
# for iters=50 (otherwise AutoRound auto-sets lr to 1.0/iters).
LR = 5e-3


# Layer indices that use full self-attention; every other layer index uses
# linear attention (the two types expose different projection module names —
# see build_layer_config()).
SELF_ATTN_LAYERS = [3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47]
NUM_LAYERS = 48         # total transformer layers in the model
| |
|
| |
|
def build_layer_config(num_layers=None, attn_layers=None, high_bits=None):
    """
    Build a sparse AutoRound layer_config containing only the exceptions
    to the 4-bit default.

    Intel's approach: the quantizer default is 4-bit, so only layers that
    need higher precision are listed. This yields ~240 config entries
    instead of 74,000+ with a dense per-tensor config.

    From Intel's quantization_config.json:
      - Attention layers (self_attn or linear_attn) -> 8-bit
      - mlp.gate (router)                           -> 8-bit
      - mlp.shared_expert_gate                      -> 16-bit (shape not
        divisible by 32)
      - Everything else (experts, shared_experts)   -> 4-bit default

    Args:
        num_layers: Total transformer layer count. Defaults to NUM_LAYERS.
        attn_layers: Iterable of layer indices that use full self-attention;
            all other layers are treated as linear attention. Defaults to
            SELF_ATTN_LAYERS.
        high_bits: Bit width for the attention/gate exceptions. Defaults to
            NON_EXPERT_BITS.

    Returns:
        dict mapping module-name prefixes to {"bits": int} overrides.
    """
    # Resolve defaults lazily so the module constants remain the single
    # source of truth for the normal (no-argument) call.
    if num_layers is None:
        num_layers = NUM_LAYERS
    if attn_layers is None:
        attn_layers = SELF_ATTN_LAYERS
    if high_bits is None:
        high_bits = NON_EXPERT_BITS

    attn_layers = set(attn_layers)  # O(1) membership tests in the loop
    layer_config = {}

    for i in range(num_layers):
        prefix = f"model.layers.{i}"

        # The two attention flavors expose different projection module names.
        if i in attn_layers:
            attn_mod, projs = "self_attn", ("q_proj", "k_proj", "v_proj", "o_proj")
        else:
            attn_mod, projs = "linear_attn", ("in_proj_qkvz", "in_proj_ba", "out_proj")
        for proj in projs:
            layer_config[f"{prefix}.{attn_mod}.{proj}"] = {"bits": high_bits}

        # MoE router: small but accuracy-critical, keep at higher precision.
        layer_config[f"{prefix}.mlp.gate"] = {"bits": high_bits}

        # Shared-expert gate: shape not divisible by 32, so keep at 16-bit.
        layer_config[f"{prefix}.mlp.shared_expert_gate"] = {"bits": 16}

    return layer_config
| |
|
| |
|
def main():
    """Drive the mixed-bits AutoRound quantization run end to end."""
    start_time = time.time()

    mode = "GPU-accelerated tuning" if ITERS > 0 else "RTN (CPU-bound)"

    # Run header: echo every knob so the log file is self-describing.
    print("=" * 60)
    print(f"AutoRound Mixed-Bits Quantization ({mode})")
    print("=" * 60)
    for header_line in (
        f"Model: {MODEL_NAME}",
        f"Default bits: {EXPERT_BITS} (experts)",
        f"Exception bits: {NON_EXPERT_BITS} (attention, gate)",
        f"Group size: {GROUP_SIZE}",
        f"Symmetric: {SYM}",
        f"Iterations: {ITERS}",
        f"Learning rate: {LR}",
        f"Low GPU mem: {LOW_GPU_MEM}",
        f"Device map: {DEVICE_MAP}",
        f"Output: {OUTPUT_DIR}",
        f"Format: {FORMAT}",
    ):
        print(header_line)
    print("=" * 60)

    # Phase 1: sparse exception table; anything absent falls back to the
    # 4-bit default inside AutoRound.
    print("\n[1/2] Building sparse layer configuration...")
    layer_config = build_layer_config()
    print(f" Config entries: {len(layer_config)} (vs 74,000+ with dense approach)")

    # Sanity summary of how the exceptions break down by category.
    attn_count = sum("attn" in key for key in layer_config)
    gate_count = sum(".mlp.gate" in key for key in layer_config)
    shared_gate_count = sum("shared_expert_gate" in key for key in layer_config)

    print(f" Attention layers ({NON_EXPERT_BITS}-bit): {attn_count}")
    print(f" MLP gate ({NON_EXPERT_BITS}-bit): {gate_count}")
    print(f" Shared expert gate (16-bit): {shared_gate_count}")
    print(f" Everything else: {EXPERT_BITS}-bit (experts, shared_experts, etc.)")

    # Phase 2: quantize. AutoRound loads the model itself from MODEL_NAME.
    print(f"\n[2/2] Running AutoRound quantization (iters={ITERS})...")
    if ITERS > 0:
        print(" GPU-accelerated mode: will use GPU for gradient tuning")
    else:
        print(" RTN mode: CPU-bound optimized rounding")
    print(" Note: AutoRound will load the model internally")

    autoround = AutoRound(
        MODEL_NAME,
        bits=EXPERT_BITS,
        group_size=GROUP_SIZE,
        sym=SYM,
        iters=ITERS,
        lr=LR,
        layer_config=layer_config,
        device_map=DEVICE_MAP,
        low_gpu_mem_usage=LOW_GPU_MEM,
        low_cpu_mem_usage=True,
    )

    autoround.quantize_and_save(format=FORMAT, output_dir=OUTPUT_DIR)

    # Report wall-clock duration as h/m/s.
    total_seconds = int(time.time() - start_time)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    print(f"\n{'=' * 60}")
    print("Quantization complete!")
    print(f" Output: {OUTPUT_DIR}")
    if hours > 0:
        print(f" Time: {hours}h {minutes}m {seconds}s")
    else:
        print(f" Time: {minutes}m {seconds}s")
    print("=" * 60)
| |
|
| |
|
# Script entry point: run the quantization only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
| |
|