#!/usr/bin/env python3
"""
AutoRound Mixed-Bits Quantization Script (GPU Accelerated)
==========================================================
Generated by Ansible for: Qwen/Qwen3-Coder-Next

Strategy (Intel's MoE recipe):
- Default: 4-bit for all layers (including experts)
- Exceptions: 8-bit for attention, gate
- shared_expert_gate: 16-bit (shape not divisible by 32)
- lm_head: original precision (excluded by AutoRound)

GPU Acceleration:
- iters=50 enables gradient-based tuning on GPU
- For iters>0, lr is auto-set to 1.0/iters (or use lr=5e-3 for iters=50)
- low_gpu_mem_usage=True offloads intermediates to CPU

Reference:
https://huggingface.co/Intel/Qwen3-Next-80B-A3B-Thinking-int4-mixed-AutoRound
"""

import sys
import time

# Configuration (injected by Ansible)
MODEL_NAME = "Qwen/Qwen3-Coder-Next"
OUTPUT_DIR = "/var/lib/inference/models/Qwen3-Coder-Next-int4-mixed-AutoRound"
FORMAT = "auto_round"
EXPERT_BITS = 4
NON_EXPERT_BITS = 8
GROUP_SIZE = 128
SYM = True
ITERS = 50
LOW_GPU_MEM = True
DEVICE_MAP = "0,1,2"

# Learning rate: use 5e-3 for iters=50, None (auto) for iters>=200
LR = 5e-3

# Qwen3-Next has 48 layers with mixed attention types:
# - Layers 3,7,11,15,19,23,27,31,35,39,43,47: self_attn
#   (q_proj, k_proj, v_proj, o_proj)
# - Other layers: linear_attn (in_proj_qkvz, in_proj_ba, out_proj)
# All layers have: mlp.gate, mlp.shared_expert_gate, mlp.shared_experts.*,
# mlp.experts.*
SELF_ATTN_LAYERS = [3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47]
NUM_LAYERS = 48


def build_layer_config():
    """
    Build the sparse layer_config containing only the non-default entries.

    Intel's approach: the default is 4-bit, and only layers that need a
    higher precision are listed explicitly. This yields 252 config entries
    (156 attention projections + 48 routers + 48 shared-expert gates)
    instead of 74,000+ with a dense per-module config.

    From Intel's quantization_config.json:
    - Attention layers (self_attn or linear_attn) -> 8-bit
    - mlp.gate (router)                           -> 8-bit
    - mlp.shared_expert_gate                      -> 16-bit
      (shape not divisible by 32, will be skipped by AutoRound)
    - Everything else (experts, shared_experts)   -> 4-bit default

    Returns:
        dict: mapping of module path -> {"bits": int} for every module
        that deviates from the 4-bit default.
    """
    layer_config = {}
    for i in range(NUM_LAYERS):
        prefix = f"model.layers.{i}"

        # Attention layers -> 8-bit. The projection names differ between
        # the standard self-attention and linear-attention layer types.
        if i in SELF_ATTN_LAYERS:
            attn_module, projections = "self_attn", (
                "q_proj", "k_proj", "v_proj", "o_proj",
            )
        else:
            attn_module, projections = "linear_attn", (
                "in_proj_qkvz", "in_proj_ba", "out_proj",
            )
        for proj in projections:
            layer_config[f"{prefix}.{attn_module}.{proj}"] = {
                "bits": NON_EXPERT_BITS
            }

        # MLP gate (router) -> 8-bit
        layer_config[f"{prefix}.mlp.gate"] = {"bits": NON_EXPERT_BITS}

        # shared_expert_gate -> 16-bit (shape not divisible by 32, will be
        # skipped by AutoRound)
        layer_config[f"{prefix}.mlp.shared_expert_gate"] = {"bits": 16}

    return layer_config


def _print_banner(mode):
    """Print the run-configuration banner for the given mode label."""
    print("=" * 60)
    print(f"AutoRound Mixed-Bits Quantization ({mode})")
    print("=" * 60)
    print(f"Model: {MODEL_NAME}")
    print(f"Default bits: {EXPERT_BITS} (experts)")
    print(f"Exception bits: {NON_EXPERT_BITS} (attention, gate)")
    print(f"Group size: {GROUP_SIZE}")
    print(f"Symmetric: {SYM}")
    print(f"Iterations: {ITERS}")
    print(f"Learning rate: {LR}")
    print(f"Low GPU mem: {LOW_GPU_MEM}")
    print(f"Device map: {DEVICE_MAP}")
    print(f"Output: {OUTPUT_DIR}")
    print(f"Format: {FORMAT}")
    print("=" * 60)


def _print_config_summary(layer_config):
    """Print entry counts of the sparse layer config, grouped by module type."""
    print(f"  Config entries: {len(layer_config)} "
          f"(vs 74,000+ with dense approach)")
    attn_count = sum(1 for k in layer_config if "attn" in k)
    # ".mlp.gate" does not match ".mlp.shared_expert_gate", so the router
    # and shared-expert-gate counts stay disjoint.
    gate_count = sum(1 for k in layer_config if ".mlp.gate" in k)
    shared_gate_count = sum(
        1 for k in layer_config if "shared_expert_gate" in k
    )
    print(f"  Attention layers ({NON_EXPERT_BITS}-bit): {attn_count}")
    print(f"  MLP gate ({NON_EXPERT_BITS}-bit): {gate_count}")
    print(f"  Shared expert gate (16-bit): {shared_gate_count}")
    print(f"  Everything else: {EXPERT_BITS}-bit "
          f"(experts, shared_experts, etc.)")


def _format_duration(elapsed):
    """Format a duration in seconds as 'Hh Mm Ss', omitting hours when zero."""
    hours = int(elapsed // 3600)
    minutes = int((elapsed % 3600) // 60)
    seconds = int(elapsed % 60)
    if hours > 0:
        return f"{hours}h {minutes}m {seconds}s"
    return f"{minutes}m {seconds}s"


def main():
    """Run the mixed-bits AutoRound quantization end to end."""
    start_time = time.time()
    mode = "GPU-accelerated tuning" if ITERS > 0 else "RTN (CPU-bound)"
    _print_banner(mode)

    # Build sparse layer config (only exceptions)
    print("\n[1/2] Building sparse layer configuration...")
    layer_config = build_layer_config()
    _print_config_summary(layer_config)

    # Run quantization.
    # Key: pass model NAME as string, not loaded model object —
    # AutoRound will load the model internally with proper device mapping.
    print(f"\n[2/2] Running AutoRound quantization (iters={ITERS})...")
    if ITERS > 0:
        print("  GPU-accelerated mode: will use GPU for gradient tuning")
    else:
        print("  RTN mode: CPU-bound optimized rounding")
    print("  Note: AutoRound will load the model internally")

    # Imported lazily so this module (and build_layer_config) can be
    # imported and tested without auto_round installed.
    from auto_round import AutoRound

    autoround = AutoRound(
        MODEL_NAME,            # String, not model object - Intel's approach
        bits=EXPERT_BITS,      # Default bits for all layers
        group_size=GROUP_SIZE,
        sym=SYM,
        iters=ITERS,
        lr=LR,                 # Learning rate for tuning
        layer_config=layer_config,  # Sparse config with exceptions only
        # GPU settings - use device_map (not device, which is deprecated)
        device_map=DEVICE_MAP,
        low_gpu_mem_usage=LOW_GPU_MEM,
        # CPU memory optimization
        low_cpu_mem_usage=True,
    )
    autoround.quantize_and_save(format=FORMAT, output_dir=OUTPUT_DIR)

    elapsed = time.time() - start_time
    print(f"\n{'=' * 60}")
    print("Quantization complete!")
    print(f"  Output: {OUTPUT_DIR}")
    print(f"  Time: {_format_duration(elapsed)}")
    print("=" * 60)


if __name__ == "__main__":
    main()