#!/usr/bin/env python3
"""
AutoRound Mixed-Bits Quantization Script (GPU Accelerated)
==========================================================
Generated by Ansible for: Qwen/Qwen3-Coder-Next
Strategy (Intel's MoE recipe):
- Default: 4-bit for all layers (including experts)
- Exceptions: 8-bit for attention, gate
- shared_expert_gate: 16-bit (shape not divisible by 32)
- lm_head: original precision (excluded by AutoRound)
GPU Acceleration:
- iters=50 enables gradient-based tuning on GPU
- For iters>0, lr is auto-set to 1.0/iters (or use lr=5e-3 for iters=50)
- low_gpu_mem_usage=True offloads intermediates to CPU
Reference: https://huggingface.co/Intel/Qwen3-Next-80B-A3B-Thinking-int4-mixed-AutoRound
"""
import sys
import time
from auto_round import AutoRound
# Configuration (injected by Ansible)
MODEL_NAME: str = "Qwen/Qwen3-Coder-Next"  # HF hub id; passed as a string so AutoRound loads the model itself
OUTPUT_DIR: str = "/var/lib/inference/models/Qwen3-Coder-Next-int4-mixed-AutoRound"  # where quantize_and_save() writes
FORMAT: str = "auto_round"  # export format handed to quantize_and_save()
EXPERT_BITS: int = 4  # default bit width for every layer not listed in layer_config (i.e. the experts)
NON_EXPERT_BITS: int = 8  # exception bit width (attention projections, mlp.gate router)
GROUP_SIZE: int = 128  # quantization group size
SYM: bool = True  # symmetric quantization
ITERS: int = 50  # >0 enables gradient-based tuning on GPU; 0 would fall back to RTN
LOW_GPU_MEM: bool = True  # offload intermediates to CPU (low_gpu_mem_usage)
DEVICE_MAP: str = "0,1,2"  # GPU ids forwarded to AutoRound's device_map
# Learning rate: use 5e-3 for iters=50, None (auto) for iters>=200
LR: float = 5e-3
# Qwen3-Next has 48 layers with mixed attention types:
# - Layers 3,7,11,15,19,23,27,31,35,39,43,47: self_attn (q_proj, k_proj, v_proj, o_proj)
# - Other layers: linear_attn (in_proj_qkvz, in_proj_ba, out_proj)
# All layers have: mlp.gate, mlp.shared_expert_gate, mlp.shared_experts.*, mlp.experts.*
SELF_ATTN_LAYERS: list[int] = [3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47]
NUM_LAYERS: int = 48
def build_layer_config(num_layers=None, self_attn_layers=None, attn_bits=None):
    """
    Build a sparse AutoRound layer_config containing only the exceptions to
    the 4-bit default.

    Intel's approach: the default passed to AutoRound(bits=...) covers all
    layers (experts included); only layers needing higher precision are listed
    here. This results in ~240 config entries instead of 74,000+.

    From Intel's quantization_config.json:
    - Attention layers (self_attn or linear_attn) -> 8-bit
    - mlp.gate (router) -> 8-bit
    - mlp.shared_expert_gate -> 16-bit (shape not divisible by 32)
    - Everything else (experts, shared_experts) -> 4-bit default

    Args:
        num_layers: total transformer layer count; defaults to NUM_LAYERS.
        self_attn_layers: indices of layers using standard self-attention;
            every other layer is treated as linear attention. Defaults to
            SELF_ATTN_LAYERS.
        attn_bits: bit width for the attention / router-gate exceptions;
            defaults to NON_EXPERT_BITS.

    Returns:
        dict mapping fully-qualified module names to {"bits": n} entries.
    """
    # Fall back to the Ansible-injected module constants (keeps the original
    # zero-argument call sites working unchanged).
    num_layers = NUM_LAYERS if num_layers is None else num_layers
    self_attn_layers = SELF_ATTN_LAYERS if self_attn_layers is None else self_attn_layers
    attn_bits = NON_EXPERT_BITS if attn_bits is None else attn_bits
    self_attn = frozenset(self_attn_layers)  # O(1) membership test per layer

    layer_config = {}
    for i in range(num_layers):
        prefix = f"model.layers.{i}"
        # Attention projections -> higher precision. The projection names
        # differ between the two attention flavours.
        if i in self_attn:
            attn_module = "self_attn"
            projections = ("q_proj", "k_proj", "v_proj", "o_proj")
        else:
            attn_module = "linear_attn"
            projections = ("in_proj_qkvz", "in_proj_ba", "out_proj")
        for proj in projections:
            layer_config[f"{prefix}.{attn_module}.{proj}"] = {"bits": attn_bits}
        # MLP gate (router) -> higher precision.
        layer_config[f"{prefix}.mlp.gate"] = {"bits": attn_bits}
        # shared_expert_gate -> 16-bit (shape not divisible by 32, will be
        # skipped by AutoRound).
        layer_config[f"{prefix}.mlp.shared_expert_gate"] = {"bits": 16}
    return layer_config
def _format_elapsed(elapsed):
    """Render a duration in seconds as 'Hh Mm Ss', omitting hours when zero."""
    hours = int(elapsed // 3600)
    minutes = int((elapsed % 3600) // 60)
    seconds = int(elapsed % 60)
    if hours > 0:
        return f"{hours}h {minutes}m {seconds}s"
    return f"{minutes}m {seconds}s"


def _print_run_header(mode):
    """Print the run banner with the full (Ansible-injected) configuration."""
    print("=" * 60)
    print(f"AutoRound Mixed-Bits Quantization ({mode})")
    print("=" * 60)
    print(f"Model: {MODEL_NAME}")
    print(f"Default bits: {EXPERT_BITS} (experts)")
    print(f"Exception bits: {NON_EXPERT_BITS} (attention, gate)")
    print(f"Group size: {GROUP_SIZE}")
    print(f"Symmetric: {SYM}")
    print(f"Iterations: {ITERS}")
    print(f"Learning rate: {LR}")
    print(f"Low GPU mem: {LOW_GPU_MEM}")
    print(f"Device map: {DEVICE_MAP}")
    print(f"Output: {OUTPUT_DIR}")
    print(f"Format: {FORMAT}")
    print("=" * 60)


def _summarize_layer_config(layer_config):
    """Print entry counts for the sparse layer_config, broken down by type."""
    print(f" Config entries: {len(layer_config)} (vs 74,000+ with dense approach)")
    attn_count = sum(1 for k in layer_config if "attn" in k)
    gate_count = sum(1 for k in layer_config if ".mlp.gate" in k)
    shared_gate_count = sum(1 for k in layer_config if "shared_expert_gate" in k)
    print(f" Attention layers ({NON_EXPERT_BITS}-bit): {attn_count}")
    print(f" MLP gate ({NON_EXPERT_BITS}-bit): {gate_count}")
    print(f" Shared expert gate (16-bit): {shared_gate_count}")
    print(f" Everything else: {EXPERT_BITS}-bit (experts, shared_experts, etc.)")


def _run_quantization(layer_config):
    """Instantiate AutoRound and write the quantized model to OUTPUT_DIR."""
    # Key: pass model NAME as string, not loaded model object.
    # AutoRound will load the model internally with proper device mapping.
    autoround = AutoRound(
        MODEL_NAME,  # String, not model object - Intel's approach
        bits=EXPERT_BITS,  # Default bits for all layers
        group_size=GROUP_SIZE,
        sym=SYM,
        iters=ITERS,
        lr=LR,  # Learning rate for tuning
        layer_config=layer_config,  # Sparse config with exceptions only
        # GPU settings - use device_map (not device, which is deprecated)
        device_map=DEVICE_MAP,
        low_gpu_mem_usage=LOW_GPU_MEM,
        # CPU memory optimization
        low_cpu_mem_usage=True,
    )
    autoround.quantize_and_save(format=FORMAT, output_dir=OUTPUT_DIR)


def main():
    """Drive the mixed-bits quantization: banner, config build, run, report."""
    start_time = time.time()
    mode = "GPU-accelerated tuning" if ITERS > 0 else "RTN (CPU-bound)"
    _print_run_header(mode)

    # Build sparse layer config (only exceptions)
    print("\n[1/2] Building sparse layer configuration...")
    layer_config = build_layer_config()
    _summarize_layer_config(layer_config)

    # Run quantization
    print(f"\n[2/2] Running AutoRound quantization (iters={ITERS})...")
    if ITERS > 0:
        print(" GPU-accelerated mode: will use GPU for gradient tuning")
    else:
        print(" RTN mode: CPU-bound optimized rounding")
    print(" Note: AutoRound will load the model internally")
    _run_quantization(layer_config)

    elapsed = time.time() - start_time
    print(f"\n{'=' * 60}")
    print("Quantization complete!")
    print(f" Output: {OUTPUT_DIR}")
    print(f" Time: {_format_elapsed(elapsed)}")
    print("=" * 60)


if __name__ == "__main__":
    main()