File size: 6,016 Bytes
a87cd2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python3
"""
AutoRound Mixed-Bits Quantization Script (GPU Accelerated)
==========================================================
Generated by Ansible for: Qwen/Qwen3-Coder-Next

Strategy (Intel's MoE recipe):
  - Default: 4-bit for all layers (including experts)
  - Exceptions: 8-bit for attention, gate
  - shared_expert_gate: 16-bit (shape not divisible by 32)
  - lm_head: original precision (excluded by AutoRound)

GPU Acceleration:
  - iters=50 enables gradient-based tuning on GPU
  - For iters>0, lr is auto-set to 1.0/iters (or use lr=5e-3 for iters=50)
  - low_gpu_mem_usage=True offloads intermediates to CPU

Reference: https://huggingface.co/Intel/Qwen3-Next-80B-A3B-Thinking-int4-mixed-AutoRound
"""
import sys
import time
from auto_round import AutoRound

# Configuration (injected by Ansible)
# NOTE(review): these values are templated in by Ansible; edit the playbook
# vars, not this file, to change them.
MODEL_NAME = "Qwen/Qwen3-Coder-Next"
OUTPUT_DIR = "/var/lib/inference/models/Qwen3-Coder-Next-int4-mixed-AutoRound"
FORMAT = "auto_round"
EXPERT_BITS = 4        # default bit width applied to every layer (incl. experts)
NON_EXPERT_BITS = 8    # exception bit width for attention and router layers
GROUP_SIZE = 128
SYM = True             # symmetric quantization
ITERS = 50             # >0 enables gradient-based tuning (GPU path)
LOW_GPU_MEM = True     # offload intermediates to CPU during tuning
DEVICE_MAP = "0,1,2"   # CUDA device indices passed to AutoRound

# Learning rate: use 5e-3 for iters=50, None (auto) for iters>=200
LR = 5e-3
# Qwen3-Next has 48 layers with mixed attention types:
# - Layers 3,7,11,15,19,23,27,31,35,39,43,47: self_attn (q_proj, k_proj, v_proj, o_proj)
# - Other layers: linear_attn (in_proj_qkvz, in_proj_ba, out_proj)
# All layers have: mlp.gate, mlp.shared_expert_gate, mlp.shared_experts.*, mlp.experts.*

SELF_ATTN_LAYERS = [3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47]
NUM_LAYERS = 48


def build_layer_config(num_layers=None, self_attn_layers=None, attn_bits=None):
    """
    Build sparse layer_config with only the higher-precision exceptions.

    Intel's approach: default is 4-bit, only specify layers that need higher
    precision. This results in ~240 config entries instead of 74,000+.

    From Intel's quantization_config.json:
    - Attention layers (self_attn or linear_attn) -> 8-bit
    - mlp.gate (router) -> 8-bit
    - mlp.shared_expert_gate -> 16-bit (shape not divisible by 32)
    - Everything else (experts, shared_experts) -> 4-bit default

    All arguments are optional and default to the module-level constants,
    so the existing no-argument call is unchanged:
        num_layers: total number of transformer layers to configure.
        self_attn_layers: indices of layers using standard self-attention;
            all other layers are treated as linear attention.
        attn_bits: bit width for attention projections and the MLP router.

    Returns:
        dict mapping layer-name prefixes to {"bits": int} overrides.
    """
    # Resolve defaults lazily so the function stays usable standalone.
    num_layers = NUM_LAYERS if num_layers is None else num_layers
    self_attn_layers = SELF_ATTN_LAYERS if self_attn_layers is None else self_attn_layers
    attn_bits = NON_EXPERT_BITS if attn_bits is None else attn_bits

    layer_config = {}
    for i in range(num_layers):
        prefix = f"model.layers.{i}"

        # Attention layers -> higher precision (8-bit by default)
        if i in self_attn_layers:
            # Standard self-attention
            for proj in ("q_proj", "k_proj", "v_proj", "o_proj"):
                layer_config[f"{prefix}.self_attn.{proj}"] = {"bits": attn_bits}
        else:
            # Linear attention
            for proj in ("in_proj_qkvz", "in_proj_ba", "out_proj"):
                layer_config[f"{prefix}.linear_attn.{proj}"] = {"bits": attn_bits}

        # MLP gate (router) -> higher precision
        layer_config[f"{prefix}.mlp.gate"] = {"bits": attn_bits}

        # shared_expert_gate -> 16-bit (shape not divisible by 32, will be
        # skipped by AutoRound)
        layer_config[f"{prefix}.mlp.shared_expert_gate"] = {"bits": 16}

    return layer_config


def main():
    """Report the configuration, build the sparse layer map, then run
    AutoRound quantization and print the elapsed wall-clock time."""
    t0 = time.time()

    # Tuning mode is decided purely by the iteration count.
    mode = "GPU-accelerated tuning" if ITERS > 0 else "RTN (CPU-bound)"
    rule = "=" * 60

    print(rule)
    print(f"AutoRound Mixed-Bits Quantization ({mode})")
    print(rule)
    for report_line in (
        f"Model:           {MODEL_NAME}",
        f"Default bits:    {EXPERT_BITS} (experts)",
        f"Exception bits:  {NON_EXPERT_BITS} (attention, gate)",
        f"Group size:      {GROUP_SIZE}",
        f"Symmetric:       {SYM}",
        f"Iterations:      {ITERS}",
        f"Learning rate:   {LR}",
        f"Low GPU mem:     {LOW_GPU_MEM}",
        f"Device map:      {DEVICE_MAP}",
        f"Output:          {OUTPUT_DIR}",
        f"Format:          {FORMAT}",
    ):
        print(report_line)
    print(rule)

    # Step 1: sparse layer config holding only the precision exceptions.
    print("\n[1/2] Building sparse layer configuration...")
    layer_config = build_layer_config()
    print(f"  Config entries: {len(layer_config)} (vs 74,000+ with dense approach)")

    # Tally the entries per category for the report (bools sum as 0/1).
    attn_total = sum("attn" in key for key in layer_config)
    gate_total = sum(".mlp.gate" in key for key in layer_config)
    shared_total = sum("shared_expert_gate" in key for key in layer_config)

    print(f"  Attention layers ({NON_EXPERT_BITS}-bit):      {attn_total}")
    print(f"  MLP gate ({NON_EXPERT_BITS}-bit):              {gate_total}")
    print(f"  Shared expert gate (16-bit):   {shared_total}")
    print(f"  Everything else:               {EXPERT_BITS}-bit (experts, shared_experts, etc.)")

    # Step 2: hand the model NAME (a string, not a loaded object) to
    # AutoRound so it loads the model itself with proper device mapping.
    print(f"\n[2/2] Running AutoRound quantization (iters={ITERS})...")
    if ITERS > 0:
        print("  GPU-accelerated mode: will use GPU for gradient tuning")
    else:
        print("  RTN mode: CPU-bound optimized rounding")
    print("  Note: AutoRound will load the model internally")

    autoround = AutoRound(
        MODEL_NAME,                  # string, not model object - Intel's approach
        bits=EXPERT_BITS,            # default bit width for every layer
        group_size=GROUP_SIZE,
        sym=SYM,
        iters=ITERS,
        lr=LR,                       # learning rate for tuning
        layer_config=layer_config,   # sparse config with exceptions only
        device_map=DEVICE_MAP,       # use device_map (device is deprecated)
        low_gpu_mem_usage=LOW_GPU_MEM,
        low_cpu_mem_usage=True,      # CPU memory optimization
    )

    autoround.quantize_and_save(format=FORMAT, output_dir=OUTPUT_DIR)

    # Wall-clock summary: whole seconds split into h/m/s via divmod.
    hrs, rem = divmod(int(time.time() - t0), 3600)
    mins, secs = divmod(rem, 60)

    print(f"\n{'=' * 60}")
    print("Quantization complete!")
    print(f"  Output: {OUTPUT_DIR}")
    if hrs > 0:
        print(f"  Time:   {hrs}h {mins}m {secs}s")
    else:
        print(f"  Time:   {mins}m {secs}s")
    print(rule)


# Script entry point: run quantization only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()