# NOTE: HuggingFace file-viewer page residue removed from the top of this file
# (uploader "raydelossantos", commit message "Add quantized model files",
# commit a87cd2c verified) — it was not valid Python and broke the script.
#!/usr/bin/env python3
"""
AutoRound Mixed-Bits Quantization Script (GPU Accelerated)
==========================================================
Generated by Ansible for: Qwen/Qwen3-Coder-Next
Strategy (Intel's MoE recipe):
- Default: 4-bit for all layers (including experts)
- Exceptions: 8-bit for attention, gate
- shared_expert_gate: 16-bit (shape not divisible by 32)
- lm_head: original precision (excluded by AutoRound)
GPU Acceleration:
- iters=50 enables gradient-based tuning on GPU
- For iters>0, lr is auto-set to 1.0/iters (or use lr=5e-3 for iters=50)
- low_gpu_mem_usage=True offloads intermediates to CPU
Reference: https://huggingface.co/Intel/Qwen3-Next-80B-A3B-Thinking-int4-mixed-AutoRound
"""
import sys
import time
from auto_round import AutoRound
# Configuration (injected by Ansible at template-render time)
MODEL_NAME = "Qwen/Qwen3-Coder-Next"  # HF model id to quantize
OUTPUT_DIR = "/var/lib/inference/models/Qwen3-Coder-Next-int4-mixed-AutoRound"
FORMAT = "auto_round"  # export format passed to quantize_and_save()
EXPERT_BITS = 4  # default bit-width for all layers (covers the MoE experts)
NON_EXPERT_BITS = 8  # higher-precision exceptions: attention projections, router gate
GROUP_SIZE = 128  # quantization group size
SYM = True  # symmetric quantization
ITERS = 50  # >0 enables gradient-based tuning on GPU (0 would mean RTN)
LOW_GPU_MEM = True  # offload tuning intermediates to CPU (see module docstring)
DEVICE_MAP = "0,1,2"  # device indices handed to AutoRound's device_map (presumably GPU ordinals — confirm)
# Learning rate: use 5e-3 for iters=50, None (auto) for iters>=200
LR = 5e-3
# Qwen3-Next has 48 layers with mixed attention types:
# - Layers 3,7,11,15,19,23,27,31,35,39,43,47: self_attn (q_proj, k_proj, v_proj, o_proj)
# - Other layers: linear_attn (in_proj_qkvz, in_proj_ba, out_proj)
# All layers have: mlp.gate, mlp.shared_expert_gate, mlp.shared_experts.*, mlp.experts.*
SELF_ATTN_LAYERS = [3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47]
NUM_LAYERS = 48
def build_layer_config(num_layers=None, self_attn_layers=None, non_expert_bits=None):
    """
    Build a sparse AutoRound layer_config holding only the exceptions to the
    4-bit default.

    Intel's approach: the default bit-width (set on the AutoRound object)
    already covers every layer; only layers needing higher precision are
    listed here. This yields ~240 entries instead of 74,000+ per-expert ones.

    From Intel's quantization_config.json:
    - Attention projections (self_attn or linear_attn) -> 8-bit
    - mlp.gate (MoE router)                            -> 8-bit
    - mlp.shared_expert_gate                           -> 16-bit
      (shape not divisible by 32, so AutoRound skips quantizing it)
    - Everything else (experts, shared_experts)        -> 4-bit default

    Args:
        num_layers: Number of transformer layers. Defaults to NUM_LAYERS.
        self_attn_layers: Iterable of layer indices using standard
            self-attention; every other layer is treated as linear
            attention. Defaults to SELF_ATTN_LAYERS.
        non_expert_bits: Bit-width for the attention/gate exceptions.
            Defaults to NON_EXPERT_BITS.

    Returns:
        dict mapping fully-qualified layer names to {"bits": int} overrides.
    """
    # Backward-compatible defaults: a no-arg call behaves exactly as before.
    if num_layers is None:
        num_layers = NUM_LAYERS
    if self_attn_layers is None:
        self_attn_layers = SELF_ATTN_LAYERS
    if non_expert_bits is None:
        non_expert_bits = NON_EXPERT_BITS
    self_attn_set = set(self_attn_layers)  # O(1) membership test per layer
    layer_config = {}
    for i in range(num_layers):
        prefix = f"model.layers.{i}"
        # Attention projections -> higher precision
        if i in self_attn_set:
            # Standard self-attention
            module, projs = "self_attn", ("q_proj", "k_proj", "v_proj", "o_proj")
        else:
            # Linear attention
            module, projs = "linear_attn", ("in_proj_qkvz", "in_proj_ba", "out_proj")
        for proj in projs:
            layer_config[f"{prefix}.{module}.{proj}"] = {"bits": non_expert_bits}
        # MLP gate (MoE router) -> higher precision
        layer_config[f"{prefix}.mlp.gate"] = {"bits": non_expert_bits}
        # shared_expert_gate -> 16-bit (shape not divisible by 32; AutoRound skips it)
        layer_config[f"{prefix}.mlp.shared_expert_gate"] = {"bits": 16}
    return layer_config
def main():
    """Drive the end-to-end mixed-bits quantization run and report wall time."""
    t0 = time.time()
    # ITERS > 0 selects gradient-based tuning on GPU; 0 would mean RTN.
    mode = "GPU-accelerated tuning" if ITERS > 0 else "RTN (CPU-bound)"
    banner = "=" * 60
    print(banner)
    print(f"AutoRound Mixed-Bits Quantization ({mode})")
    print(banner)
    # Echo the full run configuration up front so logs are self-describing.
    for setting in (
        f"Model: {MODEL_NAME}",
        f"Default bits: {EXPERT_BITS} (experts)",
        f"Exception bits: {NON_EXPERT_BITS} (attention, gate)",
        f"Group size: {GROUP_SIZE}",
        f"Symmetric: {SYM}",
        f"Iterations: {ITERS}",
        f"Learning rate: {LR}",
        f"Low GPU mem: {LOW_GPU_MEM}",
        f"Device map: {DEVICE_MAP}",
        f"Output: {OUTPUT_DIR}",
        f"Format: {FORMAT}",
    ):
        print(setting)
    print(banner)
    # Step 1: build the sparse exception table (everything else stays 4-bit).
    print("\n[1/2] Building sparse layer configuration...")
    layer_config = build_layer_config()
    print(f" Config entries: {len(layer_config)} (vs 74,000+ with dense approach)")
    # Summarize the exception entries by category for sanity checking.
    n_attn = sum(1 for key in layer_config if "attn" in key)
    n_gate = sum(1 for key in layer_config if ".mlp.gate" in key)
    n_shared_gate = sum(1 for key in layer_config if "shared_expert_gate" in key)
    print(f" Attention layers ({NON_EXPERT_BITS}-bit): {n_attn}")
    print(f" MLP gate ({NON_EXPERT_BITS}-bit): {n_gate}")
    print(f" Shared expert gate (16-bit): {n_shared_gate}")
    print(f" Everything else: {EXPERT_BITS}-bit (experts, shared_experts, etc.)")
    # Step 2: quantize. Pass the model NAME (string), not a loaded model —
    # AutoRound then loads it internally with the requested device mapping.
    print(f"\n[2/2] Running AutoRound quantization (iters={ITERS})...")
    if ITERS > 0:
        print(" GPU-accelerated mode: will use GPU for gradient tuning")
    else:
        print(" RTN mode: CPU-bound optimized rounding")
    print(" Note: AutoRound will load the model internally")
    quantizer = AutoRound(
        MODEL_NAME,                     # string, not a model object — Intel's approach
        bits=EXPERT_BITS,               # default bit-width for every layer
        group_size=GROUP_SIZE,
        sym=SYM,
        iters=ITERS,
        lr=LR,                          # learning rate for the tuning phase
        layer_config=layer_config,      # sparse config with exceptions only
        device_map=DEVICE_MAP,          # device_map, not the deprecated `device`
        low_gpu_mem_usage=LOW_GPU_MEM,  # offload intermediates to CPU
        low_cpu_mem_usage=True,         # CPU memory optimization
    )
    quantizer.quantize_and_save(format=FORMAT, output_dir=OUTPUT_DIR)
    # Report wall-clock time, broken down into h/m/s.
    hours, rem = divmod(int(time.time() - t0), 3600)
    minutes, seconds = divmod(rem, 60)
    print(f"\n{banner}")
    print("Quantization complete!")
    print(f" Output: {OUTPUT_DIR}")
    if hours > 0:
        print(f" Time: {hours}h {minutes}m {seconds}s")
    else:
        print(f" Time: {minutes}m {seconds}s")
    print(banner)
# Standard entry-point guard: run the quantization only when executed
# directly, never on import.
if __name__ == "__main__":
    main()