# NOTE: HuggingFace file-viewer page residue removed from the top of this file
# (uploader "raydelossantos", commit message "Add quantized model files",
# commit a87cd2c verified) — it was not valid Python and broke the script.
#!/usr/bin/env python3
"""
AutoRound Mixed-Bits Quantization Script (GPU Accelerated)
==========================================================
Generated by Ansible for: Qwen/Qwen3-Coder-Next
Strategy (Intel's MoE recipe):
- Default: 4-bit for all layers (including experts)
- Exceptions: 8-bit for attention, gate
- shared_expert_gate: 16-bit (shape not divisible by 32)
- lm_head: original precision (excluded by AutoRound)
GPU Acceleration:
- iters=50 enables gradient-based tuning on GPU
- For iters>0, lr is auto-set to 1.0/iters (or use lr=5e-3 for iters=50)
- low_gpu_mem_usage=True offloads intermediates to CPU
Reference: https://huggingface.co/Intel/Qwen3-Next-80B-A3B-Thinking-int4-mixed-AutoRound
"""
import sys
import time
from auto_round import AutoRound
# Configuration (injected by Ansible at template-render time)
MODEL_NAME = "Qwen/Qwen3-Coder-Next"  # HF model id to quantize
OUTPUT_DIR = "/var/lib/inference/models/Qwen3-Coder-Next-int4-mixed-AutoRound"
FORMAT = "auto_round"  # export format passed to quantize_and_save()
EXPERT_BITS = 4  # default bit-width for all layers (covers the MoE experts)
NON_EXPERT_BITS = 8  # higher-precision exceptions: attention projections, router gate
GROUP_SIZE = 128  # quantization group size
SYM = True  # symmetric quantization
ITERS = 50  # >0 enables gradient-based tuning on GPU (0 would mean RTN)
LOW_GPU_MEM = True  # offload tuning intermediates to CPU (see module docstring)
DEVICE_MAP = "0,1,2"  # device indices handed to AutoRound's device_map (presumably GPU ordinals — confirm)
# Learning rate: use 5e-3 for iters=50, None (auto) for iters>=200
LR = 5e-3
# Qwen3-Next has 48 layers with mixed attention types:
# - Layers 3,7,11,15,19,23,27,31,35,39,43,47: self_attn (q_proj, k_proj, v_proj, o_proj)
# - Other layers: linear_attn (in_proj_qkvz, in_proj_ba, out_proj)
# All layers have: mlp.gate, mlp.shared_expert_gate, mlp.shared_experts.*, mlp.experts.*
SELF_ATTN_LAYERS = [3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47]
NUM_LAYERS = 48
def build_layer_config(num_layers=None, self_attn_layers=None, non_expert_bits=None):
    """
    Build a sparse AutoRound layer_config holding only the exceptions to the
    4-bit default.

    Intel's approach: the default bit-width (set on the AutoRound object)
    already covers every layer; only layers needing higher precision are
    listed here. This yields ~240 entries instead of 74,000+ per-expert ones.

    From Intel's quantization_config.json:
    - Attention projections (self_attn or linear_attn) -> 8-bit
    - mlp.gate (MoE router)                            -> 8-bit
    - mlp.shared_expert_gate                           -> 16-bit
      (shape not divisible by 32, so AutoRound skips quantizing it)
    - Everything else (experts, shared_experts)        -> 4-bit default

    Args:
        num_layers: Number of transformer layers. Defaults to NUM_LAYERS.
        self_attn_layers: Iterable of layer indices using standard
            self-attention; every other layer is treated as linear
            attention. Defaults to SELF_ATTN_LAYERS.
        non_expert_bits: Bit-width for the attention/gate exceptions.
            Defaults to NON_EXPERT_BITS.

    Returns:
        dict mapping fully-qualified layer names to {"bits": int} overrides.
    """
    # Backward-compatible defaults: a no-arg call behaves exactly as before.
    if num_layers is None:
        num_layers = NUM_LAYERS
    if self_attn_layers is None:
        self_attn_layers = SELF_ATTN_LAYERS
    if non_expert_bits is None:
        non_expert_bits = NON_EXPERT_BITS
    self_attn_set = set(self_attn_layers)  # O(1) membership test per layer
    layer_config = {}
    for i in range(num_layers):
        prefix = f"model.layers.{i}"
        # Attention projections -> higher precision
        if i in self_attn_set:
            # Standard self-attention
            module, projs = "self_attn", ("q_proj", "k_proj", "v_proj", "o_proj")
        else:
            # Linear attention
            module, projs = "linear_attn", ("in_proj_qkvz", "in_proj_ba", "out_proj")
        for proj in projs:
            layer_config[f"{prefix}.{module}.{proj}"] = {"bits": non_expert_bits}
        # MLP gate (MoE router) -> higher precision
        layer_config[f"{prefix}.mlp.gate"] = {"bits": non_expert_bits}
        # shared_expert_gate -> 16-bit (shape not divisible by 32; AutoRound skips it)
        layer_config[f"{prefix}.mlp.shared_expert_gate"] = {"bits": 16}
    return layer_config
def main():
    """Drive the end-to-end mixed-bits quantization run and report wall time."""
    t0 = time.time()
    # ITERS > 0 selects gradient-based tuning on GPU; 0 would mean RTN.
    mode = "GPU-accelerated tuning" if ITERS > 0 else "RTN (CPU-bound)"
    banner = "=" * 60
    print(banner)
    print(f"AutoRound Mixed-Bits Quantization ({mode})")
    print(banner)
    # Echo the full run configuration up front so logs are self-describing.
    for setting in (
        f"Model: {MODEL_NAME}",
        f"Default bits: {EXPERT_BITS} (experts)",
        f"Exception bits: {NON_EXPERT_BITS} (attention, gate)",
        f"Group size: {GROUP_SIZE}",
        f"Symmetric: {SYM}",
        f"Iterations: {ITERS}",
        f"Learning rate: {LR}",
        f"Low GPU mem: {LOW_GPU_MEM}",
        f"Device map: {DEVICE_MAP}",
        f"Output: {OUTPUT_DIR}",
        f"Format: {FORMAT}",
    ):
        print(setting)
    print(banner)
    # Step 1: build the sparse exception table (everything else stays 4-bit).
    print("\n[1/2] Building sparse layer configuration...")
    layer_config = build_layer_config()
    print(f" Config entries: {len(layer_config)} (vs 74,000+ with dense approach)")
    # Summarize the exception entries by category for sanity checking.
    n_attn = sum(1 for key in layer_config if "attn" in key)
    n_gate = sum(1 for key in layer_config if ".mlp.gate" in key)
    n_shared_gate = sum(1 for key in layer_config if "shared_expert_gate" in key)
    print(f" Attention layers ({NON_EXPERT_BITS}-bit): {n_attn}")
    print(f" MLP gate ({NON_EXPERT_BITS}-bit): {n_gate}")
    print(f" Shared expert gate (16-bit): {n_shared_gate}")
    print(f" Everything else: {EXPERT_BITS}-bit (experts, shared_experts, etc.)")
    # Step 2: quantize. Pass the model NAME (string), not a loaded model —
    # AutoRound then loads it internally with the requested device mapping.
    print(f"\n[2/2] Running AutoRound quantization (iters={ITERS})...")
    if ITERS > 0:
        print(" GPU-accelerated mode: will use GPU for gradient tuning")
    else:
        print(" RTN mode: CPU-bound optimized rounding")
    print(" Note: AutoRound will load the model internally")
    quantizer = AutoRound(
        MODEL_NAME,                     # string, not a model object — Intel's approach
        bits=EXPERT_BITS,               # default bit-width for every layer
        group_size=GROUP_SIZE,
        sym=SYM,
        iters=ITERS,
        lr=LR,                          # learning rate for the tuning phase
        layer_config=layer_config,      # sparse config with exceptions only
        device_map=DEVICE_MAP,          # device_map, not the deprecated `device`
        low_gpu_mem_usage=LOW_GPU_MEM,  # offload intermediates to CPU
        low_cpu_mem_usage=True,         # CPU memory optimization
    )
    quantizer.quantize_and_save(format=FORMAT, output_dir=OUTPUT_DIR)
    # Report wall-clock time, broken down into h/m/s.
    hours, rem = divmod(int(time.time() - t0), 3600)
    minutes, seconds = divmod(rem, 60)
    print(f"\n{banner}")
    print("Quantization complete!")
    print(f" Output: {OUTPUT_DIR}")
    if hours > 0:
        print(f" Time: {hours}h {minutes}m {seconds}s")
    else:
        print(f" Time: {minutes}m {seconds}s")
    print(banner)
# Standard entry-point guard: run the quantization only when executed
# directly, never on import.
if __name__ == "__main__":
    main()