#!/usr/bin/env python3
"""
Quantize Qwen3-Coder-Next (80B MoE) to GPTQ 4-bit using GPTQModel v5.7.0.

Requires:
- GPTQModel >= 5.7.0 (with Qwen3Next expert converter support)
- ~228GB system RAM (160GB model + calibration data)
- 1+ GPUs with >= 32GB VRAM

Quantization strategy:
- MoE experts (gate_proj, up_proj, down_proj): INT4 GPTQ, group_size=32
- Everything else (attention, linear_attn, shared_expert, norms, embeddings): FP16
- Mixed calibration: code (evol-codealpaca) + general text (C4)
- 2048 samples with context length binning for uniform expert coverage
- RTN failsafe for rare experts with insufficient calibration data
"""

import os
import sys
import logging
import random

import torch
from gptqmodel import GPTQModel
from gptqmodel.quantization import QuantizeConfig
from datasets import load_dataset
from transformers import AutoTokenizer

# Configuration
MODEL_ID = "Qwen/Qwen3-Coder-Next"
OUTPUT_DIR = "./Qwen3-Coder-Next-GPTQ-4bit"
NUM_CALIBRATION_SAMPLES = 2048
# Not referenced elsewhere in this script; equals the upper edge of the last bin.
MAX_SEQ_LENGTH = 2048
BITS = 4
GROUP_SIZE = 32

# Context length bins for uniform distribution.
# Each bin is half-open: min_tok <= token_count < max_tok.
TOKEN_BINS = [(256, 512), (512, 1024), (1024, 1536), (1536, 2048)]

# Calibration pool tuning knobs
MIN_TEXT_CHARS = 50          # texts shorter than this (in characters) are dropped
C4_MAX_DOCS = 50000          # number of C4 rows scanned from the shard
CHAIN_SEPARATOR = "\n\n---\n\n"  # joiner between texts in a chained sample
CHAIN_MAX_MISSES = 50        # consecutive oversize picks before a chain gives up
CHAIN_TRIES_PER_SAMPLE = 20  # chain attempts allowed per missing sample

logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger(__name__)


def prepare_calibration_dataset(tokenizer, num_samples, token_bins):
    """
    Prepare mixed calibration dataset with uniform context length distribution.

    Loads code (evol-codealpaca) and general text (C4), bins every text by its
    token count, tops up under-filled bins by chaining several pool texts
    together, and finally draws an (approximately) equal number of samples
    from each bin.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            that returns a sized sequence of token ids.
        num_samples: Total number of calibration samples requested.
        token_bins: Sequence of ``(min_tokens, max_tokens)`` pairs; a text
            belongs to the first bin with ``min_tokens <= count < max_tokens``.

    Returns:
        A shuffled list of text strings. It holds ``num_samples`` items unless
        a bin stayed empty even after chaining, in which case that bin is
        skipped (with a warning) and the list is shorter.

    Raises:
        ValueError: If no text from either dataset passes the minimum-length
            filter, so there is nothing to bin or chain.
    """
    logger.info(f"Target: {num_samples} samples across {len(token_bins)} bins")

    binned_samples = {i: [] for i in range(len(token_bins))}

    # Load datasets
    logger.info("Loading code dataset: theblackcat102/evol-codealpaca-v1")
    code_dataset = load_dataset("theblackcat102/evol-codealpaca-v1", split="train")

    logger.info("Loading general text dataset: allenai/c4")
    c4_dataset = load_dataset(
        "allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split="train"
    )

    def format_code_sample(sample):
        """Render one code-dataset row as an instruction/response prompt."""
        instruction = sample.get("instruction", "")
        output = sample.get("output", "")
        return f"### Instruction:\n{instruction}\n\n### Response:\n{output}"

    def count_tokens(text):
        """Return the number of tokens in *text*, excluding special tokens."""
        return len(tokenizer.encode(text, add_special_tokens=False))

    # Pre-tokenize all samples into (text, token_count) pairs
    all_samples = []
    for sample in code_dataset:
        text = format_code_sample(sample)
        if len(text) < MIN_TEXT_CHARS:
            continue
        all_samples.append((text, count_tokens(text)))

    for idx, sample in enumerate(c4_dataset):
        if idx >= C4_MAX_DOCS:
            break
        text = sample.get("text", "")
        if len(text) < MIN_TEXT_CHARS:
            continue
        all_samples.append((text, count_tokens(text)))

    logger.info(f"Total samples pool: {len(all_samples)}")

    # Without a pool, chaining would fail inside random.choice with an opaque
    # IndexError; fail early with an actionable message instead.
    if not all_samples:
        raise ValueError(
            "Calibration pool is empty: no text from the code or C4 datasets "
            f"is at least {MIN_TEXT_CHARS} characters long"
        )

    # Bin samples by token length (texts outside every bin are left unbinned
    # but remain available as chaining material)
    for text, token_count in all_samples:
        for bin_idx, (min_tok, max_tok) in enumerate(token_bins):
            if min_tok <= token_count < max_tok:
                binned_samples[bin_idx].append(text)
                break

    # The joiner adds tokens of its own; count them so a chained sample is
    # not silently longer than the bin it is filed under.
    separator_tokens = count_tokens(CHAIN_SEPARATOR)

    def create_chained_sample(min_tokens, max_tokens):
        """
        Join random pool texts until the estimated length is in the bin.

        The length is the sum of per-text token counts plus the separator
        cost; tokenizing the joined string may differ slightly from this
        estimate (NOTE(review): depends on tokenizer boundary merges).

        Returns the joined text, or None when no fitting chain was found.
        """
        chained_texts, current_tokens, misses = [], 0, 0
        while current_tokens < min_tokens and misses < CHAIN_MAX_MISSES:
            text, tok_count = random.choice(all_samples)
            cost = tok_count + (separator_tokens if chained_texts else 0)
            # Bins are half-open, so landing exactly on max_tokens is also a
            # miss; accepting it would only get the whole chain discarded.
            if current_tokens + cost >= max_tokens:
                misses += 1
                continue
            chained_texts.append(text)
            current_tokens += cost
            misses = 0  # only *consecutive* misses count towards giving up

        if min_tokens <= current_tokens < max_tokens:
            return CHAIN_SEPARATOR.join(chained_texts)
        return None

    # Chain short samples for sparse long-context bins
    samples_per_bin = num_samples // len(token_bins)
    for bin_idx, (min_tok, max_tok) in enumerate(token_bins):
        # "+ 1" covers the bins that receive one extra sample from the remainder.
        needed = samples_per_bin + 1 - len(binned_samples[bin_idx])
        if needed <= 0:
            continue
        for _ in range(needed * CHAIN_TRIES_PER_SAMPLE):
            chained = create_chained_sample(min_tok, max_tok)
            if chained:
                binned_samples[bin_idx].append(chained)
                needed -= 1
                if needed <= 0:
                    break

    for bin_idx, (min_tok, max_tok) in enumerate(token_bins):
        logger.info(f"  Bin {bin_idx} ({min_tok}-{max_tok} tokens): {len(binned_samples[bin_idx])} samples")

    # Sample uniformly from each bin; the first `remainder` bins get one extra
    remainder = num_samples % len(token_bins)
    final_samples = []
    for bin_idx, (min_tok, max_tok) in enumerate(token_bins):
        target = samples_per_bin + (1 if bin_idx < remainder else 0)
        bin_data = binned_samples[bin_idx]

        if not bin_data:
            # random.choices raises IndexError on an empty population, so an
            # unfillable bin is skipped rather than aborting the whole run.
            if target > 0:
                logger.warning(
                    f"  Bin {bin_idx} ({min_tok}-{max_tok} tokens) is empty; "
                    f"skipping {target} samples"
                )
            continue

        if len(bin_data) < target:
            shortfall = target - len(bin_data)
            logger.warning(
                f"  Bin {bin_idx} ({min_tok}-{max_tok} tokens) has only "
                f"{len(bin_data)} samples; padding with {shortfall} duplicates"
            )
            # Take everything, then pad by drawing with replacement.
            final_samples.extend(bin_data)
            final_samples.extend(random.choices(bin_data, k=shortfall))
        else:
            final_samples.extend(random.sample(bin_data, target))

    random.shuffle(final_samples)
    logger.info(f"Final calibration dataset: {len(final_samples)} samples")

    return final_samples


# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)

# Prepare calibration data
calibration_data = prepare_calibration_dataset(tokenizer, NUM_CALIBRATION_SAMPLES, TOKEN_BINS)

# Dynamic exclusions: keep these at full precision (FP16).
# NOTE(review): the "-:" prefix is taken to mean "skip modules matching this
# regex" per GPTQModel's dynamic-config convention - confirm against the
# installed GPTQModel version.
dynamic_exclusions = {
    "-:.*linear_attn\\.in_proj_qkvz": {},
    "-:.*linear_attn\\.out_proj": {},
    "-:.*shared_expert\\.gate_proj": {},
    "-:.*shared_expert\\.up_proj": {},
    "-:.*shared_expert\\.down_proj": {},
}

# Create quantization config
quant_config = QuantizeConfig(
    bits=BITS,
    group_size=GROUP_SIZE,
    sym=True,
    desc_act=False,
    true_sequential=True,
    offload_to_disk=False,
    lm_head=False,
    dynamic=dynamic_exclusions,
)

# Load and quantize
logger.info(f"Loading {MODEL_ID}...")
model = GPTQModel.load(MODEL_ID, quant_config, trust_remote_code=True)

logger.info("Starting quantization...")
model.quantize(calibration_data, batch_size=1)

logger.info(f"Saving to {OUTPUT_DIR}...")
model.save(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

logger.info("Done!")