"""
Quantize Qwen3-Coder-Next (80B MoE) to GPTQ 4-bit using GPTQModel v5.7.0.

Requires:
- GPTQModel >= 5.7.0 (with Qwen3Next expert converter support)
- ~228GB system RAM (160GB model + calibration data)
- 1+ GPUs with >= 32GB VRAM

Quantization strategy:
- MoE experts (gate_proj, up_proj, down_proj): INT4 GPTQ, group_size=32
- Everything else (attention, linear_attn, shared_expert, norms, embeddings): FP16
- Mixed calibration: code (evol-codealpaca) + general text (C4)
- 2048 samples with context length binning for uniform expert coverage
- RTN failsafe for rare experts with insufficient calibration data
"""
| |
|
| | import os |
| | import sys |
| | import logging |
| | import random |
| |
|
| | import torch |
| | from gptqmodel import GPTQModel |
| | from gptqmodel.quantization import QuantizeConfig |
| | from datasets import load_dataset |
| | from transformers import AutoTokenizer |
| |
|
| | |
# --- Quantization configuration ----------------------------------------------

MODEL_ID = "Qwen/Qwen3-Coder-Next"           # HF hub id of the source model
OUTPUT_DIR = "./Qwen3-Coder-Next-GPTQ-4bit"  # destination for the quantized checkpoint
NUM_CALIBRATION_SAMPLES = 2048               # total calibration texts (see module docstring)
MAX_SEQ_LENGTH = 2048                        # NOTE(review): not referenced in this file — confirm intended use
BITS = 4                                     # GPTQ weight bit-width
GROUP_SIZE = 32                              # weights per quantization group (scale/zero shared)

# Half-open [min_tokens, max_tokens) bins used to balance calibration
# context lengths so experts see both short and long contexts.
TOKEN_BINS = [(256, 512), (512, 1024), (1024, 1536), (1536, 2048)]

logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger(__name__)
| |
|
| |
|
def prepare_calibration_dataset(tokenizer, num_samples, token_bins):
    """
    Build a mixed calibration dataset with a roughly uniform distribution
    of context lengths across ``token_bins``.

    Sources:
      - code: theblackcat102/evol-codealpaca-v1 (instruction/response pairs)
      - general text: allenai/c4 (one shard, capped at 50k rows)

    Samples are binned by token count; under-populated bins are topped up
    by chaining shorter samples together until the chained length lands
    inside the bin.

    Args:
        tokenizer: HF tokenizer used to measure sample lengths.
        num_samples: total number of calibration texts to return.
        token_bins: list of (min_tokens, max_tokens) half-open intervals.

    Returns:
        list[str]: shuffled calibration texts. Length equals ``num_samples``
        unless a bin could not be populated at all, in which case that bin's
        quota is skipped with a warning (instead of crashing).
    """
    logger.info(f"Target: {num_samples} samples across {len(token_bins)} bins")

    binned_samples = {i: [] for i in range(len(token_bins))}

    logger.info("Loading code dataset: theblackcat102/evol-codealpaca-v1")
    code_dataset = load_dataset("theblackcat102/evol-codealpaca-v1", split="train")

    logger.info("Loading general text dataset: allenai/c4")
    c4_dataset = load_dataset(
        "allenai/c4",
        data_files="en/c4-train.00001-of-01024.json.gz",
        split="train"
    )

    def format_code_sample(sample):
        # Render an instruction/response pair in Alpaca-style prompt format.
        instruction = sample.get("instruction", "")
        output = sample.get("output", "")
        return f"### Instruction:\n{instruction}\n\n### Response:\n{output}"

    all_samples = []

    for sample in code_dataset:
        text = format_code_sample(sample)
        if len(text) < 50:  # drop near-empty pairs
            continue
        token_count = len(tokenizer.encode(text, add_special_tokens=False))
        all_samples.append((text, token_count))

    # Cap C4 at 50k rows to bound tokenization time and memory.
    for idx, sample in enumerate(c4_dataset):
        if idx >= 50000:
            break
        text = sample.get("text", "")
        if len(text) < 50:
            continue
        token_count = len(tokenizer.encode(text, add_special_tokens=False))
        all_samples.append((text, token_count))

    logger.info(f"Total samples pool: {len(all_samples)}")

    # First pass: assign each sample to the bin its token count falls into.
    # Samples outside every bin stay in all_samples as chaining material.
    for text, token_count in all_samples:
        for bin_idx, (min_tok, max_tok) in enumerate(token_bins):
            if min_tok <= token_count < max_tok:
                binned_samples[bin_idx].append(text)
                break

    random.shuffle(all_samples)

    def create_chained_sample(min_tokens, max_tokens):
        # Concatenate random samples until the running total lands inside
        # [min_tokens, max_tokens); give up after 50 consecutive failed
        # picks. NOTE: the "---" separator's own tokens are not counted,
        # so a chained sample may slightly exceed max_tokens.
        chained_texts, current_tokens, attempts = [], 0, 0
        while current_tokens < min_tokens and attempts < 50:
            text, tok_count = random.choice(all_samples)
            if current_tokens + tok_count > max_tokens:
                attempts += 1
                continue
            chained_texts.append(text)
            current_tokens += tok_count
            attempts = 0
        if min_tokens <= current_tokens < max_tokens:
            return "\n\n---\n\n".join(chained_texts)
        return None

    # Second pass: top up bins with fewer natural samples than the quota
    # (+1 slack to cover the remainder distribution below). Retries are
    # bounded because chaining can fail.
    samples_per_bin = num_samples // len(token_bins)
    for bin_idx, (min_tok, max_tok) in enumerate(token_bins):
        needed = samples_per_bin + 1 - len(binned_samples[bin_idx])
        if needed > 0:
            for _ in range(needed * 20):
                chained = create_chained_sample(min_tok, max_tok)
                if chained:
                    binned_samples[bin_idx].append(chained)
                    needed -= 1
                    if needed <= 0:
                        break

    for bin_idx, (min_tok, max_tok) in enumerate(token_bins):
        logger.info(f" Bin {bin_idx} ({min_tok}-{max_tok} tokens): {len(binned_samples[bin_idx])} samples")

    # Final selection: draw `target` samples per bin, spreading the integer
    # remainder over the first bins. Oversample with replacement when a bin
    # is short of its target.
    remainder = num_samples % len(token_bins)
    final_samples = []
    for bin_idx in range(len(token_bins)):
        target = samples_per_bin + (1 if bin_idx < remainder else 0)
        bin_data = binned_samples[bin_idx]
        if not bin_data:
            # FIX: random.choices([]) raises IndexError on an empty
            # population — skip an unfillable bin instead of crashing.
            # The final dataset will simply fall short by `target`.
            logger.warning(f"Bin {bin_idx} is empty; skipping its {target}-sample quota")
            continue
        if len(bin_data) < target:
            final_samples.extend(bin_data)
            final_samples.extend(random.choices(bin_data, k=target - len(bin_data)))
        else:
            final_samples.extend(random.sample(bin_data, target))

    random.shuffle(final_samples)
    logger.info(f"Final calibration dataset: {len(final_samples)} samples")
    return final_samples
| |
|
| |
|
| | |
# --- Script entry: tokenize, calibrate, quantize, save ------------------------

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)

# Build the mixed code/text calibration set (see prepare_calibration_dataset).
calibration_data = prepare_calibration_dataset(tokenizer, NUM_CALIBRATION_SAMPLES, TOKEN_BINS)

# Per-module overrides for GPTQModel's `dynamic` config. The "-:" prefix
# marks modules to EXCLUDE from quantization (kept at full precision):
# linear-attention projections and the shared expert, matching the
# strategy described in the module docstring.
dynamic_exclusions = {
    "-:.*linear_attn\\.in_proj_qkvz": {},
    "-:.*linear_attn\\.out_proj": {},
    "-:.*shared_expert\\.gate_proj": {},
    "-:.*shared_expert\\.up_proj": {},
    "-:.*shared_expert\\.down_proj": {},
}

quant_config = QuantizeConfig(
    bits=BITS,                  # INT4 weights
    group_size=GROUP_SIZE,      # 32 weights share one scale/zero-point
    sym=True,                   # symmetric quantization
    desc_act=False,             # no activation-order reordering
    true_sequential=True,       # quantize layers one after another
    offload_to_disk=False,      # keep model in RAM (~228GB needed, see docstring)
    lm_head=False,              # leave the output head unquantized
    dynamic=dynamic_exclusions,
)

logger.info(f"Loading {MODEL_ID}...")
model = GPTQModel.load(MODEL_ID, quant_config, trust_remote_code=True)

logger.info("Starting quantization...")
# batch_size=1 keeps peak memory down during per-layer calibration.
model.quantize(calibration_data, batch_size=1)

logger.info(f"Saving to {OUTPUT_DIR}...")
model.save(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

logger.info("Done!")
| |
|