Recommended configuration for quantizing to AWQ? Seems hard.

#2
by chrisoutwright - opened

Any guides on how to quantize this with AWQ?

I tried:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
MoE-safe AWQ quantization script (llm-compressor) tuned for Qwen3 MoE style models.

Key principles:
- DO NOT quantize routing / MoE router gate layers (model.layers.*.mlp.gate.weight)
- DO NOT quantize norms
- Keep embeddings + lm_head in FP16/BF16
- Quantize experts (gate_proj/up_proj/down_proj) together (required for vLLM fused MoE)
- Optionally quantize attention projections
- Use enough calibration samples (>= 512 recommended)
- Avoid LLaMA-style smoothing mappings across router paths (mappings = [])

Outputs a HuggingFace-compatible folder with compressed-tensors weights.
"""

import os
import sys
import subprocess
import gc
import time
import traceback
from pathlib import Path
from datetime import datetime, timezone

# =====================================================
# ENV
# =====================================================
os.environ["PYTHONUNBUFFERED"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# If you REALLY want to prevent persistent HF caching inside a container,
# point the cache env vars at a throwaway temp directory.
# BUGFIX: the previous value "/dev/null" is a character device, NOT a
# directory -- huggingface_hub and datasets call os.makedirs() on these
# paths, which fails (or silently misbehaves) when the path is /dev/null.
# (Note: caching into a tempdir can still slow repeated runs.)
import tempfile

_HF_SCRATCH = tempfile.mkdtemp(prefix="hf_scratch_")
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["HF_DATASETS_CACHE"] = os.path.join(_HF_SCRATCH, "datasets")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(_HF_SCRATCH, "transformers")
os.environ["HF_HOME"] = os.path.join(_HF_SCRATCH, "hf_home")
os.environ["XDG_CACHE_HOME"] = os.path.join(_HF_SCRATCH, "xdg")

# =====================================================
# Shell helpers
# =====================================================
def run_cmd(cmd: list[str]) -> str:
    """Run *cmd* and return its combined stdout+stderr as text.

    Never raises: any failure (missing binary, non-zero exit, ...) yields a
    descriptive placeholder string instead, so diagnostics stay best-effort.
    """
    try:
        output = subprocess.check_output(cmd, encoding="utf-8", stderr=subprocess.STDOUT)
    except Exception as exc:
        return f"<command failed: {' '.join(cmd)}>\n{exc}\n"
    return output

def read_text_file(path: str) -> str:
    """Return the whitespace-stripped contents of *path*, or "" on any error.

    Undecodable bytes are replaced rather than raising, since callers only
    use this for best-effort diagnostics (cgroup files etc.).
    """
    try:
        contents = Path(path).read_text(encoding="utf-8", errors="replace")
    except Exception:
        return ""
    return contents.strip()

def docker_memory_info() -> str:
    """Summarize cgroup memory limit/usage files (v2 and v1) for crash reports."""
    candidates = (
        "/sys/fs/cgroup/memory.max",                     # cgroup v2 limit
        "/sys/fs/cgroup/memory.current",                 # cgroup v2 usage
        "/sys/fs/cgroup/memory/memory.limit_in_bytes",   # cgroup v1 limit
        "/sys/fs/cgroup/memory/memory.usage_in_bytes",   # cgroup v1 usage
    )
    found = [f"{p}: {read_text_file(p)}" for p in candidates if os.path.exists(p)]
    if not found:
        return "No cgroup memory files found."
    return "\n".join(found)

# =====================================================
# GPU selection (prefer 4090)
# =====================================================
def select_best_gpu_prefer_4090() -> None:
    """Pin CUDA_VISIBLE_DEVICES to an RTX 4090 if one is present, otherwise
    to the GPU with the most memory. Silently returns (after printing the
    error) when nvidia-smi is unavailable or reports nothing usable."""
    smi = run_cmd(["nvidia-smi", "--query-gpu=index,name,memory.total", "--format=csv,noheader"])
    if smi.startswith("<command failed"):
        print(smi, file=sys.stderr)
        return

    available: list[tuple[int, str, float]] = []
    for row in smi.strip().splitlines():
        fields = [f.strip() for f in row.split(",")]
        if len(fields) != 3:
            continue  # skip malformed rows defensively
        index_s, gpu_name, mem_s = fields
        total_gb = float(mem_s.replace("MiB", "")) / 1024.0
        available.append((int(index_s), gpu_name, total_gb))

    if not available:
        return

    def _pin(index: int, banner: str) -> None:
        # PCI_BUS_ID keeps nvidia-smi indices and CUDA indices consistent.
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(index)
        print(banner)

    for index, gpu_name, total_gb in available:
        if "4090" in gpu_name:
            _pin(index, f"βœ“ Selected RTX 4090 (GPU {index}, {total_gb:.1f} GB)")
            return

    index, gpu_name, total_gb = max(available, key=lambda g: g[2])
    _pin(index, f"βœ“ Using best GPU: {gpu_name} (GPU {index}, {total_gb:.1f} GB)")

select_best_gpu_prefer_4090()

# =====================================================
# Imports (after CUDA_VISIBLE_DEVICES)
# =====================================================
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from accelerate import init_empty_weights, infer_auto_device_map

from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.entrypoints.oneshot import oneshot

# =====================================================
# Paths
# =====================================================
MODEL_DIR = "/workspace/models/input"
OUT_BASE = "/workspace/models/output"
PROMPTS_FILE = "/workspace/calib/calib_128.txt"

OUT_DIR = os.path.join(OUT_BASE, "final_awq_moe_safe")
CRASH_DIR = os.path.join(OUT_BASE, "crash_reports")

# Create the output and crash-report directories up front so later writes
# never fail on a missing parent.
for _required_dir in (OUT_DIR, CRASH_DIR):
    Path(_required_dir).mkdir(parents=True, exist_ok=True)

# =====================================================
# Config
# =====================================================
# Calibration
MAX_SEQ_LEN = 2048       # max tokens per calibration sample (passed to oneshot below)
CALIB_SAMPLES = 512      # calibration sample count (>= 512 recommended, see module docstring)

# Quant scheme
NUM_BITS = 4             # 4-bit integer weight quantization
GROUP_SIZE = 128         # quantization group size (per-group scales)
SYMMETRIC = True         # symmetric int quantization

# What to quantize:
# - "experts_only": safest for MoE
# - "experts_plus_attention": adds attention projections
# Overridable via the QUANT_MODE environment variable; normalized to lowercase.
QUANT_MODE = os.environ.get("QUANT_MODE", "experts_only").strip().lower()

# Saving
SAVE_COMPRESSED_FINAL = True   # save weights in compressed form (save_compressed=...)
SAVE_BACKUP_COPY = True        # also write a duplicate "<OUT_DIR>_backup" folder

# Memory / device-map controls
# NOTE(review): confirm these ceilings against the actual GPU VRAM and host
# RAM available -- 28GiB exceeds a 24GB RTX 4090's capacity.
GPU_MAX_MEM = "28GiB"
CPU_MAX_MEM = "160GiB"

# =====================================================
# Logging / diagnostics
# =====================================================
def die(msg: str) -> None:
    """Print *msg* to stderr with a ❌ prefix and terminate with exit code 1."""
    sys.stderr.write(f"❌ {msg}\n")
    sys.exit(1)

def log_gpu(tag: str) -> None:
    """Print current and peak CUDA memory usage (in GB) labelled with *tag*.

    Prints a placeholder message and returns when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        print(f"[{tag}] CUDA not available")
        return
    gib = 1024 ** 3
    current_gb = torch.cuda.memory_allocated() / gib
    peak_gb = torch.cuda.max_memory_allocated() / gib
    print(f"[{tag}] GPU mem: {current_gb:.2f} GB | peak: {peak_gb:.2f} GB")

def write_crash_report(title: str, err: BaseException) -> str:
    """Write exception details plus host/GPU memory diagnostics to a
    timestamped log file under CRASH_DIR and return the file's path."""
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    path = os.path.join(CRASH_DIR, f"crash_{ts}.log")

    sections = [
        "=" * 120, f"{title} @ {ts} UTC", "=" * 120,
        "\n--- Exception ---\n", repr(err),
        "\n--- Traceback ---\n", traceback.format_exc(),
        "\n--- nvidia-smi ---\n", run_cmd(["nvidia-smi"]),
        "\n--- torch.cuda.memory_summary ---\n",
    ]

    # memory_summary itself can raise (e.g. on a broken CUDA context) --
    # crash reporting must never crash.
    try:
        if torch.cuda.is_available():
            sections.append(torch.cuda.memory_summary(device=0, abbreviated=False))
        else:
            sections.append("CUDA not available.")
    except Exception as inner:
        sections.append(f"<torch.cuda.memory_summary failed>\n{inner}\n")

    sections += [
        "\n--- free -h ---\n", run_cmd(["bash", "-lc", "free -h || true"]),
        "\n--- ulimit -a ---\n", run_cmd(["bash", "-lc", "ulimit -a || true"]),
        "\n--- Docker / cgroup memory ---\n", docker_memory_info(),
    ]

    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(sections) + "\n")
    return path

# =====================================================
# Sanity
# =====================================================
# Fail fast (exit 1 via die()) before any heavy model loading begins.
if not os.path.exists(MODEL_DIR):
    die(f"MODEL_DIR not found: {MODEL_DIR}")
if not os.path.exists(PROMPTS_FILE):
    die(f"Missing calibration file: {PROMPTS_FILE}")

if not torch.cuda.is_available():
    die("CUDA not available")
if torch.cuda.device_count() < 1:
    die(f"Expected at least one visible GPU, got {torch.cuda.device_count()}")

# Device 0 here is whichever GPU was pinned earlier via CUDA_VISIBLE_DEVICES.
props = torch.cuda.get_device_properties(0)
print(f"βœ“ Using GPU: {props.name} | {props.total_memory/1024**3:.1f} GB")
print(f"βœ“ QUANT_MODE = {QUANT_MODE}")
print(f"βœ“ bits={NUM_BITS} group_size={GROUP_SIZE} symmetric={SYMMETRIC}")
print(f"βœ“ calib_samples={CALIB_SAMPLES} max_seq_len={MAX_SEQ_LEN}")

# =====================================================
# Calibration prompts (RS-delimited)
# =====================================================
SEP = "\x1e"  # ASCII Record Separator

# One prompt per RS-delimited record; blank/whitespace-only records dropped.
raw = Path(PROMPTS_FILE).read_text(encoding="utf-8")

items = [record.strip() for record in raw.split(SEP) if record.strip()]
if not items:
    die("No calibration items found (RS-delimited file empty?)")

# Cap at CALIB_SAMPLES and wrap in a HF Dataset with a single "text" column.
calib_data = items[:CALIB_SAMPLES]
calib_ds = Dataset.from_dict({"text": calib_data})

# =====================================================
# Load model/tokenizer
# =====================================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
print("Loading model with CPU offload-friendly device_map...")
# NOTE(review): gc stays disabled if from_pretrained raises (no try/finally
# here) -- confirm that is acceptable for your error paths.
gc.disable()

# Build an empty (no weights materialized) model first, only to plan the
# device map against the architecture.
config = AutoConfig.from_pretrained(MODEL_DIR, trust_remote_code=True)
with init_empty_weights():
    tmp_model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

# Split modules across GPU 0 and CPU within the given memory ceilings,
# never splitting a single MoE block across devices.
# NOTE(review): "Qwen3MoeBlock" must match the actual module class name in
# this checkpoint's modeling code -- verify, or the no-split hint is ignored.
device_map = infer_auto_device_map(
    tmp_model,
    max_memory={0: GPU_MAX_MEM, "cpu": CPU_MAX_MEM},
    no_split_module_classes=["Qwen3MoeBlock"],
)

# The planning model is no longer needed; free it before the real load.
del tmp_model
gc.collect()

# Real weight load in fp16 using the planned map; switch to eval mode.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    torch_dtype=torch.float16,
    device_map=device_map,
    trust_remote_code=True,
).eval()

gc.enable()

# =====================================================
# AWQ targets / ignore patterns (MoE-safe for THIS Qwen3 MoE naming)
# =====================================================

# Expert projections MUST be quantized together (gate_proj/up_proj/down_proj).
# Scope them to mlp.experts.<id>.* to avoid catching router gate or other "gate*" stuff.
EXPERT_GATE = r"re:.*mlp\.experts\.\d+\.gate_proj$"
EXPERT_UP   = r"re:.*mlp\.experts\.\d+\.up_proj$"
EXPERT_DOWN = r"re:.*mlp\.experts\.\d+\.down_proj$"

# Attention projections (optional) – scope to self_attn to avoid accidental matches.
ATTN_Q = r"re:.*self_attn\.q_proj$"
ATTN_K = r"re:.*self_attn\.k_proj$"
ATTN_V = r"re:.*self_attn\.v_proj$"
ATTN_O = r"re:.*self_attn\.o_proj$"

# Dispatch table: mode name -> list of target patterns.
_EXPERT_TARGETS = [EXPERT_GATE, EXPERT_UP, EXPERT_DOWN]
_MODE_TO_TARGETS = {
    "experts_only": _EXPERT_TARGETS,
    "experts_plus_attention": [ATTN_Q, ATTN_K, ATTN_V, ATTN_O] + _EXPERT_TARGETS,
}

if QUANT_MODE not in _MODE_TO_TARGETS:
    die("Invalid QUANT_MODE. Use 'experts_only' or 'experts_plus_attention'.")
targets = _MODE_TO_TARGETS[QUANT_MODE]

# Ignore router gate + norms + embeddings/head + (optional) shared experts.
# Built in named groups, then concatenated, so each category is auditable.

# Output head and embeddings stay FP16/BF16.
_IGNORE_EMBED = [
    "lm_head",
    "model.embed_tokens",
    "re:.*embed_tokens.*",
]

# Norms (never quantize).
_IGNORE_NORMS = [
    "re:.*norm.*",
    "re:.*RMSNorm.*",
    "re:.*layernorm.*",
    "re:.*input_layernorm$",
    "re:.*post_attention_layernorm$",
]

# Router gate (MoE routing) – KEEP FP16.
# The checkpoint exposes it as: model.layers.X.mlp.gate.weight (shape [128, 2048]).
_IGNORE_ROUTER_GATE = ["re:.*mlp\\.gate\\.weight$"]

# Some models have q_norm/k_norm modules.
_IGNORE_QK_NORMS = [
    "re:.*self_attn\\.q_norm.*",
    "re:.*self_attn\\.k_norm.*",
    "re:.*q_norm.*",
    "re:.*k_norm.*",
]

# Shared experts (if present; safe to keep fp16).
_IGNORE_SHARED = [
    "re:.*shared_expert.*",
    "re:.*shared_experts.*",
]

# Other router-ish naming (defensive; should not match expert gate_proj now anyway).
_IGNORE_ROUTERISH = [
    "re:.*router.*",
    "re:.*routing.*",
    "re:.*moe.*gate.*",
]

ignore = (
    _IGNORE_EMBED
    + _IGNORE_NORMS
    + _IGNORE_ROUTER_GATE
    + _IGNORE_QK_NORMS
    + _IGNORE_SHARED
    + _IGNORE_ROUTERISH
)

# Single AWQ pass: quantize only `targets`; everything in `ignore` stays fp16.
recipe = [
    AWQModifier(
        ignore=ignore,
        mappings=[],  # CRITICAL: prevent auto-inference / smoothing for MoE
        config_groups={
            "group_0": {
                "targets": targets,
                "weights": {
                    "num_bits": NUM_BITS,
                    "type": "int",            # integer weight quantization
                    "symmetric": SYMMETRIC,
                    "group_size": GROUP_SIZE,
                    "strategy": "group",      # grouped scales (uses group_size)
                    "dynamic": False,         # static, calibration-based scales
                },
                # Weights-only quantization: activations left untouched.
                "input_activations": None,
                "output_activations": None,
                "format": None,
            }
        },
    )
]

# =====================================================
# Run AWQ
# =====================================================
# Reset CUDA memory accounting so the before/after logs are meaningful.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
log_gpu("before_awq")

try:
    start = time.time()
    # One-shot (training-free) compression with the AWQ recipe above.
    model = oneshot(
        model=model,
        tokenizer=tokenizer,
        dataset=calib_ds,
        recipe=recipe,
        num_calibration_samples=len(calib_data),
        text_column="text",
        max_seq_length=MAX_SEQ_LEN,
    )
    print(f"βœ“ AWQ finished in {time.time() - start:.1f}s")
    log_gpu("after_awq")
except torch.cuda.OutOfMemoryError as e:
    # Exit 137 mirrors the conventional OOM-kill code (128 + SIGKILL).
    report_path = write_crash_report("CUDA OOM during AWQ", e)
    print(f"🧾 Crash report written to: {report_path}", file=sys.stderr)
    sys.exit(137)
except Exception as e:
    report_path = write_crash_report("Unexpected error during AWQ", e)
    print(f"🧾 Crash report written to: {report_path}", file=sys.stderr)
    sys.exit(1)

# =====================================================
# Save final
# =====================================================
def _save_quantized(dest: str) -> None:
    """Write the quantized model plus tokenizer to *dest* as safetensors."""
    model.save_pretrained(dest, safe_serialization=True, save_compressed=SAVE_COMPRESSED_FINAL)
    tokenizer.save_pretrained(dest)

print(f"Saving final AWQ model β†’ {OUT_DIR}")
_save_quantized(OUT_DIR)
print("βœ“ Final model saved.")

if SAVE_BACKUP_COPY:
    backup_dir = OUT_DIR + "_backup"
    Path(backup_dir).mkdir(parents=True, exist_ok=True)
    print(f"Saving backup copy β†’ {backup_dir}")
    _save_quantized(backup_dir)
    print("βœ“ Backup saved.")

print("\nβœ“ DONE")

But this gives me rather rambling results in vLLM (I used the Tesslate/Rust_Dataset for calibration: 512 items, 2048 seq len).
First it starts well (explaining a Jinja template):
image

and then goes off the rails FAST:

image

Any tips to get it to AWQ?

Sign up or log in to comment