Recommendation configuration for quantizing to AWQ? Seems hard.
#2
by
chrisoutwright - opened
Any guides on how to quantize this with AWQ?:
I tried:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
MoE-safe AWQ quantization script (llm-compressor) tuned for Qwen3 MoE style models.
Key principles:
- DO NOT quantize routing / MoE router gate layers (model.layers.*.mlp.gate.weight)
- DO NOT quantize norms
- Keep embeddings + lm_head in FP16/BF16
- Quantize experts (gate_proj/up_proj/down_proj) together (required for vLLM fused MoE)
- Optionally quantize attention projections
- Use enough calibration samples (>= 512 recommended)
- Avoid LLaMA-style smoothing mappings across router paths (mappings = [])
Outputs a HuggingFace-compatible folder with compressed-tensors weights.
"""
import os
import sys
import subprocess
import gc
import time
import traceback
from pathlib import Path
from datetime import datetime, timezone
# =====================================================
# ENV
# =====================================================
import tempfile  # needed for the scratch cache directory below

os.environ["PYTHONUNBUFFERED"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# If you REALLY want to prevent HF caching inside a container, keep these.
# (But note: it can slow repeated runs.)
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
# BUG FIX: /dev/null is a character device, not a directory. huggingface_hub,
# datasets and transformers call os.makedirs() on these paths and crash (or
# fail in confusing ways) when they point at /dev/null. A throwaway temp
# directory gives the same "no persistent cache" effect while staying usable.
_scratch_cache = tempfile.mkdtemp(prefix="hf_scratch_cache_")
os.environ["HF_DATASETS_CACHE"] = _scratch_cache
os.environ["TRANSFORMERS_CACHE"] = _scratch_cache
os.environ["HF_HOME"] = _scratch_cache
os.environ["XDG_CACHE_HOME"] = _scratch_cache
def run_cmd(cmd: list[str]) -> str:
    """Run *cmd* and return its combined stdout+stderr as text.

    Never raises: any failure (missing binary, non-zero exit, ...) yields a
    human-readable placeholder string, so callers can embed the result in
    diagnostic reports verbatim.
    """
    try:
        completed = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            encoding="utf-8",
            check=True,
        )
    except Exception as exc:
        return f"<command failed: {' '.join(cmd)}>\n{exc}\n"
    return completed.stdout
def read_text_file(path: str) -> str:
    """Best-effort read of a small text file.

    Returns the stripped file contents, or "" when the file is missing or
    unreadable (undecodable bytes are replaced rather than raising).
    """
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as handle:
            contents = handle.read()
    except Exception:
        return ""
    return contents.strip()
def docker_memory_info() -> str:
    """Report container memory limits/usage from cgroup v2 and v1 files.

    Only files that actually exist on this host are included; returns a
    sentinel string when none are present (e.g. outside a container).
    """
    candidates = (
        "/sys/fs/cgroup/memory.max",                    # cgroup v2 limit
        "/sys/fs/cgroup/memory.current",                # cgroup v2 usage
        "/sys/fs/cgroup/memory/memory.limit_in_bytes",  # cgroup v1 limit
        "/sys/fs/cgroup/memory/memory.usage_in_bytes",  # cgroup v1 usage
    )
    found = [f"{p}: {read_text_file(p)}" for p in candidates if os.path.exists(p)]
    if not found:
        return "No cgroup memory files found."
    return "\n".join(found)
# =====================================================
# GPU selection (prefer 4090)
# =====================================================
def select_best_gpu_prefer_4090() -> None:
    """Pin CUDA_VISIBLE_DEVICES to an RTX 4090 if one exists, otherwise to
    the GPU with the most memory reported by nvidia-smi.

    No-op (with a stderr note) when nvidia-smi is unavailable or reports
    nothing parseable, leaving the default device visibility untouched.
    """
    listing = run_cmd(
        ["nvidia-smi", "--query-gpu=index,name,memory.total", "--format=csv,noheader"]
    )
    if listing.startswith("<command failed"):
        print(listing, file=sys.stderr)
        return

    gpus = []
    for row in listing.strip().splitlines():
        fields = [field.strip() for field in row.split(",")]
        if len(fields) != 3:
            continue  # defensively skip malformed CSV rows
        index_s, gpu_name, mem_s = fields
        gpus.append((int(index_s), gpu_name, float(mem_s.replace("MiB", "")) / 1024.0))
    if not gpus:
        return

    def _pin(index: int) -> None:
        # PCI_BUS_ID makes CUDA ordinals match nvidia-smi's indices.
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(index)

    for index, gpu_name, mem_gb in gpus:
        if "4090" in gpu_name:
            _pin(index)
            print(f"β Selected RTX 4090 (GPU {index}, {mem_gb:.1f} GB)")
            return
    index, gpu_name, mem_gb = max(gpus, key=lambda gpu: gpu[2])
    _pin(index)
    print(f"β Using best GPU: {gpu_name} (GPU {index}, {mem_gb:.1f} GB)")
select_best_gpu_prefer_4090()
# =====================================================
# Imports (after CUDA_VISIBLE_DEVICES)
# =====================================================
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from accelerate import init_empty_weights, infer_auto_device_map
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.entrypoints.oneshot import oneshot
# =====================================================
# Paths
# =====================================================
# Input model checkpoint (HF layout) and output locations inside the container.
MODEL_DIR = "/workspace/models/input"
OUT_BASE = "/workspace/models/output"
# Calibration prompts, RS (0x1E)-delimited — see the loading code below.
PROMPTS_FILE = "/workspace/calib/calib_128.txt"
OUT_DIR = os.path.join(OUT_BASE, "final_awq_moe_safe")
CRASH_DIR = os.path.join(OUT_BASE, "crash_reports")
# Create output dirs up front so saving/crash-reporting never fails on mkdir.
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
Path(CRASH_DIR).mkdir(parents=True, exist_ok=True)
# =====================================================
# Config
# =====================================================
# Calibration
MAX_SEQ_LEN = 2048      # tokens per calibration sample
CALIB_SAMPLES = 512     # docstring above recommends >= 512
# Quant scheme
NUM_BITS = 4            # 4-bit weight-only AWQ
GROUP_SIZE = 128        # per-group quantization granularity
SYMMETRIC = True
# What to quantize:
# - "experts_only": safest for MoE
# - "experts_plus_attention": adds attention projections
# Overridable via the QUANT_MODE environment variable; validated later.
QUANT_MODE = os.environ.get("QUANT_MODE", "experts_only").strip().lower()
# Saving
SAVE_COMPRESSED_FINAL = True
SAVE_BACKUP_COPY = True
# Memory / device-map controls
# Budgets handed to infer_auto_device_map; tune per host.
GPU_MAX_MEM = "28GiB"
CPU_MAX_MEM = "160GiB"
# =====================================================
# Logging / diagnostics
# =====================================================
def die(msg: str) -> None:
    """Emit *msg* to stderr (with the script's failure marker) and exit 1."""
    sys.stderr.write(f"β {msg}\n")
    sys.exit(1)
def log_gpu(tag: str) -> None:
    """Print current and peak allocated CUDA memory in GiB, labeled *tag*.

    Prints a placeholder line instead when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        print(f"[{tag}] CUDA not available")
        return
    gib = 1024 ** 3
    allocated = torch.cuda.memory_allocated() / gib
    peak = torch.cuda.max_memory_allocated() / gib
    print(f"[{tag}] GPU mem: {allocated:.2f} GB | peak: {peak:.2f} GB")
def write_crash_report(title: str, err: BaseException) -> str:
    """Dump a diagnostic report for *err* into CRASH_DIR and return its path.

    The report bundles the exception, traceback, nvidia-smi output, torch's
    CUDA memory summary, host memory/ulimit info and cgroup limits, so an OOM
    can be analyzed after the container dies.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(CRASH_DIR, f"crash_{stamp}.log")
    sections = [
        "=" * 120, f"{title} @ {stamp} UTC", "=" * 120,
        "\n--- Exception ---\n", repr(err),
        "\n--- Traceback ---\n", traceback.format_exc(),
        "\n--- nvidia-smi ---\n", run_cmd(["nvidia-smi"]),
        "\n--- torch.cuda.memory_summary ---\n",
    ]
    # memory_summary itself can fail (e.g. driver wedged) — never let the
    # crash reporter crash.
    try:
        if torch.cuda.is_available():
            sections.append(torch.cuda.memory_summary(device=0, abbreviated=False))
        else:
            sections.append("CUDA not available.")
    except Exception as summary_err:
        sections.append(f"<torch.cuda.memory_summary failed>\n{summary_err}\n")
    sections += [
        "\n--- free -h ---\n", run_cmd(["bash", "-lc", "free -h || true"]),
        "\n--- ulimit -a ---\n", run_cmd(["bash", "-lc", "ulimit -a || true"]),
        "\n--- Docker / cgroup memory ---\n", docker_memory_info(),
    ]
    with open(out_path, "w", encoding="utf-8") as log_file:
        log_file.write("\n".join(sections) + "\n")
    return out_path
# =====================================================
# Sanity
# =====================================================
# Fail fast before any heavyweight loading: required paths, then hardware.
for required_path, message in (
    (MODEL_DIR, f"MODEL_DIR not found: {MODEL_DIR}"),
    (PROMPTS_FILE, f"Missing calibration file: {PROMPTS_FILE}"),
):
    if not os.path.exists(required_path):
        die(message)
if not torch.cuda.is_available():
    die("CUDA not available")
if torch.cuda.device_count() < 1:
    die(f"Expected at least one visible GPU, got {torch.cuda.device_count()}")
props = torch.cuda.get_device_properties(0)
# Echo the effective run configuration so logs are self-describing.
for banner in (
    f"β Using GPU: {props.name} | {props.total_memory/1024**3:.1f} GB",
    f"β QUANT_MODE = {QUANT_MODE}",
    f"β bits={NUM_BITS} group_size={GROUP_SIZE} symmetric={SYMMETRIC}",
    f"β calib_samples={CALIB_SAMPLES} max_seq_len={MAX_SEQ_LEN}",
):
    print(banner)
# =====================================================
# Calibration prompts (RS-delimited)
# =====================================================
SEP = "\x1e"  # ASCII Record Separator delimits individual prompts
# Read the whole prompt file and split on RS; drop empty/whitespace records.
raw = Path(PROMPTS_FILE).read_text(encoding="utf-8")
items = [record.strip() for record in raw.split(SEP) if record.strip()]
if not items:
    die("No calibration items found (RS-delimited file empty?)")
# Cap at the configured sample budget and wrap as a datasets.Dataset.
calib_data = items[:CALIB_SAMPLES]
calib_ds = Dataset.from_dict({"text": calib_data})
# =====================================================
# Load model/tokenizer
# =====================================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
print("Loading model with CPU offload-friendly device_map...")
config = AutoConfig.from_pretrained(MODEL_DIR, trust_remote_code=True)
gc.disable()  # avoid GC churn while allocating the large weight tensors
try:
    # Build a meta (weight-less) model only to compute a device map that
    # splits layers between the GPU budget and CPU offload.
    with init_empty_weights():
        tmp_model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
    # FIX: "Qwen3MoeBlock" is not a transformers module class name, so it
    # matched nothing and decoder layers could be split across devices.
    # Prefer the model's own _no_split_modules list; fall back to the Qwen3
    # MoE decoder-layer class name. TODO confirm against the installed
    # transformers version's Qwen3 MoE implementation.
    no_split = getattr(tmp_model, "_no_split_modules", None) or ["Qwen3MoeDecoderLayer"]
    device_map = infer_auto_device_map(
        tmp_model,
        max_memory={0: GPU_MAX_MEM, "cpu": CPU_MAX_MEM},
        no_split_module_classes=no_split,
    )
    del tmp_model
    gc.collect()
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_DIR,
        torch_dtype=torch.float16,
        device_map=device_map,
        trust_remote_code=True,
    ).eval()
finally:
    gc.enable()  # FIX: re-enable GC even if loading raises
# =====================================================
# AWQ targets / ignore patterns (MoE-safe for THIS Qwen3 MoE naming)
# =====================================================
# Expert projections MUST be quantized together (gate_proj/up_proj/down_proj).
# Scope them to mlp.experts.<id>.* to avoid catching router gate or other "gate*" stuff.
# The "re:" prefix tells llm-compressor to interpret the rest as a regex.
EXPERT_GATE = r"re:.*mlp\.experts\.\d+\.gate_proj$"
EXPERT_UP = r"re:.*mlp\.experts\.\d+\.up_proj$"
EXPERT_DOWN = r"re:.*mlp\.experts\.\d+\.down_proj$"
# Attention projections (optional) β scope to self_attn to avoid accidental matches.
ATTN_Q = r"re:.*self_attn\.q_proj$"
ATTN_K = r"re:.*self_attn\.k_proj$"
ATTN_V = r"re:.*self_attn\.v_proj$"
ATTN_O = r"re:.*self_attn\.o_proj$"
# Resolve the target list from QUANT_MODE; anything else is a hard error.
if QUANT_MODE == "experts_only":
    targets = [EXPERT_GATE, EXPERT_UP, EXPERT_DOWN]
elif QUANT_MODE == "experts_plus_attention":
    targets = [ATTN_Q, ATTN_K, ATTN_V, ATTN_O, EXPERT_GATE, EXPERT_UP, EXPERT_DOWN]
else:
    die("Invalid QUANT_MODE. Use 'experts_only' or 'experts_plus_attention'.")
# Ignore router gate + norms + embeddings/head + (optional) shared experts
ignore = [
    # Output/embeddings
    "lm_head",
    "model.embed_tokens",
    "re:.*embed_tokens.*",
    # Norms (never quantize)
    "re:.*norm.*",
    "re:.*RMSNorm.*",
    "re:.*layernorm.*",
    "re:.*input_layernorm$",
    "re:.*post_attention_layernorm$",
    # Router gate (MoE routing) β KEEP FP16
    # Your weights show: model.layers.X.mlp.gate.weight (shape [128, 2048])
    "re:.*mlp\\.gate\\.weight$",
    # Some models have q_norm/k_norm modules
    "re:.*self_attn\\.q_norm.*",
    "re:.*self_attn\\.k_norm.*",
    "re:.*q_norm.*",
    "re:.*k_norm.*",
    # Shared experts (if present; safe to keep fp16)
    "re:.*shared_expert.*",
    "re:.*shared_experts.*",
    # Other router-ish naming (defensive; should not match expert gate_proj now anyway)
    "re:.*router.*",
    "re:.*routing.*",
    "re:.*moe.*gate.*",
]
# Single AWQ pass: 4-bit, grouped, weight-only (activations left untouched).
recipe = [
    AWQModifier(
        ignore=ignore,
        mappings=[],  # CRITICAL: prevent auto-inference / smoothing for MoE
        config_groups={
            "group_0": {
                "targets": targets,
                "weights": {
                    "num_bits": NUM_BITS,
                    "type": "int",
                    "symmetric": SYMMETRIC,
                    "group_size": GROUP_SIZE,
                    "strategy": "group",  # one scale per GROUP_SIZE columns
                    "dynamic": False,
                },
                "input_activations": None,   # weight-only: no activation quant
                "output_activations": None,
                "format": None,
            }
        },
    )
]
# =====================================================
# Run AWQ
# =====================================================
# Drop loader fragments and reset the peak counter so log_gpu() reports
# AWQ's own high-water mark rather than the load phase's.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
log_gpu("before_awq")
try:
    start = time.time()
    # One-shot post-training quantization: feeds the calibration dataset
    # through the model and applies the AWQ recipe.
    model = oneshot(
        model=model,
        tokenizer=tokenizer,
        dataset=calib_ds,
        recipe=recipe,
        num_calibration_samples=len(calib_data),
        text_column="text",
        max_seq_length=MAX_SEQ_LEN,
    )
    print(f"β AWQ finished in {time.time() - start:.1f}s")
    log_gpu("after_awq")
except torch.cuda.OutOfMemoryError as e:
    # Dump diagnostics before dying; exit 137 presumably mirrors the
    # OOM-kill exit code for outer tooling — confirm downstream expectation.
    report_path = write_crash_report("CUDA OOM during AWQ", e)
    print(f"π§Ύ Crash report written to: {report_path}", file=sys.stderr)
    sys.exit(137)
except Exception as e:
    # Any other failure also gets a full crash report, then a generic exit.
    report_path = write_crash_report("Unexpected error during AWQ", e)
    print(f"π§Ύ Crash report written to: {report_path}", file=sys.stderr)
    sys.exit(1)
# =====================================================
# Save final
# =====================================================
def _dump_model(folder: str) -> None:
    """Write the (compressed) model weights plus tokenizer files into *folder*."""
    model.save_pretrained(folder, safe_serialization=True, save_compressed=SAVE_COMPRESSED_FINAL)
    tokenizer.save_pretrained(folder)

print(f"Saving final AWQ model β {OUT_DIR}")
_dump_model(OUT_DIR)
print("β Final model saved.")
if SAVE_BACKUP_COPY:
    backup_dir = OUT_DIR + "_backup"
    Path(backup_dir).mkdir(parents=True, exist_ok=True)
    print(f"Saving backup copy β {backup_dir}")
    _dump_model(backup_dir)
    print("β Backup saved.")
print("\nβ DONE")
But this gives me rather rambling results in vLLM (I used the Tesslate/Rust_Dataset for calibration: 512 items, 2048 seq len).
First it starts well (explaining a Jinja template):
and then goes off the rails FAST:
Any tips to get it to AWQ?
