attnvq / benchmark.py

update stale files

4e235ae about 1 month ago

34.8 kB

	"""
	benchmark.py — fit AttnVQ codebooks and evaluate on real Laguna-XS.2 caches.

	Stages:
	    dump      capture post-RoPE K/V from full-attention layers
	    fit       per-layer LBG codebooks → artifacts/codebooks.pt
	    cheap     proxy metrics (key cosine, attn-output error, ip-bias)
	    swebench  optional resolve-rate delta on SWE-bench Verified (needs Docker)

	Usage:
	python benchmark.py --stage fit
	python benchmark.py --stage cheap --n_eval 64
	python benchmark.py --stage dump
	"""

	from __future__ import annotations

	import argparse
	import json
	import os
	import time
	from concurrent.futures import ThreadPoolExecutor
	from dataclasses import dataclass, asdict

	import torch

	from vqkv.quantizers import (ScalarKV, KIVIScalarKV, ProductVQKV, RoPESplitVQKV,
	SignScalarKV, TernaryScalarKV)
	from vqkv.metrics import (key_cosine, cache_mse, inner_product_distortion,
	attention_output, attn_output_cosine, attn_output_error,
	calibration_sample_weights)

	# Checkpoint id and artifact directory; both can be overridden from the
	# environment.
	MODEL_ID = os.getenv("LAGUNA_ID", "poolside/Laguna-XS.2")
	ARTIFACT_DIR = os.getenv("VQKV_ARTIFACTS", "./artifacts")
	# Import-time side effect: the artifact directory is created on import.
	os.makedirs(ARTIFACT_DIR, exist_ok=True)

	# Calibration corpus selection (dataset id, split, and which loader to use).
	CALIB_DATASET = os.getenv("CALIB_DATASET", "SWE-bench/SWE-smith-trajectories")
	CALIB_SPLIT = os.getenv("CALIB_SPLIT", "tool")
	CALIB_SOURCE = os.getenv("CALIB_SOURCE", "swesmith")  # swesmith | longbench-hotpotqa

	# Plain-text prompt for LongBench hotpotqa rows; the instruction sentence is
	# repeated before and after the passages. Slots: {context}, {input}.
	_HOTPOTQA_PROMPT = (
	    "Answer the question based on the given passages. Only give me the answer "
	    "and do not output any other words.\n\n"
	    "The following are given passages.\n"
	    "{context}\n\n"
	    "Answer the question based on the given passages. Only give me the answer "
	    "and do not output any other words.\n\n"
	    "Question: {input}\n"
	    "Answer:"
	)

	# ============================================================================
	# Cache configurations under test. Each is a (name, factory) where factory()
	# returns an unfitted quantizer. `None` means the fp16 baseline (no quant).
	# ============================================================================
	def cache_configs():
	    """Return the ordered ``(name, factory)`` list of cache configurations.

	    ``factory`` is a zero-argument callable that builds a fresh, unfitted
	    quantizer. It is ``None`` for the fp16 baseline, which applies no
	    quantization.
	    """
	    def product_vq(n_sub):
	        # Every ProductVQ variant shares codebook size and iteration count;
	        # only the number of sub-vectors differs.
	        return lambda: ProductVQKV(n_sub=n_sub, n_codes=256, iters=15)

	    product_variants = (
	        ("productvq-64x256-4b", 64),
	        ("productvq-32x256-2b", 32),
	        ("productvq-16x256-1b", 16),
	        ("productvq-8x256-0.5b", 8),
	    )

	    configs = [
	        ("fp16 (baseline)", None),
	        ("scalar-int4", lambda: ScalarKV(nbits=4)),
	        ("scalar-int2", lambda: ScalarKV(nbits=2)),
	        ("kivi-int2", lambda: KIVIScalarKV(nbits=2)),
	    ]
	    configs.extend((label, product_vq(n_sub)) for label, n_sub in product_variants)
	    configs.extend([
	        ("ropesplit-1b", lambda: RoPESplitVQKV(n_sub_half=8, n_codes=256, iters=15)),
	        ("sign-1bit", lambda: SignScalarKV(per_channel_key=True)),
	        ("ternary-bitnet", lambda: TernaryScalarKV(alpha=0.7, per_channel_key=True)),
	    ])
	    return configs


	# ============================================================================
	# Model + layer-structure loading
	# ============================================================================
	def load_model_and_meta():
	    """Load the model and tokenizer and summarise the attention layout.

	    Returns:
	        ``(model, tok, meta)``. ``meta`` holds the indices of the layers whose
	        config type is ``"full_attention"`` plus the KV/query head counts,
	        head dimension and layer count read from the model config.

	    Raises:
	        AssertionError: the config lists no full-attention layer.
	    """
	    from transformers import AutoModelForCausalLM, AutoTokenizer

	    tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
	    model = AutoModelForCausalLM.from_pretrained(
	        MODEL_ID,
	        torch_dtype=torch.bfloat16,
	        device_map="cuda",
	        trust_remote_code=True,
	    )
	    model.eval()

	    config = model.config
	    full_layers = [
	        idx for idx, kind in enumerate(config.layer_types)
	        if kind == "full_attention"
	    ]
	    meta = dict(
	        full_layers=full_layers,
	        n_kv_heads=config.num_key_value_heads,
	        n_q_heads=config.num_attention_heads,
	        head_dim=config.head_dim,
	        n_layers=config.num_hidden_layers,
	    )

	    print(f"[meta] full-attention layers ({len(full_layers)}): {full_layers}")
	    print(f"[meta] kv_heads={meta['n_kv_heads']} head_dim={meta['head_dim']}")
	    # Every later stage indexes by these layers, so an empty list is fatal.
	    assert len(full_layers) > 0, "no full-attention layers found; check config"
	    return model, tok, meta


	# Trace flattening (SWE-smith messages JSON, nebius trajectory/tool roles).
	def flatten_trace(example, tok) -> str:
	    """Flatten one trajectory to a string via the model's chat template.

	    Turns are read from ``example["messages"]``, ``["trajectory"]`` or
	    ``["conversations"]`` (first truthy value wins). Each turn is reduced to
	    ``{"role", "content"}``, ``human``/``gpt``/``tool`` roles are mapped to
	    ``user``/``assistant``/``user``, empty turns are dropped, and consecutive
	    turns with the same role are merged.

	    Args:
	        example: mapping for one dataset row.
	        tok: object exposing ``apply_chat_template``.

	    Returns:
	        The templated conversation. Falls back to ``"role: content"``
	        paragraphs if the template call raises, and to (truncated) raw text
	        when the row carries no usable turn list.
	    """
	    raw = (example.get("messages") or example.get("trajectory")
	           or example.get("conversations"))
	    if raw is None:
	        return json.dumps(example)[:200_000]

	    # SWE-smith stores 'messages' as a JSON string, not a native list.
	    if isinstance(raw, str):
	        try:
	            decoded = json.loads(raw)
	        except json.JSONDecodeError:
	            return raw[:200_000]
	        if not isinstance(decoded, list):
	            # Valid JSON but not a list of turns: treat it as plain text
	            # instead of iterating over a dict's keys or a scalar.
	            return raw[:200_000]
	        raw = decoded

	    def block_text(item) -> str:
	        # Content blocks look like {"type": "text", "text": "..."}; anything
	        # without a string "text" is stringified so the join cannot fail.
	        if isinstance(item, dict):
	            text = item.get("text")
	            return text if isinstance(text, str) else str(item)
	        return str(item)

	    role_map = {"human": "user", "gpt": "assistant", "tool": "user"}

	    norm = []
	    for m in raw:
	        if not isinstance(m, dict):
	            continue  # not a turn record; nothing to read role/content from
	        role = m.get("role") or m.get("from") or "user"
	        content = m.get("content") or m.get("value") or ""

	        # SWE-smith: content can be a list of {"type":"text","text":"..."} blocks
	        if isinstance(content, list):
	            content = "\n".join(block_text(item) for item in content)
	        elif not isinstance(content, str):
	            # Structured payloads (dicts, numbers) are serialised rather than
	            # crashing on the str-only operations below.
	            content = json.dumps(content, ensure_ascii=False, default=str)

	        # nebius: assistant turns carry tool_calls alongside content
	        tool_calls = m.get("tool_calls")
	        if tool_calls:
	            tc_text = json.dumps(tool_calls, ensure_ascii=False)
	            content = (content + "\n" + tc_text).strip() if content else tc_text

	        # 'tool' role (observation) has no equivalent in most chat templates;
	        # map it to 'user' so the template accepts it.
	        role = role_map.get(role, role)

	        if not content.strip():
	            continue
	        norm.append({"role": role, "content": content})

	    # Merge consecutive same-role messages produced by tool->user collapsing.
	    merged: list[dict] = []
	    for m in norm:
	        if merged and merged[-1]["role"] == m["role"]:
	            merged[-1]["content"] += "\n\n" + m["content"]
	        else:
	            merged.append(dict(m))

	    try:
	        return tok.apply_chat_template(merged, tokenize=False,
	                                       add_generation_prompt=False)
	    except Exception:
	        # Deliberate best effort: any template failure degrades to a plain
	        # transcript so one odd row does not abort the whole stage.
	        return "\n\n".join(f"{m['role']}: {m['content']}" for m in merged)


	def flatten_longbench(example) -> str:
	    """Render one LongBench hotpotqa row as the plain-text QA prompt.

	    Fills the ``{context}`` and ``{input}`` slots of ``_HOTPOTQA_PROMPT`` from
	    the row's fields of the same name.
	    """
	    passages = example["context"]
	    question = example["input"]
	    return _HOTPOTQA_PROMPT.format(context=passages, input=question)


	def _load_longbench_hotpotqa():
	    """Load the THUDM/LongBench hotpotqa rows.

	    Tries the raw JSONL files on the hub first (``hotpotqa_e.jsonl``, then
	    ``hotpotqa.jsonl``) and only falls back to the named dataset config when
	    both attempts raise.
	    """
	    from datasets import load_dataset as _ld

	    data_root = "hf://datasets/THUDM/LongBench/data/"
	    for fname in ("hotpotqa_e.jsonl", "hotpotqa.jsonl"):
	        try:
	            rows = _ld("json", data_files=f"{data_root}{fname}", split="train")
	        except Exception:
	            # Best effort: any failure just moves on to the next candidate.
	            continue
	        else:
	            return rows
	    return _ld("THUDM/LongBench", name="hotpotqa", split="test")


	def _load_calib_dump_dataset(n_calib: int, calib_source: str, tok):
	    """Pick the calibration corpus for stage_dump.

	    Returns:
	        ``(dataset, text_fn, label)``: the rows to iterate, a callable that
	        turns one row into text, and a human-readable description for logs.
	    """
	    from datasets import load_dataset

	    if calib_source != "longbench-hotpotqa":
	        # Every other source value reads CALIB_DATASET; the full split is
	        # returned and n_calib is not used to slice it here.
	        dataset = load_dataset(CALIB_DATASET, split=CALIB_SPLIT)
	        description = f"{CALIB_DATASET} split={CALIB_SPLIT}"
	        return dataset, lambda ex: flatten_trace(ex, tok), description

	    dataset = _load_longbench_hotpotqa()
	    total = len(dataset)
	    # Last n_calib rows avoid overlap with longbench_eval.py (range(0, n_eval)).
	    first = max(0, total - n_calib)
	    tail = dataset.select(range(first, total))
	    description = f"LongBench hotpotqa (rows {first}–{total - 1})"
	    return tail, flatten_longbench, description


	# STAGE: dump -- run real traces, capture post-RoPE K/V from full-attn layers
	def stage_dump(n_calib=16, max_len=32768, calib_source: str | None = None,
	               min_len=2048):
	    """Run calibration traces and save K/V from the full-attention layers.

	    Each trace is tokenised (truncated to ``max_len``), pushed through
	    ``model.model`` with a cache that records the key/value states handed to
	    every full-attention layer, and the recordings are concatenated per layer
	    and written to ``<ARTIFACT_DIR>/calib_caches.pt`` together with ``meta``.

	    Args:
	        n_calib: number of traces to capture.
	        max_len: tokenizer truncation length.
	        calib_source: corpus selector; falls back to ``CALIB_SOURCE``.
	        min_len: traces with fewer tokens are skipped.

	    Raises:
	        RuntimeError: no trace in the corpus reached ``min_len`` tokens.
	    """
	    from transformers.cache_utils import DynamicCache

	    calib_source = calib_source or CALIB_SOURCE
	    model, tok, meta = load_model_and_meta()
	    full = set(meta["full_layers"])

	    # Inputs go to cuda:0 (or CPU without CUDA) rather than to model.device.
	    input_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

	    class DumpingCache(DynamicCache):
	        """DynamicCache that also keeps float32 CPU copies of K/V for `full` layers."""

	        def __init__(self, *args, **kwargs):
	            # Forward everything unchanged: the cache is built with a
	            # ``config=`` keyword below, which a positional-only signature
	            # would reject with a TypeError.
	            super().__init__(*args, **kwargs)
	            self.dump = {i: {"k": [], "v": []} for i in full}

	        def update(self, key_states, value_states, layer_idx, cache_kwargs=None):
	            if layer_idx in full:
	                # Batch element 0 only, with its first two axes swapped.
	                # assumes states are (batch, kv_heads, seq, head_dim) -- TODO confirm
	                self.dump[layer_idx]["k"].append(
	                    key_states.detach()[0].permute(1, 0, 2).float().cpu())
	                self.dump[layer_idx]["v"].append(
	                    value_states.detach()[0].permute(1, 0, 2).float().cpu())
	            return super().update(key_states, value_states, layer_idx, cache_kwargs)

	    ds, get_text, source_label = _load_calib_dump_dataset(n_calib, calib_source, tok)
	    print(f"[dump] source={calib_source} {source_label} rows={len(ds)} "
	          f"schema={list(ds[0].keys())}")

	    agg = {i: {"k": [], "v": []} for i in full}
	    used = 0
	    for ex in ds:
	        if used >= n_calib:
	            break
	        text = get_text(ex)
	        ids = tok(text, return_tensors="pt", truncation=True,
	                  max_length=max_len).to(input_device)
	        if ids["input_ids"].shape[1] < min_len:
	            continue
	        cache = DumpingCache(config=model.config)  # fresh per trace
	        with torch.no_grad():
	            model.model(**ids, past_key_values=cache, use_cache=True)  # skip lm_head
	        for i in full:
	            agg[i]["k"].append(torch.cat(cache.dump[i]["k"]))
	            agg[i]["v"].append(torch.cat(cache.dump[i]["v"]))
	        used += 1
	        print(f"[dump] trace {used}/{n_calib} len={ids['input_ids'].shape[1]}")

	    if used == 0:
	        # Without this, torch.cat below fails on an empty list with a
	        # message that says nothing about the real cause.
	        raise RuntimeError(
	            f"[dump] no trace reached min_len={min_len} tokens; nothing to save")
	    if used < n_calib:
	        print(f"[dump] WARNING: only {used}/{n_calib} traces met min_len={min_len}")

	    calib = {i: {"k": torch.cat(agg[i]["k"]), "v": torch.cat(agg[i]["v"])}
	             for i in full}
	    # Sorted so the three previewed layers are the same on every run.
	    for i in sorted(full)[:3]:
	        k = calib[i]["k"].reshape(-1, meta["head_dim"])
	        cut = meta["head_dim"] // 2
	        print(f"[dump] layer {i}: {tuple(calib[i]['k'].shape)} | "
	              f"rope-half std {k[:, :cut].std():.3f} "
	              f"pass-half std {k[:, cut:].std():.3f} "
	              f"max|ch| {k.abs().amax(0).max():.2f}")
	    path = os.path.join(ARTIFACT_DIR, "calib_caches.pt")
	    torch.save({"calib": calib, "meta": meta}, path)
	    print(f"[dump] saved -> {path}")


	# STAGE: fit -- per-layer codebooks for every (data-dependent) config
	def stage_fit(only: list[str] | None = None):
	    """Fit quantizers and write artifacts/codebooks.pt.

	    By default fits every config in cache_configs(). Pass ``only=["sign-1bit", ...]``
	    to fit just those names; if codebooks.pt already exists the new fits are
	    merged into it and the remaining entries are kept as they are.

	    Args:
	        only: config names to fit, or ``None`` for all quantized configs.

	    Raises:
	        ValueError: a name in ``only`` is not a quantized config.
	        FileNotFoundError: calib_caches.pt is missing but required -- either
	            there is no codebooks.pt to merge into, or a ProductVQ / RoPE-split
	            config was requested.
	        KeyError: a ProductVQ / RoPE-split layer has no calibration data.
	    """
	    codebooks_path = os.path.join(ARTIFACT_DIR, "codebooks.pt")
	    calib_path = os.path.join(ARTIFACT_DIR, "calib_caches.pt")

	    # Validate the selection before reading any large file.
	    configs = [(n, f) for n, f in cache_configs() if f is not None]
	    if only is not None:
	        only_set = set(only)
	        configs = [(n, f) for n, f in configs if n in only_set]
	        unknown = only_set - {n for n, _ in configs}
	        if unknown:
	            raise ValueError(f"unknown --only config(s): {sorted(unknown)}")

	    # Read the calibration blob at most once; it supplies `calib` and, for a
	    # fresh (non-merge) fit, `meta` as well.
	    calib = calib_meta = None
	    if os.path.exists(calib_path):
	        blob = torch.load(calib_path)
	        calib, calib_meta = blob["calib"], blob["meta"]

	    if only and os.path.exists(codebooks_path):
	        existing = torch.load(codebooks_path, weights_only=False)
	        fitted = existing["fitted"]
	        meta = existing["meta"]
	        print(f"[fit] merging into existing {codebooks_path} ({len(fitted)} configs)")
	    elif calib is None:
	        raise FileNotFoundError(
	            f"{calib_path} not found; run --stage dump first, or use "
	            f"--only with an existing codebooks.pt for tuning-free configs")
	    else:
	        meta = calib_meta
	        fitted = {}

	    hd = meta["head_dim"]
	    layer_ids = meta["full_layers"]

	    # GPU fitting: LBG is pure torch; moving calib to CUDA makes bmm/argmin ~20x
	    # faster. Serial over layers on GPU (CUDA is already async; threading adds no
	    # benefit). Parallel over layers on CPU (BLAS releases GIL; real concurrency).
	    fit_device = "cuda" if torch.cuda.is_available() else "cpu"
	    if fit_device == "cuda":
	        print(f"[fit] GPU available ({torch.cuda.get_device_name()}) -- fitting LBG on CUDA")
	    # max(1, ...) keeps the executor constructible even with no layers.
	    n_workers = 1 if fit_device == "cuda" else max(
	        1, min(len(layer_ids), os.cpu_count() or 1))

	    attn_weighted = any(
	        isinstance(f(), (ProductVQKV, RoPESplitVQKV)) for _, f in configs
	    )
	    if attn_weighted:
	        if calib is None:
	            # These codebooks are learned from data; fitting them on the
	            # zero placeholder used for tuning-free configs would silently
	            # produce useless codebooks.
	            raise FileNotFoundError(
	                f"{calib_path} not found; ProductVQ / RoPESplit configs are "
	                f"fitted from calibration caches -- run --stage dump first")
	        print("[fit] ProductVQ / RoPESplit: attention-weighted LBG "
	              "(centroids weighted by key attention mass)")

	    n_q = meta.get("n_q_heads", 48)
	    for name, factory in configs:
	        t0 = time.time()

	        def _fit_layer(i, _factory=factory, _device=fit_device):
	            """Fit one layer's quantizer and return ``(layer_id, quantizer)``."""
	            q = _factory()
	            data_dependent = isinstance(q, (ProductVQKV, RoPESplitVQKV))
	            w = k_struct = None
	            if calib is not None and i in calib:
	                k_struct = calib[i]["k"]
	                v_struct = calib[i]["v"]
	                # NOTE(review): only the last 512 entries along dim 0 are
	                # kept for fitting -- presumably to bound the cost of
	                # calibration_sample_weights; confirm this is intended.
	                if k_struct.shape[0] > 512:
	                    k_struct = k_struct[-512:]
	                    v_struct = v_struct[-512:]
	                kf = k_struct.reshape(-1, hd)[:200_000].to(_device)
	                vf = v_struct.reshape(-1, hd)[:200_000].to(_device)
	                if data_dependent:
	                    # Only the weighted fit consumes the sample weights.
	                    w = calibration_sample_weights(k_struct, n_q)
	                    if w is not None:
	                        w = w[:kf.shape[0]].to(_device)
	            elif data_dependent:
	                raise KeyError(
	                    f"layer {i} has no calibration data in {calib_path}")
	            else:
	                # Placeholder input for quantizers whose fit() is called
	                # without calibration data.
	                kf = torch.zeros(1, hd, device=_device)
	                vf = torch.zeros(1, hd, device=_device)

	            if data_dependent:
	                q.fit(kf, vf, sample_weights=w, n_q_heads=n_q,
	                      k_struct=k_struct.to(_device))
	            else:
	                q.fit(kf, vf)
	            # Codebooks are saved to disk as CPU tensors; move back before returning.
	            if _device != "cpu" and hasattr(q, "to"):
	                q.to("cpu")
	            return i, q

	        with ThreadPoolExecutor(max_workers=n_workers) as ex:
	            per_layer = dict(ex.map(_fit_layer, layer_ids))

	        fitted[name] = per_layer
	        print(f"[fit] {name}: {len(per_layer)} layer-codebooks in {time.time()-t0:.1f}s")

	    torch.save({"fitted": fitted, "meta": meta}, codebooks_path)
	    print(f"[fit] saved -> {codebooks_path} ({len(fitted)} configs total)")


	# A drop-in cache that applies a per-layer quantizer to the target layers only.
	def make_vq_cache_class(per_layer_quantizers, target_layers, model_config, device=None):
	    """Build a VQCache class with codebooks pre-moved to `device`.

	    Pass device="cuda" (or the model's device) so the roundtrip runs entirely
	    on GPU with no CPU<->GPU transfers. Without this, every update() call
	    implicitly transfers key_states to CPU and back, making decode very slow.

	    Args:
	        per_layer_quantizers: mapping layer index -> fitted quantizer. The
	            quantizers are moved in place when ``device`` is given.
	        target_layers: layer indices whose K/V are quantized.
	        model_config: accepted for interface compatibility; not used here.
	        device: optional device for the quantizers.

	    Returns:
	        A ``DynamicCache`` subclass that replaces the incoming K/V of target
	        layers by their quantize->dequantize roundtrip before caching them.
	    """
	    from transformers.cache_utils import DynamicCache

	    if device is not None:
	        for q in per_layer_quantizers.values():
	            if hasattr(q, "to"):
	                q.to(device)

	    class VQCache(DynamicCache):
	        def __init__(self, *args, **kwargs):
	            # Forward everything unchanged so the class can be built with no
	            # arguments (as the SWE-bench runner does) or with whatever
	            # DynamicCache itself accepts.
	            super().__init__(*args, **kwargs)
	            self.q = per_layer_quantizers
	            self.target = set(target_layers)

	        def update(self, key_states, value_states, layer_idx, cache_kwargs=None):
	            if layer_idx in self.target and layer_idx in self.q:
	                q = self.q[layer_idx]
	                b, h, s, d = key_states.shape
	                if b != 1:
	                    # Only batch element 0 is quantized below; refuse larger
	                    # batches instead of silently dropping the rest.
	                    raise ValueError(
	                        f"VQCache supports batch size 1, got {b}")
	                kk = key_states[0].permute(1, 0, 2).float()    # (s, h, d)
	                vv = value_states[0].permute(1, 0, 2).float()  # (s, h, d)
	                # See stage_cheap: per-channel-key quantizers reduce the key
	                # along the token axis and need a per-head block, not a flatten.
	                per_channel_key = (
	                    isinstance(q, KIVIScalarKV)
	                    or getattr(q, "per_channel_key", False)
	                )
	                if per_channel_key:
	                    k_hat = torch.stack(
	                        [q.roundtrip_k(kk[:, hh, :]) for hh in range(h)], 1)
	                    v_hat = torch.stack(
	                        [q.roundtrip_v(vv[:, hh, :]) for hh in range(h)], 1)
	                else:
	                    # The flattened (s*h, d) view is only built on this path.
	                    k_hat = q.roundtrip_k(kk.reshape(-1, d)).reshape(s, h, d)
	                    v_hat = q.roundtrip_v(vv.reshape(-1, d)).reshape(s, h, d)
	                # Back to (1, h, s, d) in the caller's dtype and device.
	                k_hat = k_hat.permute(1, 0, 2)[None]
	                v_hat = v_hat.permute(1, 0, 2)[None]
	                key_states = k_hat.to(key_states.dtype).to(key_states.device)
	                value_states = v_hat.to(value_states.dtype).to(value_states.device)
	            return super().update(key_states, value_states, layer_idx, cache_kwargs)

	    return VQCache


	# STAGE: cheap -- tier-1/2 metrics on held-out trace windows (no test suites)
	def stage_cheap(n_eval=64, max_len=16384):
	    """Compute proxy metrics for every fitted config on held-out traces.

	    Loads ``codebooks.pt``, runs the model on rows ``500 .. 500 + n_eval`` of
	    the calibration dataset, roundtrips the captured K/V of each
	    full-attention layer through every fitted quantizer, and averages the
	    metrics over layers and then over traces. The per-config summary is
	    printed and written to ``<ARTIFACT_DIR>/cheap_metrics.json``.

	    Configs declared in cache_configs() but absent from codebooks.pt (for
	    example after a partial ``--only`` fit) are reported and skipped.

	    Args:
	        n_eval: number of dataset rows to evaluate.
	        max_len: tokenizer truncation length.
	    """
	    import collections
	    from datasets import load_dataset
	    from tqdm import tqdm
	    from transformers.cache_utils import DynamicCache

	    blob = torch.load(os.path.join(ARTIFACT_DIR, "codebooks.pt"), weights_only=False)
	    fitted, meta = blob["fitted"], blob["meta"]
	    hd, full = meta["head_dim"], meta["full_layers"]
	    full_set = set(full)  # built once; membership is tested on every cache update
	    n_q = meta.get("n_q_heads", 48)  # fallback for codebooks.pt written before this field

	    # Window for O(T²) attention metrics. 512 tokens keeps peak mem <200MB on GPU.
	    ATTN_WIN = 512

	    # Single source of truth for the metric columns.
	    COLS = ("key_cos", "val_cos", "key_mse", "val_mse",
	            "attn_cos", "attn_output_error", "ip_rel", "ip_bias")

	    # Evaluate only configs that are both declared and present in codebooks.pt.
	    eval_names = []
	    for name, factory in cache_configs():
	        if factory is None:
	            continue  # fp16 baseline: nothing to roundtrip
	        if name not in fitted:
	            print(f"[cheap] {name}: not in codebooks.pt, skipping (run --stage fit)")
	            continue
	        eval_names.append(name)

	    model, tok, _ = load_model_and_meta()

	    # Move every fitted codebook to the model device ONCE. The roundtrip then
	    # runs entirely on-GPU against the on-GPU eval caches (see EvalDump below).
	    # ScalarKV/KIVI are tuning-free and have no .to(); they follow their input.
	    for per_layer in fitted.values():
	        for q in per_layer.values():
	            if hasattr(q, "to"):
	                q.to(model.device)

	    class EvalDump(DynamicCache):
	        """DynamicCache that keeps the latest float32 K/V per full-attention layer."""

	        def __init__(self):
	            super().__init__()
	            self.d = {i: {} for i in full}

	        def update(self, ks, vs, li, ck=None):
	            if li in full_set:
	                # Keep on-GPU: the quantizer roundtrip is a nearest-neighbour
	                # search -- a GPU job. Moving to CPU here dominated runtime.
	                self.d[li]["k"] = ks.detach()[0].permute(1, 0, 2).float()
	                self.d[li]["v"] = vs.detach()[0].permute(1, 0, 2).float()
	            return super().update(ks, vs, li, ck)

	    ds = load_dataset(CALIB_DATASET, split=CALIB_SPLIT)

	    # Per-trace rows (not saved to disk; aggregated below)
	    trace_rows = []

	    # Held-out slice. Start at 500 to safely clear any rows the dump stage
	    # consumed (dump skips short traces, so actual rows used >> n_calib=16).
	    for ex in tqdm(ds.select(range(500, 500 + n_eval))):
	        text = flatten_trace(ex, tok)
	        ids = tok(text, return_tensors="pt", truncation=True,
	                  max_length=max_len).to(model.device)
	        cache = EvalDump()
	        with torch.no_grad():
	            # model.model() skips lm_head: avoids a ~3 GB allocation per trace
	            # (max_len * vocab_size * 2 bytes) that is not needed for KV metrics.
	            model.model(**ids, past_key_values=cache, use_cache=True)

	        # Synthetic Q for attention/IP metrics: generated once per (trace, layer)
	        # and reused across all configs so the comparison is apples-to-apples.
	        # Unit-normalised so inner-product scale doesn't swamp the bias signal.
	        synth_q = {}
	        for i in full:
	            s = cache.d[i]["k"].shape[0]
	            win = min(s, ATTN_WIN)
	            q_rand = torch.randn(win, n_q, hd, device=cache.d[i]["k"].device)
	            synth_q[i] = q_rand / q_rand.norm(dim=-1, keepdim=True).clamp_min(1e-8)

	        for name in eval_names:
	            per_layer = fitted[name]
	            acc = collections.defaultdict(float)
	            nL = 0
	            for i in full:
	                k = cache.d[i]["k"]  # (s, h, d)
	                v = cache.d[i]["v"]
	                q = per_layer[i]
	                s, h, d = k.shape

	                # Per-channel-key quantizers (KIVI, Sign, Ternary with
	                # per_channel_key=True) reduce the KEY along the TOKEN axis
	                # (dim=0). They must see one (s, d) block PER HEAD; flattening
	                # (s,h,d)->(s*h,d) would mix tokens across heads into one scale
	                # and corrupt the key metric. VQ/scalar-per-token quantizers
	                # reduce along dim=-1 and are safe to flatten.
	                per_channel_key = (
	                    isinstance(q, KIVIScalarKV)
	                    or getattr(q, "per_channel_key", False)
	                )
	                if per_channel_key:
	                    k_hat = torch.stack(
	                        [q.roundtrip_k(k[:, hh, :]) for hh in range(h)], 1)
	                    v_hat = torch.stack(
	                        [q.roundtrip_v(v[:, hh, :]) for hh in range(h)], 1)
	                else:
	                    k_hat = q.roundtrip_k(k.reshape(-1, d)).reshape(s, h, d)
	                    v_hat = q.roundtrip_v(v.reshape(-1, d)).reshape(s, h, d)

	                acc["key_cos"] += key_cosine(k, k_hat)
	                acc["val_cos"] += key_cosine(v, v_hat)
	                acc["key_mse"] += cache_mse(k, k_hat)
	                acc["val_mse"] += cache_mse(v, v_hat)

	                # Windowed attention/IP metrics on last ATTN_WIN tokens
	                win = min(s, ATTN_WIN)
	                kw, kw_hat = k[-win:], k_hat[-win:]
	                vw, vw_hat = v[-win:], v_hat[-win:]
	                q_syn = synth_q[i]  # (win, n_q, d)

	                out_ref, _ = attention_output(q_syn, kw, vw, n_q)
	                out_hat, _ = attention_output(q_syn, kw_hat, vw_hat, n_q)
	                acc["attn_cos"] += attn_output_cosine(out_ref, out_hat)
	                acc["attn_output_error"] += attn_output_error(out_ref, out_hat)

	                ip = inner_product_distortion(q_syn, kw, kw_hat)
	                acc["ip_rel"] += ip["ip_rel_err"]
	                acc["ip_bias"] += ip["ip_bias"]

	                nL += 1

	            row = {"trace_len": ids["input_ids"].shape[1], "config": name}
	            row.update({col: acc[col] / nL for col in COLS})
	            trace_rows.append(row)

	    # Aggregate across traces: one summary row per config (what gets saved)
	    agg = collections.defaultdict(lambda: collections.defaultdict(list))
	    for r in trace_rows:
	        for col in COLS:
	            agg[r["config"]][col].append(r[col])

	    summary = []
	    for name, _ in cache_configs():
	        if name not in agg:
	            continue
	        cols = agg[name]
	        n = len(cols["key_cos"])
	        q0 = next(iter(fitted[name].values()))
	        row = {"config": name,
	               "bits_per_elt": round(q0.bits_per_element(hd), 4),
	               "n_traces": n}
	        for col in COLS:
	            row[col] = round(sum(cols[col]) / n, 5)
	        summary.append(row)

	    print(f"\n[cheap] mean metrics over {n_eval} held-out traces:")
	    print(f" {'config':24s} {'bpe':>5} {'key_cos':>8} {'val_cos':>8} "
	          f"{'key_mse':>9} {'val_mse':>9} {'attn_cos':>9} {'attn_err':>9} "
	          f"{'ip_rel':>8} {'ip_bias':>9}")
	    for row in summary:
	        print(f" {row['config']:24s} {row['bits_per_elt']:5.2f} "
	              f"{row['key_cos']:8.4f} {row['val_cos']:8.4f} "
	              f"{row['key_mse']:9.5f} {row['val_mse']:9.5f} "
	              f"{row['attn_cos']:9.4f} {row['attn_output_error']:9.4f} "
	              f"{row['ip_rel']:8.5f} {row['ip_bias']:9.6f}")

	    out_path = os.path.join(ARTIFACT_DIR, "cheap_metrics.json")
	    # Context manager so the file is flushed and closed deterministically.
	    with open(out_path, "w") as fh:
	        json.dump(summary, fh, indent=2)
	    print(f"[cheap] saved -> {out_path}")


	# Optional SWE-bench Verified eval (requires Docker + swebench).
	# System prompt for the minimal bash agent. It tells the model to wrap shell
	# commands in <bash>…</bash> and to emit <submit> when finished; _agent_loop
	# searches each model turn for exactly these tags.
	_AGENT_SYSTEM = (
	"You are an expert software engineer fixing a GitHub issue. "
	"You have a bash shell inside the repository checked out at the failing commit. "
	"Use <bash>command</bash> tags to run shell commands. "
	"Explore the code, implement the fix, then output <submit> when done."
	)


	def _agent_loop(model, tok, task: dict, cache_factory, max_turns: int,
	                max_new: int = 1024) -> str:
	    """Run a minimal ReAct-bash loop on one SWE-bench task.

	    Clones the repo at base_commit into a temp dir, runs the model in a
	    generate→bash→observe loop, and returns the final patch against
	    ``base_commit``. Each generate call rebuilds the full context from scratch
	    so the VQCache sees the compounding long-context pressure that the
	    project targets.

	    Args:
	        model: model exposing ``generate`` and ``device``.
	        tok: tokenizer exposing ``apply_chat_template`` and ``decode``.
	        task: row with ``repo``, ``base_commit``, ``instance_id`` and
	            ``problem_statement``.
	        cache_factory: zero-argument callable returning the cache passed as
	            ``past_key_values`` (may return ``None``).
	        max_turns: maximum number of generate calls.
	        max_new: ``max_new_tokens`` per generate call.

	    Returns:
	        The unified diff of the working tree (including files the agent
	        created) against ``base_commit``; ``""`` if clone/checkout failed.
	    """
	    import re
	    import shutil
	    import subprocess
	    import tempfile

	    repo_dir = tempfile.mkdtemp(prefix="sweagent_")
	    try:
	        subprocess.run(
	            ["git", "clone", f"https://github.com/{task['repo']}.git", repo_dir],
	            check=True, capture_output=True, timeout=120,
	        )
	        subprocess.run(
	            ["git", "checkout", task["base_commit"]],
	            check=True, capture_output=True, cwd=repo_dir, timeout=30,
	        )
	    except Exception as exc:
	        print(f" [agent] clone/checkout failed for {task['instance_id']}: {exc}")
	        shutil.rmtree(repo_dir, ignore_errors=True)
	        return ""

	    try:
	        messages = [
	            {"role": "system", "content": _AGENT_SYSTEM},
	            {"role": "user", "content": (
	                f"Repository: {task['repo']}\n\n"
	                f"Issue:\n{task['problem_statement']}"
	            )},
	        ]

	        for _ in range(max_turns):
	            prompt = tok.apply_chat_template(
	                messages, tokenize=False, add_generation_prompt=True)
	            # NOTE(review): once the transcript exceeds 32768 tokens it is
	            # truncated; which end is dropped depends on the tokenizer's
	            # truncation side -- confirm the newest turns are kept.
	            ids = tok(prompt, return_tensors="pt", truncation=True,
	                      max_length=32768).to(model.device)

	            cache = cache_factory()
	            with torch.no_grad():
	                out = model.generate(
	                    **ids, max_new_tokens=max_new, do_sample=False,
	                    past_key_values=cache, use_cache=True,
	                )
	            gen = tok.decode(out[0, ids["input_ids"].shape[1]:],
	                             skip_special_tokens=True)
	            messages.append({"role": "assistant", "content": gen})

	            if re.search(r"<submit\s*/?>", gen, re.I):
	                break

	            cmds = re.findall(r"<bash>(.*?)</bash>", gen, re.DOTALL)
	            if not cmds:
	                break  # model stopped issuing commands; take whatever diff we have

	            obs_parts = []
	            for cmd in cmds:
	                try:
	                    # SECURITY: this executes model-generated shell on the
	                    # host, confined only by cwd. Run the benchmark inside a
	                    # disposable container or VM.
	                    # errors="replace": binary output must not raise a
	                    # UnicodeDecodeError and abort the whole task.
	                    r = subprocess.run(
	                        cmd, shell=True, capture_output=True, text=True,
	                        errors="replace", timeout=30, cwd=repo_dir,
	                    )
	                    obs_parts.append(
	                        f"$ {cmd.strip()}\n{(r.stdout + r.stderr)[:2000]}")
	                except subprocess.TimeoutExpired:
	                    obs_parts.append(f"$ {cmd.strip()}\n[timeout after 30s]")
	            messages.append({"role": "user", "content": "\n\n".join(obs_parts)})

	        # Stage everything first: a plain `git diff` ignores untracked files,
	        # so fixes that add a new file would be missing from the patch.
	        subprocess.run(["git", "add", "-A"], capture_output=True, cwd=repo_dir)
	        # Diff against base_commit rather than HEAD so the patch is still
	        # complete if the agent committed its changes along the way.
	        diff = subprocess.run(
	            ["git", "diff", "--cached", task["base_commit"]],
	            capture_output=True, text=True, errors="replace", cwd=repo_dir,
	        )
	        return diff.stdout
	    finally:
	        shutil.rmtree(repo_dir, ignore_errors=True)


	def run_swebench_subset(model, tok, cache_factory_per_layer, target_layers,
	                        task_ids, max_turns=100):
	    """Run the official SWE-bench Verified harness over `task_ids`.

	    Builds per-config VQCache (or None for fp16), runs the mini-bash agent on
	    each task, writes predictions.jsonl, evaluates with the swebench harness,
	    and returns resolve rate in [0, 1]. Keep `task_ids` FIXED across configs.

	    Args:
	        model, tok: model and tokenizer handed to the agent loop.
	        cache_factory_per_layer: mapping layer index -> fitted quantizer, or
	            ``None`` for the unquantized baseline.
	        target_layers: layer indices to quantize.
	        task_ids: SWE-bench instance ids to run.
	        max_turns: per-task turn budget for the agent.

	    Returns:
	        ``resolved / len(task_ids)``; ``0.0`` when no report file is found or
	        ``task_ids`` is empty.
	    """
	    import glob
	    from datasets import load_dataset

	    # Build cache factory: VQCache for quantized configs, None for fp16.
	    if cache_factory_per_layer is not None:
	        VQCls = make_vq_cache_class(cache_factory_per_layer, target_layers,
	                                    model.config, device=model.device)

	        def cache_factory():
	            return VQCls()
	    else:
	        def cache_factory():
	            return None  # model allocates DynamicCache internally

	    # Load task metadata indexed by instance_id.
	    verified = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
	    wanted = set(task_ids)  # built once, not once per dataset row
	    tasks = {r["instance_id"]: r for r in verified if r["instance_id"] in wanted}

	    # Generate patches with the agent.
	    run_id = f"vqkv_{int(time.time())}"
	    predictions = []
	    for task_id in task_ids:
	        task = tasks.get(task_id)
	        if task is None:
	            print(f"[swebench] {task_id}: not in Verified dataset, skipping")
	            continue
	        print(f"[swebench] {task_id} ({len(predictions)+1}/{len(task_ids)})")
	        patch = _agent_loop(model, tok, task, cache_factory, max_turns=max_turns)
	        predictions.append({
	            "instance_id": task_id,
	            "model_patch": patch,
	            "model_name_or_path": "laguna-vqkv",
	        })

	    preds_path = os.path.join(ARTIFACT_DIR, f"{run_id}.jsonl")
	    with open(preds_path, "w") as f:
	        for p in predictions:
	            f.write(json.dumps(p) + "\n")
	    print(f"[swebench] wrote {len(predictions)} predictions -> {preds_path}")

	    # Run the official harness (needs Docker daemon).
	    # pip install swebench
	    from swebench.harness.run_evaluation import main as run_evaluation
	    run_evaluation(
	        dataset_name_or_path="princeton-nlp/SWE-bench_Verified",
	        split="test",
	        instance_ids=task_ids,
	        predictions_path=preds_path,
	        max_workers=4,
	        force_rebuild=False,
	        cache_level="env",
	        clean=False,
	        open_file_limit=4096,
	        run_id=run_id,
	        timeout=1800,
	    )

	    # Parse results. swebench writes a JSON summary; location varies by version.
	    # The harness names its report "<model_name>.<run_id>.json", so the
	    # pattern must allow a prefix before run_id. "*.json" still excludes the
	    # "<run_id>.jsonl" predictions file written above.
	    pattern = f"*{run_id}*.json"
	    result_files = (
	        glob.glob(os.path.join(ARTIFACT_DIR, pattern))
	        + glob.glob(pattern)  # also check cwd
	    )
	    if not result_files:
	        print(f"[swebench] WARNING: no results file found for run_id={run_id}")
	        return 0.0

	    with open(result_files[0]) as f:
	        results = json.load(f)

	    n_resolved = _count_resolved(results)
	    return n_resolved / len(task_ids) if task_ids else 0.0


	def _count_resolved(results) -> int:
	    """Count resolved instances in a parsed swebench report of any known shape."""
	    if isinstance(results, list):
	        return sum(1 for r in results if r.get("resolved", False))
	    if isinstance(results, dict):
	        # Official summary report: explicit list / count of resolved ids.
	        # Checked first because the generic fallback below would count every
	        # truthy summary field (totals, id lists, ...) as one "resolved".
	        resolved_ids = results.get("resolved_ids")
	        if isinstance(resolved_ids, list):
	            return len(resolved_ids)
	        resolved_total = results.get("resolved_instances")
	        if isinstance(resolved_total, int):
	            return int(resolved_total)
	        # some harness versions use {instance_id: {resolved: bool, ...}}
	        return sum(1 for v in results.values()
	                   if (v.get("resolved") if isinstance(v, dict) else v))
	    return 0


	def stage_swebench(n_tasks=50, seed=0):
	    """Compare resolve rates of all cache configs on a fixed SWE-bench subset.

	    Samples ``n_tasks`` instance ids from SWE-bench Verified with a seeded
	    RNG, saves them to ``task_subset.json``, runs every config from
	    cache_configs() on that same subset, prints the resolve rate and its
	    delta against the fp16 baseline, and writes ``swebench_results.json``.

	    Args:
	        n_tasks: subset size (clamped to the dataset size).
	        seed: seed of the task sampler.
	    """
	    import random
	    from datasets import load_dataset

	    # codebooks.pt holds pickled quantizer objects, so it must be loaded with
	    # weights_only=False (as the fit and cheap stages do); the tensor-only
	    # default of recent torch versions rejects it.
	    blob = torch.load(os.path.join(ARTIFACT_DIR, "codebooks.pt"), weights_only=False)
	    fitted, meta = blob["fitted"], blob["meta"]
	    model, tok, _ = load_model_and_meta()

	    verified = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
	    rng = random.Random(seed)
	    # random.sample raises when asked for more items than exist.
	    n_tasks = min(n_tasks, len(verified))
	    task_ids = [verified[i]["instance_id"]
	                for i in rng.sample(range(len(verified)), n_tasks)]
	    with open(os.path.join(ARTIFACT_DIR, "task_subset.json"), "w") as fh:
	        json.dump(task_ids, fh)
	    print(f"[swebench] fixed subset of {n_tasks} tasks (seed={seed}) saved.")

	    results = []
	    for name, _ in cache_configs():
	        if name == "fp16 (baseline)":
	            quantizers = None
	        elif name in fitted:
	            quantizers = fitted[name]
	        else:
	            # Declared but never fitted (e.g. a partial --only fit).
	            print(f"[swebench] {name}: not in codebooks.pt, skipping")
	            continue
	        try:
	            rate = run_swebench_subset(model, tok, quantizers,
	                                       meta["full_layers"], task_ids)
	        except NotImplementedError as e:
	            print(f"[swebench] {name}: STUB -- {e}")
	            rate = None
	        results.append({"config": name, "resolve_rate": rate})
	        print(f"[swebench] {name}: resolve_rate={rate}")

	    # report DELTAS vs fp16 (robust to absolute-score contamination)
	    base = next((r["resolve_rate"] for r in results
	                 if r["config"] == "fp16 (baseline)"), None)
	    print("\n[swebench] resolve rate on fixed subset (delta vs fp16):")
	    for r in results:
	        d = (None if (r["resolve_rate"] is None or base is None)
	             else round(r["resolve_rate"] - base, 4))
	        print(f" {r['config']:24s} {r['resolve_rate']} (Δ {d})")
	    with open(os.path.join(ARTIFACT_DIR, "swebench_results.json"), "w") as fh:
	        json.dump(results, fh, indent=2)


	def main():
	    """Parse the command line and run the stage selected by ``--stage``."""
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--stage", required=True,
	                        choices=["dump", "fit", "cheap", "swebench"])
	    parser.add_argument("--n_calib", type=int, default=16)
	    parser.add_argument(
	        "--calib_source", type=str, default=None,
	        choices=["swesmith", "longbench-hotpotqa"],
	        help="dump stage: calibration corpus (default: CALIB_SOURCE env or swesmith)")
	    parser.add_argument("--n_eval", type=int, default=64)
	    parser.add_argument("--n_tasks", type=int, default=50)
	    parser.add_argument(
	        "--only", type=str, default=None,
	        help="fit stage: comma-separated config names to fit/merge (e.g. "
	             "sign-1bit,ternary-bitnet). Skips refitting other configs.")
	    opts = parser.parse_args()

	    def run_fit():
	        # An absent or empty --only means "fit everything".
	        names = None
	        if opts.only:
	            names = [part.strip() for part in opts.only.split(",")]
	        stage_fit(only=names)

	    # One entry per --stage choice; argparse has already validated the key.
	    dispatch = {
	        "dump": lambda: stage_dump(n_calib=opts.n_calib,
	                                   calib_source=opts.calib_source),
	        "fit": run_fit,
	        "cheap": lambda: stage_cheap(n_eval=opts.n_eval),
	        "swebench": lambda: stage_swebench(n_tasks=opts.n_tasks),
	    }
	    dispatch[opts.stage]()


	if __name__ == "__main__":
	    main()