mxguru1
/

hsaq-tools

Model card Files Files and versions

xet

Community

mxguru1 committed 8 days ago

Commit

fa1f4fa

verified ·

1 Parent(s): 55f5f5e

Add KV interception hooks + generalised allocator + smoke tests (3/3: smoke_test_v3.py)

Browse files

Files changed (1) hide show

smoke_test_v3.py +289 -0

smoke_test_v3.py ADDED Viewed

	@@ -0,0 +1,289 @@

+"""
+Smoke test for assignment_v2 + kv_intercept.
+Coverage:
+  1. Back-compat: old assign_bit_widths API still works.
+  2. Generic core: assign_greedy with arbitrary (cost, unit) pairs.
+  3. KV allocator: assign_kv_bits respects KV-cache budget at max_seq_len.
+  4. Two-budget combined: assign_combined runs both independently.
+  5. KV interception hook: forward hooks on k_proj/v_proj actually
+     modify attention output, with drift ordered by bit width.
+  6. KV interception multi-layer: kv_quant_active_multi installs and
+     tears down cleanly.
+Run: place assignment_v2.py and kv_intercept.py in the same directory,
+     then `python smoke_test_v3.py`.
+"""
+import sys
+import logging
+from collections import Counter
+from pathlib import Path
+import torch
+import torch.nn as nn
+# Make sibling modules importable regardless of where the script is run from.
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+import assignment_v2 as asgn
+import kv_intercept as kvi
+def hr(title):
+    """Print *title* as a section banner, preceded by a blank line."""
+    bar = "=" * 6
+    print(f"\n{bar} {title} {bar}")
+# ===========================================================================
+# 1. Back-compat: the original assign_bit_widths API
+# ===========================================================================
+hr("1. Back-compat: assign_bit_widths still works")
+def opts(d2, d3, d4):
+    """Return the 2-, 3- and 4-bit "hqq" layer options with drifts d2, d3, d4.
+    Each option's bytes_per_param is bits/8 scaled by 1.07 (presumably a
+    quantization-metadata overhead factor — confirm against assignment_v2).
+    """
+    drift_by_bits = ((2, d2), (3, d3), (4, d4))
+    return [
+        asgn.LayerOption(
+            bits=bits,
+            quantizer="hqq",
+            drift=drift,
+            bytes_per_param=(bits / 8) * 1.07,
+        )
+        for bits, drift in drift_by_bits
+    ]
+# Three candidates keyed by (layer index, component); the third positional
+# argument is presumably the parameter count — confirm against LayerCandidate.
+candidates = [
+    asgn.LayerCandidate(0, "attn", 100_000_000, opts(1.20, 0.40, 0.05)),
+    asgn.LayerCandidate(1, "mlp",  200_000_000, opts(0.50, 0.15, 0.08)),
+    asgn.LayerCandidate(2, "attn", 100_000_000, opts(0.12, 0.10, 0.09)),
+]
+result = asgn.assign_bit_widths(candidates, weight_budget_gb=0.15)
+print(f"  total_weights_gb: {result.total_weights_gb:.4f}")
+print(f"  total_drift:      {result.total_drift:.3f}")
+print(f"  saturated:        {result.saturated}")
+for a in result.assignments:
+    print(f"  L{a.layer_idx} {a.component:<5} -> {a.chosen.bits}-bit ({a.chosen.quantizer}), drift={a.chosen.drift:.3f}")
+# Layer 0's drift falls steeply with bit width (1.20 -> 0.05) while layer 2's is
+# nearly flat (0.12 -> 0.09), so layer 0 should get at least as many bits.
+by_layer = result.by_layer
+assert by_layer[(0, "attn")].chosen.bits >= by_layer[(2, "attn")].chosen.bits
+print("  v1 API back-compat verified ✓")
+# ===========================================================================
+# 2. Generic core
+# ===========================================================================
+hr("2. Generic assign_greedy with arbitrary cost/unit pairs")
+# Simulate something completely unlike weights: 5 identical candidates, each
+# with unit_count = 1 (so cost_per_unit IS the total cost) and three options
+# that trade higher cost for lower drift.
+gcands = [
+    asgn.GenericCandidate(
+        candidate_id=("act", i),
+        unit_count=1,
+        options=[
+            asgn.GenericOption(cost_per_unit=1.0e8, drift=1.0, label=("a",)),
+            asgn.GenericOption(cost_per_unit=2.0e8, drift=0.5, label=("b",)),
+            asgn.GenericOption(cost_per_unit=4.0e8, drift=0.1, label=("c",)),
+        ],
+    )
+    for i in range(5)
+]
+# Budget for 5 × cheapest option 'a' (0.5 GB) + room for 2 upgrades a -> b (0.1 GB each).
+gen_result = asgn.assign_greedy(gcands, budget_bytes=0.7e9)
+print(f"  total_bytes: {gen_result.total_bytes / 1e9:.3f} GB / {gen_result.budget_gb:.3f} GB budget")
+print(f"  drift:       {gen_result.total_drift:.3f}")
+print(f"  saturated:   {gen_result.saturated}")
+labels = Counter(a.chosen.label for a in gen_result.assignments)
+print(f"  label distribution: {dict(labels)}")
+assert gen_result.total_bytes <= gen_result.budget_bytes
+# ===========================================================================
+# 3. KV allocator
+# ===========================================================================
+hr("3. assign_kv_bits respects KV-cache budget at max_seq_len")
+# OLMo-like shape: 40 layers, 40 KV heads, 128 head_dim → 800 KB/token at fp16
+# (K and V, 2 bytes per element, summed over all layers).
+NUM_KV_HEADS = 40
+HEAD_DIM = 128
+NUM_LAYERS = 40
+MAX_SEQ = 4096
+def kv_opts(num_kv_heads, head_dim, group_size=64):
+    """Build the four per-layer KV options, ordered by decreasing bit width.
+    Options are (k_bits, v_bits) = (16, 16) fp16 passthrough, then (8, 8),
+    (4, 4) and (2, 4) group-quantized. Note the cheapest option keeps V at
+    4 bits; only K drops to 2.
+    Args:
+        num_kv_heads: number of KV heads in the layer.
+        head_dim: per-head dimension; num_kv_heads * head_dim elements are
+            stored for K and again for V per token.
+        group_size: elements sharing one (zero, scale) pair. Also embedded in
+            the quantizer label as ``hqq_g<group_size>``; whether downstream
+            code accepts labels other than ``hqq_g64`` is not visible here.
+    Returns:
+        List of asgn.KVOption whose bytes_per_kv_token covers K plus V for
+        one token in one layer.
+    """
+    elems = num_kv_heads * head_dim
+    # Round up so a trailing partial group still pays for its metadata.
+    groups = max(1, -(-elems // group_size))
+    hqq_overhead = groups * 2 * 2  # 2 (zero+scale) × 2 bytes per group
+    quantizer = f"hqq_g{group_size}"
+    def bpt(k_bits, v_bits, overhead):
+        """Bytes per token for K plus V, each carrying `overhead` metadata bytes."""
+        k = elems * k_bits / 8 + overhead
+        v = elems * v_bits / 8 + overhead
+        return k + v
+    return [
+        # Drift values: arbitrary but ordered. Reality would have these measured.
+        # fp16 passthrough: 2 bytes per element for K plus 2 for V, no metadata.
+        asgn.KVOption(16, 16, "fp16_passthrough", drift=0.000, bytes_per_kv_token=elems*4),
+        asgn.KVOption(8, 8,  quantizer,          drift=0.005, bytes_per_kv_token=bpt(8, 8,  hqq_overhead)),
+        asgn.KVOption(4, 4,  quantizer,          drift=0.030, bytes_per_kv_token=bpt(4, 4,  hqq_overhead)),
+        asgn.KVOption(2, 4,  quantizer,          drift=0.080, bytes_per_kv_token=bpt(2, 4,  hqq_overhead)),
+    ]
+kv_cands = [
+    asgn.KVCandidate(layer_idx=i, num_kv_heads=NUM_KV_HEADS, head_dim=HEAD_DIM,
+                     options=kv_opts(NUM_KV_HEADS, HEAD_DIM))
+    for i in range(NUM_LAYERS)
+]
+# Budget: 2.0 GB, between all-(2,4)-bit (~0.73 GB) and all-fp16 (~3.36 GB).
+# NOTE(review): those bounds assume per-layer cost = bytes_per_kv_token × max_seq_len — confirm against assignment_v2.
+kv_result = asgn.assign_kv_bits(kv_cands, kv_budget_gb=2.0, max_seq_len=MAX_SEQ)
+print(f"  Layers: {NUM_LAYERS}, max_seq_len: {MAX_SEQ}")
+print(f"  KV used: {kv_result.total_kv_gb:.3f} / {kv_result.budget_gb:.3f} GB")
+print(f"  drift:   {kv_result.total_drift:.4f}")
+print(f"  saturated: {kv_result.saturated}")
+bits_hist = Counter((a.chosen.k_bits, a.chosen.v_bits) for a in kv_result.assignments)
+print(f"  (k_bits, v_bits) distribution: {dict(bits_hist)}")
+assert kv_result.total_kv_gb <= kv_result.budget_gb
+# Only checks that at least one assignment was produced.
+assert len(bits_hist) >= 1
+# ===========================================================================
+# 4. Two-budget combined
+# ===========================================================================
+hr("4. assign_combined: weights and KV under independent budgets")
+# Reuse the weight candidates from section 1 and the KV candidates from section 3.
+combined = asgn.assign_combined(
+    weight_candidates=[c.to_generic() for c in candidates],
+    kv_candidates=[c.to_generic(MAX_SEQ) for c in kv_cands],
+    weight_budget_bytes=0.15e9,
+    kv_budget_bytes=2.0e9,
+)
+print(f"  weight total: {combined.weights.total_gb:.3f} GB / 0.15 GB budget")
+print(f"  KV total:     {combined.kv.total_gb:.3f} GB / 2.00 GB budget")
+print(f"  combined drift: {combined.total_drift:.4f}")
+# The two pools are independent: combined drift must equal weight drift plus
+# KV drift, and each pool must fit its own budget.
+assert abs(combined.total_drift - (combined.weights.total_drift + combined.kv.total_drift)) < 1e-9
+assert combined.weights.total_bytes <= 0.15e9
+assert combined.kv.total_bytes <= 2.0e9
+print("  weight and KV pools independent, both within budget ✓")
+# ===========================================================================
+# 5. KV interception hook actually modifies attention output
+# ===========================================================================
+hr("5. K/V interception hook changes attention output")
+# Build a minimal Llama-family attention module: q/k/v/o projections as
+# nn.Linear, plus a plain softmax attention computation that uses them.
+class TinyAttn(nn.Module):
+    """Mimics Llama-family self_attn surface: q_proj, k_proj, v_proj, o_proj.
+    Supports grouped-query attention: when num_kv_heads < num_heads, each KV
+    head is shared by num_heads // num_kv_heads consecutive query heads.
+    Args:
+        hidden: input/output feature size; head_dim is hidden // num_heads.
+        num_heads: number of query heads.
+        num_kv_heads: number of key/value heads; must divide num_heads.
+    Raises:
+        ValueError: if num_heads is not a positive multiple of num_kv_heads.
+    """
+    def __init__(self, hidden=128, num_heads=4, num_kv_heads=4):
+        super().__init__()
+        # Fail at construction instead of with an opaque matmul shape error
+        # on the first forward pass.
+        if num_kv_heads <= 0 or num_heads % num_kv_heads != 0:
+            raise ValueError(
+                f"num_heads ({num_heads}) must be a positive multiple of "
+                f"num_kv_heads ({num_kv_heads})"
+            )
+        self.hidden = hidden
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.head_dim = hidden // num_heads
+        self.q_proj = nn.Linear(hidden, num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(hidden, num_kv_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(hidden, num_kv_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(num_heads * self.head_dim, hidden, bias=False)
+    def forward(self, x):
+        """Run non-causal softmax attention over x of shape (batch, seq, hidden)."""
+        b, s, _ = x.shape
+        q = self.q_proj(x).view(b, s, self.num_heads, self.head_dim).transpose(1, 2)
+        k = self.k_proj(x).view(b, s, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        v = self.v_proj(x).view(b, s, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        # GQA: replicate each KV head across the query heads that share it.
+        # Skipped when the head counts match, so the plain MHA path is unchanged.
+        n_rep = self.num_heads // self.num_kv_heads
+        if n_rep > 1:
+            k = k.repeat_interleave(n_rep, dim=1)
+            v = v.repeat_interleave(n_rep, dim=1)
+        # Simple scaled-dot-product attention
+        attn = torch.softmax(q @ k.transpose(-2, -1) / (self.head_dim ** 0.5), dim=-1)
+        out = (attn @ v).transpose(1, 2).reshape(b, s, -1)
+        return self.o_proj(out)
+torch.manual_seed(0)
+attn = TinyAttn()
+attn.eval()
+x = torch.randn(1, 32, 128)
+with torch.no_grad():
+    baseline = attn(x).clone()
+# 4-bit KV
+spec_4 = kvi.KVQuantSpec(k_bits=4, v_bits=4, quantizer="hqq_g64", group_size=64)
+with kvi.kv_quant_active(attn, spec_4), torch.no_grad():
+    out_4bit = attn(x).clone()
+# 2-bit KV
+spec_2 = kvi.KVQuantSpec(k_bits=2, v_bits=2, quantizer="hqq_g64", group_size=64)
+with kvi.kv_quant_active(attn, spec_2), torch.no_grad():
+    out_2bit = attn(x).clone()
+# After the context exits, hooks should be removed — verify by re-running on
+# the same input; the output must match the baseline exactly.
+with torch.no_grad():
+    after = attn(x).clone()
+# Drift = mean squared difference from the unquantized baseline output.
+drift_4 = ((out_4bit - baseline) ** 2).mean().item()
+drift_2 = ((out_2bit - baseline) ** 2).mean().item()
+drift_after = ((after - baseline) ** 2).mean().item()
+print(f"  attention output drift at 4-bit KV: {drift_4:.6e}")
+print(f"  attention output drift at 2-bit KV: {drift_2:.6e}")
+print(f"  drift after context exit (should be 0): {drift_after:.6e}")
+assert drift_4 > 0, "4-bit hook had no effect on attention output"
+assert drift_2 > drift_4, f"2-bit drift ({drift_2}) should exceed 4-bit drift ({drift_4})"
+assert drift_after == 0.0, f"Hook leaked past context manager: drift {drift_after}"
+print("  hook activates, drift ordered, cleans up on exit ✓")
+# ===========================================================================
+# 6. Multi-layer hook installation
+# ===========================================================================
+hr("6. kv_quant_active_multi installs and tears down cleanly")
+# Build a tiny "model" with 3 attention modules
+class TinyModelShim:
+    """Bare stand-in for a HF model: exposes `.model.layers[i].self_attn`."""
+    def __init__(self, n=3):
+        # Plain attribute holders; only the TinyAttn instances are nn.Modules.
+        class Inner:
+            pass
+        class Layer:
+            pass
+        def make_layer():
+            holder = Layer()
+            holder.self_attn = TinyAttn()
+            return holder
+        # Match the discovery path: model.model.layers[i].self_attn
+        self.model = Inner()
+        self.model.layers = [make_layer() for _ in range(n)]
+m = TinyModelShim(n=3)
+attns = kvi.find_attention_modules(m)
+print(f"  Discovered {len(attns)} attention modules: layer indices {sorted(attns.keys())}")
+assert len(attns) == 3
+# Activate on layers 0 and 2 only (layer 1 gets no spec), then verify only
+# those have hooks during the context and ALL are hookless afterward.
+specs = {
+    0: kvi.KVQuantSpec(k_bits=4, v_bits=4, quantizer="hqq_g64"),
+    2: kvi.KVQuantSpec(k_bits=2, v_bits=2, quantizer="hqq_g64"),
+}
+# Helper for sampling hook counts before, during and after the context.
+def count_hooks(attn):
+    """Return the combined number of forward hooks on attn.k_proj and attn.v_proj."""
+    return sum(len(proj._forward_hooks) for proj in (attn.k_proj, attn.v_proj))
+# Sample hook counts per layer index outside and inside the context.
+before = {i: count_hooks(a) for i, a in attns.items()}
+with kvi.kv_quant_active_multi(attns, specs):
+    during = {i: count_hooks(a) for i, a in attns.items()}
+after = {i: count_hooks(a) for i, a in attns.items()}
+print(f"  hooks before:  {before}")
+print(f"  hooks during:  {during}")
+print(f"  hooks after:   {after}")
+# Layers with a spec carry two hooks in total (presumably one each on k_proj
+# and v_proj); the layer without a spec carries none.
+assert during[0] == 2 and during[1] == 0 and during[2] == 2
+assert before == after == {0: 0, 1: 0, 2: 0}
+print("  multi-layer hook lifecycle clean ✓")
+print("\nAll assertions passed.")