mxguru1
/

hsaq-tools

Model card Files Files and versions

xet

Community

mxguru1 committed 14 days ago

Commit

48ec8ad

verified ·

1 Parent(s): 01426da

Add smoke_test_v4.py

Browse files

Files changed (1) hide show

smoke_test_v4.py +334 -0

smoke_test_v4.py ADDED Viewed

	@@ -0,0 +1,334 @@

+"""
+Smoke test for kv_profiler.
+Coverage:
+  1. SweepConfig and DEFAULT_SWEEP shape checks.
+  2. kv_bytes_per_token accounting — passthrough vs hqq_g64 sanity.
+  3. compute_drift returns zero for identical tensors, nonzero for different.
+  4. compute_calibration_hash is deterministic and distinguishes content.
+  5. End-to-end profile_kv_sensitivity() on a tiny synthetic Llama-family model:
+     - Produces 11 × n_layers rows
+     - Drift is data-dependent (different per layer, non-zero, ordered)
+     - fp16_passthrough rows would have drift ~0 (not asserted: fp16_passthrough is not in DEFAULT_SWEEP)
+     - 2-bit configs have higher drift than 8-bit configs
+  6. rows_to_kv_candidates → assign_kv_bits round-trip.
+"""
+import sys
+import logging
+from collections import Counter
+from pathlib import Path
+from types import SimpleNamespace
+import torch
+import torch.nn as nn
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+import kv_intercept as kvi  # noqa
+import kv_profiler as kvp
+import assignment_v2 as asgn
+def hr(title):
+    """Print a section banner: a blank line, then ``title`` between two runs of six ``=``."""
+    rule = "=" * 6
+    print(f"\n{rule} {title} {rule}")
+# Only warnings and above, so library logging stays out of the test output.
+logging.basicConfig(level=logging.WARNING)
+# ===========================================================================
+# 1. Sweep shape
+# ===========================================================================
+hr("1. DEFAULT_SWEEP shape")
+sweep = kvp.DEFAULT_SWEEP
+n_configs = len(sweep)
+print(f"  total configs: {n_configs}")
+assert n_configs == 11, "Expected 11-config curated sweep"
+# Tally configs by quantizer name and compare with the expected split.
+per_quantizer = Counter(cfg.quantizer for cfg in sweep)
+print(f"  by quantizer: {dict(per_quantizer)}")
+expected_split = (("hqq_g64", 8), ("scaled_uniform", 2), ("scaled_per_head", 1))
+for quantizer_name, expected_count in expected_split:
+    assert per_quantizer[quantizer_name] == expected_count
+# The sweep must contain configs that spend fewer bits on K than on V.
+n_k_below_v = sum(1 for cfg in sweep if cfg.k_bits < cfg.v_bits)
+print(f"  K<V configs: {n_k_below_v}")
+assert n_k_below_v == 4, "Expected 4 K-cheaper-than-V configs"
+print("  ✓")
+# ===========================================================================
+# 2. kv_bytes_per_token accounting
+# ===========================================================================
+hr("2. kv_bytes_per_token accounting")
+n_heads, head_dim = 8, 128
+# Unquantised baseline: 8 heads × 128 dim × 2 bytes, once for K and once for V.
+fp16_bytes = kvp.kv_bytes_per_token(n_heads, head_dim, 16, 16, "fp16_passthrough")
+print(f"  fp16_passthrough (8h × 128d): {fp16_bytes} bytes")
+assert fp16_bytes == n_heads * head_dim * 2 * 2
+# hqq_g64 at 4/4 bits. Expected total, per the accounting this test pins:
+#   payload  = 8 heads × 128 dim × 4 bits / 8 = 512 bytes, for K and again for V
+#   overhead = 8 heads × (128 / 64 groups) × 4 bytes = 64 bytes, for K and again for V
+q44_bytes = kvp.kv_bytes_per_token(n_heads, head_dim, 4, 4, "hqq_g64")
+print(f"  hqq_g64 4/4:               {q44_bytes} bytes")
+expected_payload = 2 * 512
+expected_overhead = 2 * 64
+assert q44_bytes == expected_payload + expected_overhead
+# Dropping K to 2 bits must cost less than keeping both K and V at 4 bits.
+q24_bytes = kvp.kv_bytes_per_token(n_heads, head_dim, 2, 4, "hqq_g64")
+print(f"  hqq_g64 2/4:               {q24_bytes} bytes")
+assert q24_bytes < q44_bytes
+print("  ✓")
+# ===========================================================================
+# 3. compute_drift
+# ===========================================================================
+hr("3. compute_drift")
+reference = torch.randn(2, 4, 8)
+# A tensor compared against itself must report exactly zero drift.
+self_drift = kvp.compute_drift(reference, reference, "mse_normalised")
+print(f"  identical tensors, mse_normalised: {self_drift:.6f}")
+assert self_drift == 0.0
+# Adding noise at 0.1 scale must register as strictly positive drift.
+noisy = reference + 0.1 * torch.randn_like(reference)
+noisy_drift = kvp.compute_drift(noisy, reference, "mse_normalised")
+print(f"  perturbed by 0.1×noise:            {noisy_drift:.6f}")
+assert noisy_drift > 0
+print("  ✓")
+# ===========================================================================
+# 4. compute_calibration_hash determinism
+# ===========================================================================
+hr("4. compute_calibration_hash")
+baseline_texts = ["hello world", "the quick brown fox"]
+same_texts = list(baseline_texts)  # equal content, distinct list object
+changed_texts = ["hello world", "different text"]
+h1, h2, h3 = (
+    kvp.compute_calibration_hash(texts, 512)
+    for texts in (baseline_texts, same_texts, changed_texts)
+)
+print(f"  same content:      h1={h1}  h2={h2}")
+print(f"  different content: h3={h3}")
+assert h1 == h2, "Identical inputs should hash the same"
+assert h1 != h3, "Different inputs should hash differently"
+print("  ✓")
+# ===========================================================================
+# 5. End-to-end profiling on a synthetic Llama-family model
+# ===========================================================================
+hr("5. profile_kv_sensitivity end-to-end")
+class TinyAttn(nn.Module):
+    """Minimal self-attention block shaped like a Llama-family ``self_attn``.
+    Exposes ``q_proj``, ``k_proj``, ``v_proj`` and ``o_proj`` as bias-free
+    ``nn.Linear`` layers. Attention is softmax(QK^T / sqrt(head_dim)) with no
+    mask. Grouped-query attention is supported: when ``num_kv_heads`` is
+    smaller than ``num_heads``, each K/V head serves
+    ``num_heads // num_kv_heads`` consecutive query heads.
+    Args:
+        hidden: Width of the input and output features.
+        num_heads: Number of query heads; ``head_dim = hidden // num_heads``.
+        num_kv_heads: Number of key/value heads; must divide ``num_heads``.
+    Raises:
+        ValueError: If ``num_kv_heads`` is < 1 or does not divide ``num_heads``.
+    """
+    def __init__(self, hidden=128, num_heads=4, num_kv_heads=4):
+        super().__init__()
+        # Such head counts could never be matched up in forward(); fail early
+        # with a clear message instead of a shape error deep in the matmul.
+        if num_kv_heads < 1 or num_heads % num_kv_heads != 0:
+            raise ValueError(
+                f"num_heads ({num_heads}) must be a positive multiple of "
+                f"num_kv_heads ({num_kv_heads})"
+            )
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.head_dim = hidden // num_heads
+        self.q_proj = nn.Linear(hidden, num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(hidden, num_kv_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(hidden, num_kv_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(num_heads * self.head_dim, hidden, bias=False)
+    def forward(self, x):
+        """Apply unmasked attention to ``x`` of shape ``(batch, seq, hidden)``.
+        Returns a tensor of the same ``(batch, seq, hidden)`` shape.
+        """
+        b, s, _ = x.shape
+        # Project, then reshape to (batch, heads, seq, head_dim).
+        q = self.q_proj(x).view(b, s, self.num_heads, self.head_dim).transpose(1, 2)
+        k = self.k_proj(x).view(b, s, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        v = self.v_proj(x).view(b, s, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        # Expand K/V so every query head has a matching K/V head. Skipped when
+        # the head counts already agree, leaving that path untouched.
+        group = self.num_heads // self.num_kv_heads
+        if group > 1:
+            k = k.repeat_interleave(group, dim=1)
+            v = v.repeat_interleave(group, dim=1)
+        attn = torch.softmax(q @ k.transpose(-2, -1) / (self.head_dim ** 0.5), dim=-1)
+        out = (attn @ v).transpose(1, 2).reshape(b, s, -1)
+        return self.o_proj(out)
+class TinyModel(nn.Module):
+    """Stand-in with a Hugging Face-like layout.
+    Attention blocks live at ``model.model.layers[i].self_attn``; ``config``
+    carries ``num_attention_heads``, ``num_key_value_heads`` and
+    ``hidden_size``; ``forward`` takes ``input_ids``. The number of K/V heads
+    always equals ``num_heads``.
+    """
+    def __init__(self, n_layers=3, hidden=128, num_heads=4, vocab=64):
+        super().__init__()
+        # The embedding is built first and the layers in index order, so the
+        # sequence of random weight initialisations is fixed.
+        self.embed = nn.Embedding(vocab, hidden)
+        class Inner(nn.Module):
+            """Empty container whose only job is to hold ``layers``."""
+        def make_layer():
+            block = nn.Module()
+            block.self_attn = TinyAttn(hidden=hidden, num_heads=num_heads,
+                                       num_kv_heads=num_heads)
+            return block
+        self.model = Inner()
+        self.model.layers = nn.ModuleList([make_layer() for _ in range(n_layers)])
+        self.config = SimpleNamespace(
+            num_attention_heads=num_heads,
+            num_key_value_heads=num_heads,
+            hidden_size=hidden,
+        )
+    @property
+    def device(self):
+        """Device of the model's first parameter."""
+        first_param = next(self.parameters())
+        return first_param.device
+    def forward(self, input_ids=None, attention_mask=None, use_cache=False, **kw):
+        """Embed ``input_ids`` and add each layer's attention output residually.
+        ``attention_mask``, ``use_cache`` and extra keywords are accepted and ignored.
+        """
+        hidden_states = self.embed(input_ids)
+        for block in self.model.layers:
+            hidden_states = hidden_states + block.self_attn(hidden_states)
+        return hidden_states
+class TinyTokenizer:
+    """Just enough tokenizer surface for the profiler.
+    Token ids are random rather than derived from the text: each text yields
+    ``min(len(text), max_length or 32)`` ids drawn from ``[0, vocab)``. Every
+    call draws from a private generator seeded with 0, so output repeats from
+    call to call without reseeding torch's global RNG.
+    """
+    def __init__(self, vocab=64):
+        self.vocab = vocab
+    def __call__(self, texts, return_tensors=None, padding=None,
+                 truncation=None, max_length=None):
+        """Build a right-padded batch of random token ids for ``texts``.
+        Args:
+            texts: Iterable of strings; only their lengths are used.
+            return_tensors, padding, truncation: Accepted and ignored.
+            max_length: Per-text length cap; ``None`` or ``0`` means 32.
+        Returns:
+            A ``SimpleNamespace`` with ``input_ids`` and ``attention_mask``
+            (``torch.long``, shape ``(n_texts, longest)``; the mask is 1 on
+            real tokens and 0 on padding) plus ``to(device)``, which returns a
+            new namespace holding both tensors moved to ``device``. An empty
+            ``texts`` yields ``(0, 0)`` tensors.
+        """
+        # Private generator: deterministic ids for test stability, but later
+        # random ops elsewhere in the process are not affected.
+        rng = torch.Generator().manual_seed(0)
+        cap = max_length or 32
+        ids = [torch.randint(0, self.vocab, (min(len(t), cap),), generator=rng)
+               for t in texts]
+        # default=0 keeps an empty batch from raising in max().
+        max_len = max((t.shape[0] for t in ids), default=0)
+        padded = torch.zeros(len(ids), max_len, dtype=torch.long)
+        mask = torch.zeros(len(ids), max_len, dtype=torch.long)
+        for i, t in enumerate(ids):
+            padded[i, :t.shape[0]] = t
+            mask[i, :t.shape[0]] = 1
+        return SimpleNamespace(
+            input_ids=padded,
+            attention_mask=mask,
+            to=lambda device: SimpleNamespace(input_ids=padded.to(device),
+                                              attention_mask=mask.to(device)),
+        )
+# Seed right before construction so the model's random weights are reproducible.
+torch.manual_seed(42)
+model = TinyModel(n_layers=3, hidden=128, num_heads=4).eval()
+tokenizer = TinyTokenizer()
+# Wrap the tokenizer output so .to() returns a kwargs-compatible dict
+class TokenizerWrapper:
+    """Adapt a tokenizer so that ``batch.to(device)`` can be unpacked with ``**``.
+    The wrapped tokenizer's result must expose ``input_ids`` and
+    ``attention_mask`` attributes.
+    """
+    def __init__(self, tk):
+        self.tk = tk
+    def __call__(self, texts, **kw):
+        """Tokenize ``texts`` and return a namespace with ``input_ids``, ``attention_mask`` and ``to``.
+        ``to(device)`` returns an object offering the moved tensors both as
+        attributes and through ``keys()`` / ``__getitem__``.
+        """
+        raw = self.tk(texts, **kw)
+        tensors = {"input_ids": raw.input_ids, "attention_mask": raw.attention_mask}
+        class B:
+            # Attribute access plus the keys()/__getitem__ pair that ``**`` needs.
+            def __init__(self, d):
+                self.__dict__.update(d)
+                self._d = d
+            def keys(self):
+                return self._d.keys()
+            def __getitem__(self, k):
+                return self._d[k]
+        def to(device):
+            return B({name: tensor.to(device) for name, tensor in tensors.items()})
+        return SimpleNamespace(**tensors, to=to)
+wrapped_tok = TokenizerWrapper(tokenizer)
+# Four distinct sentences repeated four times: 16 calibration samples.
+base_sentences = [
+    "the quick brown fox jumps over the lazy dog",
+    "machine learning models compress activations",
+    "key value caches grow with context length",
+    "attention is all you need",
+]
+calibration_texts = base_sentences * 4
+profile_kwargs = dict(
+    model=model,
+    tokenizer=wrapped_tok,
+    calibration_texts=calibration_texts,
+    model_hash="testmodel" + "0" * 8,
+    profiled_by_agent_id="smoke-test",
+    profiled_by_agent_tier=0,
+    max_seq_len=32,
+    drift_metric="mse_normalised",
+    progress_cb=lambda m: None,  # swallow progress messages
+)
+rows = kvp.profile_kv_sensitivity(**profile_kwargs)
+print(f"  emitted rows: {len(rows)}")
+# 11 configs × 3 layers = 33
+assert len(rows) == 33, f"Expected 33 rows, got {len(rows)}"
+# fp16_passthrough is not in the default sweep, so there is no ~0-drift row to
+# check; instead verify that drift grows as the bit width shrinks.
+# Group attention-output drift by (k_bits, v_bits, quantizer).
+by_config = {}
+for r in rows:
+    key = (r.k_bits, r.v_bits, r.quantizer)
+    by_config.setdefault(key, []).append(r.drift_attn_output)
+# Mean drift of each config over all of its rows.
+avg_drift = {k: sum(v) / len(v) for k, v in by_config.items()}
+for kb, vb in ((8, 8), (4, 4), (3, 3), (2, 2), (2, 4)):
+    print(f"  avg drift ({kb},{vb}) hqq_g64:   {avg_drift[(kb, vb, 'hqq_g64')]:.4e}")
+# Sanity: more bits = less drift along the symmetric chain 8 → 4 → 3 → 2.
+assert avg_drift[(8, 8, "hqq_g64")] < avg_drift[(4, 4, "hqq_g64")]
+assert avg_drift[(4, 4, "hqq_g64")] < avg_drift[(3, 3, "hqq_g64")]
+assert avg_drift[(3, 3, "hqq_g64")] < avg_drift[(2, 2, "hqq_g64")]
+print("  bit ordering 8<4<3<2 verified across symmetric configs ✓")
+# With K held at 2 bits, raising V from 2 to 4 bits must lower the drift:
+# V precision still matters when K is quantised aggressively.
+assert avg_drift[(2, 4, "hqq_g64")] < avg_drift[(2, 2, "hqq_g64")]
+print("  (2,4) < (2,2) — V-precision helps even when K is aggressive ✓")
+# Drift must differ between layers; the same value everywhere would indicate
+# a stuck hook.
+sample_config = (4, 4, "hqq_g64")
+layer_drifts = sorted(by_config[sample_config])
+print(f"  (4,4) drifts per layer: {[f'{d:.4e}' for d in layer_drifts]}")
+unique_drifts = len({round(d, 10) for d in layer_drifts})
+print(f"  per-layer drift variation: {unique_drifts} distinct values")
+# Must be > 1: ">= 1" holds for any non-empty list and so checks nothing.
+assert unique_drifts > 1, (
+    f"Expected per-layer drift to vary, got {unique_drifts} distinct value(s)"
+)
+# ===========================================================================
+# 6. Bridge to allocator
+# ===========================================================================
+hr("6. rows_to_kv_candidates → assign_kv_bits round-trip")
+candidates = kvp.rows_to_kv_candidates(rows)
+print(f"  built {len(candidates)} KVCandidates (expected 3 = n_layers)")
+assert len(candidates) == 3
+# Every candidate must carry all 11 sweep options and the tiny model's geometry.
+for cand in candidates:
+    n_options = len(cand.options)
+    assert n_options == 11, f"Layer {cand.layer_idx}: expected 11 options, got {n_options}"
+    assert (cand.num_kv_heads, cand.head_dim) == (4, 32)
+# Pick a budget halfway between two anchors so the allocator cannot simply
+# give every layer the most expensive config.
+#   low anchor:  (2,4) hqq_g64 on every layer
+#   high anchor: (8,8) hqq_g64 on every layer
+# NOTE(review): the low anchor is printed as "cheapest", yet the sweep also
+# contains (2,2) hqq_g64 — confirm which config is really the cheapest.
+seq_len, n_layers = 32, 3
+low_bpt = kvp.kv_bytes_per_token(4, 32, 2, 4, "hqq_g64")
+high_bpt = kvp.kv_bytes_per_token(4, 32, 8, 8, "hqq_g64")
+all_cheap_bytes = low_bpt * seq_len * n_layers
+all_expensive_bytes = high_bpt * seq_len * n_layers
+budget_bytes = (all_cheap_bytes + all_expensive_bytes) / 2
+print(f"  cheapest config: {all_cheap_bytes:.0f} bytes total")
+print(f"  priciest config: {all_expensive_bytes:.0f} bytes total")
+print(f"  budget chosen:   {budget_bytes:.0f} bytes (midpoint)")
+# The allocator takes its budget in GB; convert from bytes on the way in and
+# back to bytes for reporting.
+result = asgn.assign_kv_bits(
+    candidates,
+    kv_budget_gb=budget_bytes / 1e9,
+    max_seq_len=seq_len,
+)
+used_bytes = result.total_kv_gb * 1e9
+print(f"  KV used: {used_bytes:.0f} bytes / {budget_bytes:.0f} budget")
+print(f"  saturated: {result.saturated}")
+chosen_dist = Counter(
+    (item.chosen.k_bits, item.chosen.v_bits, item.chosen.quantizer)
+    for item in result.assignments
+)
+print(f"  chosen configs: {dict(chosen_dist)}")
+assert used_bytes <= budget_bytes
+print("  allocator consumed profiler output cleanly ✓")
+print("\nAll assertions passed.")