harshithsaiv
/

kv-cache-compression

@@ -1,171 +1,51 @@
 """
 Per-Head Mixed-Precision KV Cache
-----------------------------------
-Quantizes each attention head's K and V tensors
-to either 4-bit or 8-bit based on calibrated sensitivity.
 """
 import torch
-import triton
-import triton.language as tl
-# ─── Triton Kernels ───────────────────────────────────────────────
-@triton.jit
-def quantize_8bit_kernel(
-    x_ptr, q_ptr, scale_ptr, zp_ptr,
-    N, BLOCK: tl.constexpr,
-):
-    pid  = tl.program_id(0)
-    offs = pid * BLOCK + tl.arange(0, BLOCK)
-    mask = offs < N
-    x = tl.load(x_ptr + offs, mask=mask, other=0.0).to(tl.float32)
-    x_min = tl.min(x, axis=0)
-    x_max = tl.max(x, axis=0)
-    scale = (x_max - x_min) / 255.0
-    scale = tl.where(scale < 1e-8, 1e-8, scale)
-    zp    = x_min
-    # round by adding 0.5 then casting
-    q = ((x - zp) / scale + 0.5).to(tl.int32)
-    q = tl.where(q < 0,   0,   q)
-    q = tl.where(q > 255, 255, q)
-    tl.store(q_ptr + offs, q.to(tl.int8), mask=mask)
-    if pid == 0:
-        tl.store(scale_ptr, scale)
-        tl.store(zp_ptr,    zp)
-@triton.jit
-def dequantize_8bit_kernel(
-    q_ptr, scale_ptr, zp_ptr, out_ptr,
-    N, BLOCK: tl.constexpr,
-):
-    pid  = tl.program_id(0)
-    offs = pid * BLOCK + tl.arange(0, BLOCK)
-    mask = offs < N
-    q     = tl.load(q_ptr    + offs, mask=mask, other=0).to(tl.float32)
-    scale = tl.load(scale_ptr).to(tl.float32)
-    zp    = tl.load(zp_ptr).to(tl.float32)
-    x = q * scale + zp
-    tl.store(out_ptr + offs, x.to(tl.float16), mask=mask)
-@triton.jit
-def quantize_4bit_kernel(
-    x_ptr, q_ptr, scale_ptr, zp_ptr,
-    N, BLOCK: tl.constexpr,
-):
-    pid      = tl.program_id(0)
-    offs_out = pid * BLOCK + tl.arange(0, BLOCK)
-    offs_in  = offs_out * 2
-    mask     = offs_in + 1 < N
-    x0 = tl.load(x_ptr + offs_in,     mask=mask, other=0.0).to(tl.float32)
-    x1 = tl.load(x_ptr + offs_in + 1, mask=mask, other=0.0).to(tl.float32)
-    x_min = tl.minimum(tl.min(x0, axis=0), tl.min(x1, axis=0))
-    x_max = tl.maximum(tl.max(x0, axis=0), tl.max(x1, axis=0))
-    scale  = (x_max - x_min) / 15.0
-    scale  = tl.where(scale < 1e-8, 1e-8, scale)
-    zp     = x_min
-    q0 = ((x0 - zp) / scale + 0.5).to(tl.int32)
-    q1 = ((x1 - zp) / scale + 0.5).to(tl.int32)
-    q0 = tl.where(q0 < 0, 0, tl.where(q0 > 15, 15, q0)).to(tl.int8)
-    q1 = tl.where(q1 < 0, 0, tl.where(q1 > 15, 15, q1)).to(tl.int8)
-    packed = q0 | (q1 << 4)
-    tl.store(q_ptr + offs_out, packed, mask=mask)
-    if pid == 0:
-        tl.store(scale_ptr, scale)
-        tl.store(zp_ptr,    zp)
-@triton.jit
-def dequantize_4bit_kernel(
-    q_ptr, scale_ptr, zp_ptr, out_ptr,
-    N, BLOCK: tl.constexpr,
-):
-    pid      = tl.program_id(0)
-    offs_out = pid * BLOCK + tl.arange(0, BLOCK)
-    offs_in  = offs_out * 2
-    mask     = offs_in + 1 < N
-    packed = tl.load(q_ptr + offs_out, mask=mask, other=0).to(tl.int8)
-    scale  = tl.load(scale_ptr).to(tl.float32)
-    zp     = tl.load(zp_ptr).to(tl.float32)
-    q0 = (packed & 0x0F).to(tl.float32)
-    q1 = ((packed >> 4) & 0x0F).to(tl.float32)
-    x0 = q0 * scale + zp
-    x1 = q1 * scale + zp
-    tl.store(out_ptr + offs_in,     x0.to(tl.float16), mask=mask)
-    tl.store(out_ptr + offs_in + 1, x1.to(tl.float16), mask=mask)
-# ─── Python Wrappers ──────────────────────────────────────────────
-BLOCK_SIZE = 1024
 def quantize_head(x: torch.Tensor, bits: int):
-    x = x.contiguous().to(torch.float16)
-    N = x.numel()
-    scale = torch.zeros(1, dtype=torch.float32, device=x.device)
-    zp    = torch.zeros(1, dtype=torch.float32, device=x.device)
     if bits == 8:
-        q    = torch.empty(N, dtype=torch.int8, device=x.device)
-        grid = (triton.cdiv(N, BLOCK_SIZE),)
-        quantize_8bit_kernel[grid](
-            x.view(-1), q, scale, zp, N, BLOCK=BLOCK_SIZE
-        )
     elif bits == 4:
-        assert N % 2 == 0
-        q    = torch.empty(N // 2, dtype=torch.int8, device=x.device)
-        grid = (triton.cdiv(N // 2, BLOCK_SIZE),)
-        quantize_4bit_kernel[grid](
-            x.view(-1), q, scale, zp, N, BLOCK=BLOCK_SIZE
-        )
     else:
         raise ValueError(f"Unsupported bits: {bits}")
     return q, scale, zp
 def dequantize_head(q, scale, zp, bits, original_shape):
-    if bits == 8:
-        N   = q.numel()
-        out = torch.empty(N, dtype=torch.float16, device=q.device)
-        grid = (triton.cdiv(N, BLOCK_SIZE),)
-        dequantize_8bit_kernel[grid](q, scale, zp, out, N, BLOCK=BLOCK_SIZE)
-    elif bits == 4:
-        N   = q.numel() * 2
-        out = torch.empty(N, dtype=torch.float16, device=q.device)
-        grid = (triton.cdiv(q.numel(), BLOCK_SIZE),)
-        dequantize_4bit_kernel[grid](q, scale, zp, out, N, BLOCK=BLOCK_SIZE)
-    else:
-        raise ValueError(f"Unsupported bits: {bits}")
-    return out.view(original_shape)
-# ─── Cache Manager ────────────────────────────────────────────────
 class MixedPrecisionKVCache:
     def __init__(self, bit_alloc: list):
         self.bit_alloc = bit_alloc
         self.k_cache   = []
         self.v_cache   = []
     def store(self, k: torch.Tensor, v: torch.Tensor):
         self.k_cache = []
         self.v_cache = []
         for h in range(k.shape[1]):
@@ -178,6 +58,7 @@ class MixedPrecisionKVCache:
             self.v_cache.append((vq, vs, vz, v_head.shape, bits))
     def retrieve(self):
         ks = [dequantize_head(q,s,z,b,sh) for q,s,z,sh,b in self.k_cache]
         vs = [dequantize_head(q,s,z,b,sh) for q,s,z,sh,b in self.v_cache]
         k  = torch.stack(ks, dim=0).unsqueeze(0)
@@ -185,10 +66,15 @@ class MixedPrecisionKVCache:
         return k, v
     def memory_bytes(self):
-        return sum(q.numel() + 8 for q,s,z,sh,b in self.k_cache + self.v_cache)
-# ─── Test ─────────────────────────────────────────────────────────
 if __name__ == "__main__":
     print("Testing MixedPrecisionKVCache...")
@@ -196,24 +82,43 @@ if __name__ == "__main__":
     k = torch.randn(1, 8, 512, 128, dtype=torch.float16, device="cuda")
     v = torch.randn(1, 8, 512, 128, dtype=torch.float16, device="cuda")
-    bit_alloc = [4, 8, 4, 8, 4, 8, 4, 8]
-    cache     = MixedPrecisionKVCache(bit_alloc)
     cache.store(k, v)
     k_out, v_out = cache.retrieve()
     k_err = (k - k_out).abs().mean().item()
     v_err = (v - v_out).abs().mean().item()
-    print(f"K reconstruction error: {k_err:.6f}")
-    print(f"V reconstruction error: {v_err:.6f}")
     fp16_bytes  = k.numel() * 2 * 2
     quant_bytes = cache.memory_bytes()
     print(f"\nFP16 memory:  {fp16_bytes/1024:.1f} KB")
     print(f"Quant memory: {quant_bytes/1024:.1f} KB")
     print(f"Compression:  {fp16_bytes/quant_bytes:.2f}x")
-    assert k_err < 0.1, f"K error too high: {k_err}"
-    assert v_err < 0.1, f"V error too high: {v_err}"
-    print("\n✅ All tests passed!")
-EOF

 """
 Per-Head Mixed-Precision KV Cache
+Implemented in plain PyTorch for correctness first; Triton optimization is planned for later.
 """
 import torch
+import json
+import os
 def quantize_head(x: torch.Tensor, bits: int):
     """Quantize a tensor to unsigned integer codes using one scale/zero-point.
     The minimum and maximum are taken over *all* elements of ``x``, so the
     whole tensor shares a single affine mapping ``x ~= q * scale + zp``.
     Args:
         x: tensor to quantize (callers in this file pass one head; any shape
            is accepted). It is converted to float32 before quantizing.
         bits: code width, 4 or 8.
     Returns:
         ``(q, scale, zp)`` where ``q`` is a uint8 tensor shaped like ``x``
         holding codes in ``[0, 2**bits - 1]`` (one code per element, also for
         4-bit), and ``scale`` / ``zp`` are 0-dim float32 tensors.
     Raises:
         ValueError: if ``bits`` is not 4 or 8.
     """
     # Validate before touching the data so a bad bit width is always reported
     # as ValueError, never masked by a reduction error on the input.
     if bits not in (4, 8):
         raise ValueError(f"Unsupported bits: {bits}")
     qmax = float(2 ** bits - 1)
     x = x.float()
     if x.numel() == 0:
         # min()/max() are undefined on empty tensors; use a zero range so the
         # result is an empty code tensor with a well-defined scale.
         x_min = x_max = x.new_zeros(())
     else:
         x_min = x.min()
         x_max = x.max()
     # Floor the range so a constant tensor does not produce a zero scale.
     scale = (x_max - x_min).clamp(min=1e-8) / qmax
     zp = x_min
     q = ((x - zp) / scale).round().clamp(0, qmax).to(torch.uint8)
     return q, scale, zp
 def dequantize_head(q, scale, zp, bits, original_shape):
     """Map integer codes back to float16 values shaped as ``original_shape``.
     Computes ``q * scale + zp`` in float32, then casts to float16. ``bits`` is
     accepted for signature symmetry with ``quantize_head`` but is not used in
     the computation.
     """
     codes = q.to(torch.float32)
     restored = torch.add(torch.mul(codes, scale), zp)
     return restored.half().view(original_shape)
 class MixedPrecisionKVCache:
+    """
+    Stores quantized K and V for all heads in one layer.
+    bit_alloc: list of ints, one per head (4 or 8)
+    """
     def __init__(self, bit_alloc: list):
         self.bit_alloc = bit_alloc
         self.k_cache   = []
         self.v_cache   = []
     def store(self, k: torch.Tensor, v: torch.Tensor):
+        """k, v: [batch, num_heads, seq, head_dim]"""
         self.k_cache = []
         self.v_cache = []
         for h in range(k.shape[1]):
             self.v_cache.append((vq, vs, vz, v_head.shape, bits))
     def retrieve(self):
+        """Dequantize all heads, return [1, heads, seq, head_dim] float16."""
         ks = [dequantize_head(q,s,z,b,sh) for q,s,z,sh,b in self.k_cache]
         vs = [dequantize_head(q,s,z,b,sh) for q,s,z,sh,b in self.v_cache]
         k  = torch.stack(ks, dim=0).unsqueeze(0)
         return k, v
     def memory_bytes(self):
+        total = 0
+        for (q, s, z, sh, bits) in self.k_cache + self.v_cache:
+            if bits == 4:
+                # 4-bit: 2 values per byte
+                total += q.numel() // 2 + 8
+            else:
+                total += q.numel() + 8
+        return total
 if __name__ == "__main__":
     # Smoke test: round-trip random K/V through the cache at 8-bit, 4-bit and
     # mixed allocations, check reconstruction error, then report the
     # compression ratio of the mixed allocation.
     print("Testing MixedPrecisionKVCache...")
     # The cache is plain PyTorch, so run on CPU when no GPU is available.
     device = "cuda" if torch.cuda.is_available() else "cpu"
     # Sample in float32 and cast, so this also works where half-precision
     # random sampling is not implemented for the chosen device.
     k = torch.randn(1, 8, 512, 128, device=device).to(torch.float16)
     v = torch.randn(1, 8, 512, 128, device=device).to(torch.float16)
     def _roundtrip(bit_alloc):
         """Store and retrieve k/v; return (cache, mean |k err|, mean |v err|)."""
         cache = MixedPrecisionKVCache(bit_alloc)
         cache.store(k, v)
         k_out, v_out = cache.retrieve()
         k_err = (k - k_out).abs().mean().item()
         v_err = (v - v_out).abs().mean().item()
         print(f"K error: {k_err:.6f}  V error: {v_err:.6f}")
         return cache, k_err, v_err
     def _error_bound(x, bits):
         """Upper bound on mean abs error for round-to-nearest quantization.
         Each element is off by at most half a step; a head's step cannot
         exceed the step computed from the full tensor's range. The small
         constant absorbs the final float16 rounding.
         """
         x32 = x.float()
         span = (x32.max() - x32.min()).item()
         return 0.5 * span / (2 ** bits - 1) + 4e-3
     # Uniform allocations first: 8-bit, then 4-bit.
     for bits in (8, 4):
         print(f"\n--- {bits}-bit only ---")
         _, k_err, v_err = _roundtrip([bits] * 8)
         assert k_err < _error_bound(k, bits), f"{bits}-bit K error too high: {k_err}"
         assert v_err < _error_bound(v, bits), f"{bits}-bit V error too high: {v_err}"
         print(f"✅ {bits}-bit passed!")
     # Mixed allocation: the coarsest head (fewest bits) sets the bound.
     print("\n--- Mixed 4/8-bit ---")
     bit_alloc = [4, 8, 4, 8, 4, 8, 4, 8]
     cache, k_err, v_err = _roundtrip(bit_alloc)
     assert k_err < _error_bound(k, min(bit_alloc)), f"Mixed K error too high: {k_err}"
     assert v_err < _error_bound(v, min(bit_alloc)), f"Mixed V error too high: {v_err}"
     # K and V together, 2 bytes per float16 element.
     fp16_bytes  = k.numel() * 2 * 2
     quant_bytes = cache.memory_bytes()
     print(f"\nFP16 memory:  {fp16_bytes/1024:.1f} KB")
     print(f"Quant memory: {quant_bytes/1024:.1f} KB")
     print(f"Compression:  {fp16_bytes/quant_bytes:.2f}x")
     print("\n✅ All tests passed!")