harshithsaiv
/

kv-cache-compression

@@ -1,51 +1,40 @@
 """
 Per-Head Mixed-Precision KV Cache
 ----------------------------------
 Quantizes each attention head's K and V tensors
 to either 4-bit or 8-bit based on calibrated sensitivity.
-Layout per head:
-  - quantized data  (int8 tensor, packed for 4-bit)
-  - scale           (float16 scalar)
-  - zero_point      (float16 scalar)
 """
 import torch
 import triton
 import triton.language as tl
-import json
-import os
 # ─── Triton Kernels ───────────────────────────────────────────────
 @triton.jit
 def quantize_8bit_kernel(
-    x_ptr,       # input  [seq, head_dim]
-    q_ptr,       # output [seq, head_dim] int8
-    scale_ptr,   # output scalar float32
-    zp_ptr,      # output scalar float32
-    N,           # total elements = seq * head_dim
-    BLOCK: tl.constexpr,
 ):
-    pid = tl.program_id(0)
     offs = pid * BLOCK + tl.arange(0, BLOCK)
     mask = offs < N
     x = tl.load(x_ptr + offs, mask=mask, other=0.0).to(tl.float32)
-    # compute scale and zero point from min/max
     x_min = tl.min(x, axis=0)
     x_max = tl.max(x, axis=0)
     scale = (x_max - x_min) / 255.0
-    scale = tl.maximum(scale, 1e-8)
     zp    = x_min
-    # quantize
-    q = tl.extra.libdevice.round((x - zp) / scale)
-    q = tl.minimum(tl.maximum(q, 0.0), 255.0)
-    tl.store(q_ptr  + offs, q.to(tl.int8), mask=mask)
-    # only first thread writes scale/zp
     if pid == 0:
         tl.store(scale_ptr, scale)
         tl.store(zp_ptr,    zp)
@@ -53,18 +42,14 @@ def quantize_8bit_kernel(
 @triton.jit
 def dequantize_8bit_kernel(
-    q_ptr,       # input  [seq, head_dim] int8
-    scale_ptr,   # input  scalar
-    zp_ptr,      # input  scalar
-    out_ptr,     # output [seq, head_dim] float16
-    N,
-    BLOCK: tl.constexpr,
 ):
     pid  = tl.program_id(0)
     offs = pid * BLOCK + tl.arange(0, BLOCK)
     mask = offs < N
-    q     = tl.load(q_ptr   + offs, mask=mask, other=0).to(tl.float32)
     scale = tl.load(scale_ptr).to(tl.float32)
     zp    = tl.load(zp_ptr).to(tl.float32)
@@ -74,15 +59,10 @@ def dequantize_8bit_kernel(
 @triton.jit
 def quantize_4bit_kernel(
-    x_ptr,
-    q_ptr,       # output [seq, head_dim] int8 (2 values packed per byte)
-    scale_ptr,
-    zp_ptr,
-    N,           # total elements (must be even)
-    BLOCK: tl.constexpr,
 ):
-    pid  = tl.program_id(0)
-    # each thread block handles BLOCK output bytes = BLOCK*2 input elements
     offs_out = pid * BLOCK + tl.arange(0, BLOCK)
     offs_in  = offs_out * 2
     mask     = offs_in + 1 < N
@@ -90,22 +70,19 @@ def quantize_4bit_kernel(
     x0 = tl.load(x_ptr + offs_in,     mask=mask, other=0.0).to(tl.float32)
     x1 = tl.load(x_ptr + offs_in + 1, mask=mask, other=0.0).to(tl.float32)
-    # share scale across both elements
     x_min = tl.minimum(tl.min(x0, axis=0), tl.min(x1, axis=0))
     x_max = tl.maximum(tl.max(x0, axis=0), tl.max(x1, axis=0))
     scale  = (x_max - x_min) / 15.0
-    scale  = tl.maximum(scale, 1e-8)
     zp     = x_min
-    q0 = tl.extra.libdevice.round((x0 - zp) / scale)
-    q1 = tl.extra.libdevice.round((x1 - zp) / scale)
-    q0 = tl.minimum(tl.maximum(q0, 0.0), 15.0).to(tl.int8)
-    q1 = tl.minimum(tl.maximum(q1, 0.0), 15.0).to(tl.int8)
-    # pack two 4-bit values into one int8 byte
     packed = q0 | (q1 << 4)
     tl.store(q_ptr + offs_out, packed, mask=mask)
     if pid == 0:
         tl.store(scale_ptr, scale)
         tl.store(zp_ptr,    zp)
@@ -113,12 +90,8 @@ def quantize_4bit_kernel(
 @triton.jit
 def dequantize_4bit_kernel(
-    q_ptr,
-    scale_ptr,
-    zp_ptr,
-    out_ptr,
-    N,
-    BLOCK: tl.constexpr,
 ):
     pid      = tl.program_id(0)
     offs_out = pid * BLOCK + tl.arange(0, BLOCK)
@@ -129,7 +102,6 @@ def dequantize_4bit_kernel(
     scale  = tl.load(scale_ptr).to(tl.float32)
     zp     = tl.load(zp_ptr).to(tl.float32)
-    # unpack
     q0 = (packed & 0x0F).to(tl.float32)
     q1 = ((packed >> 4) & 0x0F).to(tl.float32)
@@ -145,26 +117,20 @@ def dequantize_4bit_kernel(
 BLOCK_SIZE = 1024
 def quantize_head(x: torch.Tensor, bits: int):
-    """
-    Quantize a single head tensor using Triton kernel.
-    x: [seq_len, head_dim] float16
-    returns: (q, scale, zp)
-    """
-    x = x.contiguous()
     N = x.numel()
     scale = torch.zeros(1, dtype=torch.float32, device=x.device)
     zp    = torch.zeros(1, dtype=torch.float32, device=x.device)
     if bits == 8:
-        q = torch.empty(N, dtype=torch.int8, device=x.device)
         grid = (triton.cdiv(N, BLOCK_SIZE),)
         quantize_8bit_kernel[grid](
             x.view(-1), q, scale, zp, N, BLOCK=BLOCK_SIZE
         )
     elif bits == 4:
-        assert N % 2 == 0, "head_dim must be even for 4-bit packing"
-        q = torch.empty(N // 2, dtype=torch.int8, device=x.device)
         grid = (triton.cdiv(N // 2, BLOCK_SIZE),)
         quantize_4bit_kernel[grid](
             x.view(-1), q, scale, zp, N, BLOCK=BLOCK_SIZE
@@ -175,20 +141,14 @@ def quantize_head(x: torch.Tensor, bits: int):
     return q, scale, zp
-def dequantize_head(q: torch.Tensor, scale: torch.Tensor,
-                    zp: torch.Tensor, bits: int,
-                    original_shape: tuple) -> torch.Tensor:
-    """
-    Dequantize back to float16.
-    Returns tensor of original_shape in float16.
-    """
     if bits == 8:
-        N = q.numel()
         out = torch.empty(N, dtype=torch.float16, device=q.device)
         grid = (triton.cdiv(N, BLOCK_SIZE),)
         dequantize_8bit_kernel[grid](q, scale, zp, out, N, BLOCK=BLOCK_SIZE)
     elif bits == 4:
-        N = q.numel() * 2
         out = torch.empty(N, dtype=torch.float16, device=q.device)
         grid = (triton.cdiv(q.numel(), BLOCK_SIZE),)
         dequantize_4bit_kernel[grid](q, scale, zp, out, N, BLOCK=BLOCK_SIZE)
@@ -198,96 +158,62 @@ def dequantize_head(q: torch.Tensor, scale: torch.Tensor,
     return out.view(original_shape)
-# ─── Per-Layer Cache Manager ──────────────────────────────────────
 class MixedPrecisionKVCache:
-    """
-    Stores quantized K and V for all heads in one layer.
-    bit_alloc: list of ints, one per head (4 or 8)
-    """
     def __init__(self, bit_alloc: list):
-        self.bit_alloc = bit_alloc   # [num_heads]
-        self.k_cache   = []          # list of (q, scale, zp, shape)
         self.v_cache   = []
     def store(self, k: torch.Tensor, v: torch.Tensor):
-        """
-        k, v: [batch, num_heads, seq, head_dim]
-        Quantizes each head independently.
-        """
         self.k_cache = []
         self.v_cache = []
-        num_heads = k.shape[1]
-        for h in range(num_heads):
             bits   = self.bit_alloc[h]
-            k_head = k[0, h]   # [seq, head_dim]
             v_head = v[0, h]
             kq, ks, kz = quantize_head(k_head, bits)
             vq, vs, vz = quantize_head(v_head, bits)
             self.k_cache.append((kq, ks, kz, k_head.shape, bits))
             self.v_cache.append((vq, vs, vz, v_head.shape, bits))
-    def retrieve(self) -> tuple:
-        """
-        Dequantize all heads and reconstruct full K, V tensors.
-        Returns k, v: [1, num_heads, seq, head_dim] float16
-        """
-        ks, vs = [], []
-        for (kq, ksc, kzp, ksh, kb) in self.k_cache:
-            ks.append(dequantize_head(kq, ksc, kzp, kb, ksh))
-        for (vq, vsc, vzp, vsh, vb) in self.v_cache:
-            vs.append(dequantize_head(vq, vsc, vzp, vb, vsh))
-        k = torch.stack(ks, dim=0).unsqueeze(0)  # [1, heads, seq, head_dim]
-        v = torch.stack(vs, dim=0).unsqueeze(0)
         return k, v
-    def memory_bytes(self) -> int:
-        """Estimate memory used by quantized cache."""
-        total = 0
-        for (q, s, z, shape, bits) in self.k_cache + self.v_cache:
-            total += q.numel() + 2 * 4  # data + scale + zp
-        return total
-# ─── Quick Correctness Test ───────────────────────────────────────
 if __name__ == "__main__":
     print("Testing MixedPrecisionKVCache...")
-    # simulate one layer: batch=1, heads=8, seq=512, head_dim=128
     torch.manual_seed(42)
     k = torch.randn(1, 8, 512, 128, dtype=torch.float16, device="cuda")
     v = torch.randn(1, 8, 512, 128, dtype=torch.float16, device="cuda")
-    # mixed allocation: alternating 4 and 8 bit
     bit_alloc = [4, 8, 4, 8, 4, 8, 4, 8]
-    cache = MixedPrecisionKVCache(bit_alloc)
-    # store
     cache.store(k, v)
-    # retrieve
     k_out, v_out = cache.retrieve()
-    # correctness
     k_err = (k - k_out).abs().mean().item()
     v_err = (v - v_out).abs().mean().item()
     print(f"K reconstruction error: {k_err:.6f}")
     print(f"V reconstruction error: {v_err:.6f}")
-    # memory savings
-    fp16_bytes  = k.numel() * 2 * 2   # k + v, 2 bytes each
     quant_bytes = cache.memory_bytes()
     print(f"\nFP16 memory:  {fp16_bytes/1024:.1f} KB")
     print(f"Quant memory: {quant_bytes/1024:.1f} KB")
     print(f"Compression:  {fp16_bytes/quant_bytes:.2f}x")
-    # check errors are reasonable
     assert k_err < 0.1, f"K error too high: {k_err}"
     assert v_err < 0.1, f"V error too high: {v_err}"
     print("\n✅ All tests passed!")

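+# NOTE(review): stray shell heredoc header left in the module; it is not Python and is kept only as a comment: cat > ~/kv-hack/kernel/quant_cache.py << 'EOF'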
 """
 Per-Head Mixed-Precision KV Cache
 ----------------------------------
 Quantizes each attention head's K and V tensors
 to either 4-bit or 8-bit based on calibrated sensitivity.
 """
 import torch
 import triton
 import triton.language as tl
 # ─── Triton Kernels ───────────────────────────────────────────────
 @triton.jit
 def quantize_8bit_kernel(
     x_ptr, q_ptr, scale_ptr, zp_ptr,
     N, BLOCK: tl.constexpr,
 ):
     """Asymmetric 8-bit quantization of a flat tensor with ONE shared scale.

     Arguments:
         x_ptr:     input values, N contiguous elements (cast to float32 here).
         q_ptr:     output codes, N int8 elements.
         scale_ptr: single-element output, the quantization step.
         zp_ptr:    single-element output, the additive offset.
         N:         number of input elements.
         BLOCK:     elements quantized by each program (compile-time constant).

     Fixes relative to the previous version:
       * min/max were reduced over each program's own BLOCK slice while only
         program 0 published scale/zp, so every other slice was encoded with
         parameters that were never stored.  Each program now reduces over the
         whole tensor, so all programs agree with the published values.
       * masked-out lanes were loaded as 0.0 and took part in the min/max.
       * codes 128..255 do not fit a signed int8 and wrapped to negatives.
         Codes are now stored shifted by -128 and the published offset is
         shifted by +128*scale to compensate.
         NOTE(review): this assumes the dequantizer computes
         ``q * scale + zp`` on the signed int8 value -- confirm against
         dequantize_8bit_kernel, whose arithmetic is not visible here.
     """
     pid = tl.program_id(0)
     lane = tl.arange(0, BLOCK)

     # Pass 1: global min/max.  Every program walks the full tensor; this is
     # redundant work (num_programs * N loads) but keeps the launch signature
     # unchanged.  A host-side reduction would be cheaper if the interface
     # is ever allowed to change.
     lo = tl.zeros([BLOCK], dtype=tl.float32) + float("inf")
     hi = tl.zeros([BLOCK], dtype=tl.float32) - float("inf")
     for start in range(0, N, BLOCK):
         idx = start + lane
         live = idx < N
         chunk = tl.load(x_ptr + idx, mask=live, other=0.0).to(tl.float32)
         # Neutral elements keep out-of-range lanes from affecting the result.
         lo = tl.minimum(lo, tl.where(live, chunk, float("inf")))
         hi = tl.maximum(hi, tl.where(live, chunk, -float("inf")))
     x_min = tl.min(lo, axis=0)
     x_max = tl.max(hi, axis=0)

     scale = (x_max - x_min) / 255.0
     # Guard against a constant input (zero range).
     scale = tl.where(scale < 1e-8, 1e-8, scale)

     # Pass 2: quantize this program's slice with the shared parameters.
     offs = pid * BLOCK + lane
     mask = offs < N
     x = tl.load(x_ptr + offs, mask=mask, other=0.0).to(tl.float32)
     # Values are non-negative here, so +0.5 followed by truncation rounds
     # to nearest.
     q = ((x - x_min) / scale + 0.5).to(tl.int32)
     q = tl.where(q < 0, 0, q)
     q = tl.where(q > 255, 255, q)
     # Shift [0, 255] -> [-128, 127] so the code survives the int8 store.
     tl.store(q_ptr + offs, (q - 128).to(tl.int8), mask=mask)

     # All programs computed identical parameters; one writer is enough.
     if pid == 0:
         tl.store(scale_ptr, scale)
         tl.store(zp_ptr, x_min + 128.0 * scale)
 @triton.jit
 def dequantize_8bit_kernel(
+    q_ptr, scale_ptr, zp_ptr, out_ptr,
+    N, BLOCK: tl.constexpr,
 ):
     pid  = tl.program_id(0)
     offs = pid * BLOCK + tl.arange(0, BLOCK)
     mask = offs < N
+    q     = tl.load(q_ptr    + offs, mask=mask, other=0).to(tl.float32)
     scale = tl.load(scale_ptr).to(tl.float32)
     zp    = tl.load(zp_ptr).to(tl.float32)
 @triton.jit
 def quantize_4bit_kernel(
     x_ptr, q_ptr, scale_ptr, zp_ptr,
     N, BLOCK: tl.constexpr,
 ):
     """Asymmetric 4-bit quantization, two codes packed per output byte.

     Arguments:
         x_ptr:     input values, N contiguous elements (cast to float32 here).
         q_ptr:     output bytes; byte i holds element 2*i in the low nibble
                    and element 2*i+1 in the high nibble.
         scale_ptr: single-element output, the quantization step.
         zp_ptr:    single-element output, the minimum input value.
         N:         number of input elements (a pair is only written when
                    both of its elements are in range).
         BLOCK:     output bytes produced by each program (compile-time
                    constant), i.e. 2*BLOCK input elements.

     Fixes relative to the previous version:
       * min/max were reduced over each program's own slice while only
         program 0 published scale/zp, so other slices were encoded with
         parameters that were never stored.  Each program now reduces over
         the whole tensor.
       * masked-out lanes were loaded as 0.0 and took part in the min/max.
       * nibbles are combined in int32 and narrowed once, instead of
         shifting a value of up to 15 left by 4 inside a signed int8.
     """
     pid = tl.program_id(0)
     lane = tl.arange(0, BLOCK)

     # Pass 1: global min/max.  Every program walks the full tensor; this is
     # redundant work but keeps the launch signature unchanged.
     lo = tl.zeros([BLOCK], dtype=tl.float32) + float("inf")
     hi = tl.zeros([BLOCK], dtype=tl.float32) - float("inf")
     for start in range(0, N, BLOCK):
         idx = start + lane
         live = idx < N
         chunk = tl.load(x_ptr + idx, mask=live, other=0.0).to(tl.float32)
         # Neutral elements keep out-of-range lanes from affecting the result.
         lo = tl.minimum(lo, tl.where(live, chunk, float("inf")))
         hi = tl.maximum(hi, tl.where(live, chunk, -float("inf")))
     x_min = tl.min(lo, axis=0)
     x_max = tl.max(hi, axis=0)

     scale = (x_max - x_min) / 15.0
     # Guard against a constant input (zero range).
     scale = tl.where(scale < 1e-8, 1e-8, scale)
     zp = x_min

     # Pass 2: quantize and pack this program's pairs.
     offs_out = pid * BLOCK + lane
     offs_in = offs_out * 2
     mask = offs_in + 1 < N
     x0 = tl.load(x_ptr + offs_in, mask=mask, other=0.0).to(tl.float32)
     x1 = tl.load(x_ptr + offs_in + 1, mask=mask, other=0.0).to(tl.float32)

     # Values are non-negative here, so +0.5 followed by truncation rounds
     # to nearest.
     q0 = ((x0 - zp) / scale + 0.5).to(tl.int32)
     q1 = ((x1 - zp) / scale + 0.5).to(tl.int32)
     q0 = tl.where(q0 < 0, 0, tl.where(q0 > 15, 15, q0))
     q1 = tl.where(q1 < 0, 0, tl.where(q1 > 15, 15, q1))

     # Low nibble = even element, high nibble = odd element.
     packed = (q0 | (q1 << 4)).to(tl.int8)
     tl.store(q_ptr + offs_out, packed, mask=mask)

     # All programs computed identical parameters; one writer is enough.
     if pid == 0:
         tl.store(scale_ptr, scale)
         tl.store(zp_ptr, zp)
 @triton.jit
 def dequantize_4bit_kernel(
+    q_ptr, scale_ptr, zp_ptr, out_ptr,
+    N, BLOCK: tl.constexpr,
 ):
     pid      = tl.program_id(0)
     offs_out = pid * BLOCK + tl.arange(0, BLOCK)
     scale  = tl.load(scale_ptr).to(tl.float32)
     zp     = tl.load(zp_ptr).to(tl.float32)
     q0 = (packed & 0x0F).to(tl.float32)
     q1 = ((packed >> 4) & 0x0F).to(tl.float32)
 # Work items per Triton program: passed as BLOCK to every kernel launch and
 # used with triton.cdiv to size the launch grids.  For the 8-bit kernels one
 # item is one element; for the 4-bit kernels one item is one packed byte
 # (two elements).
 BLOCK_SIZE = 1024
 def quantize_head(x: torch.Tensor, bits: int):
+    x = x.contiguous().to(torch.float16)
     N = x.numel()
     scale = torch.zeros(1, dtype=torch.float32, device=x.device)
     zp    = torch.zeros(1, dtype=torch.float32, device=x.device)
     if bits == 8:
+        q    = torch.empty(N, dtype=torch.int8, device=x.device)
         grid = (triton.cdiv(N, BLOCK_SIZE),)
         quantize_8bit_kernel[grid](
             x.view(-1), q, scale, zp, N, BLOCK=BLOCK_SIZE
         )
     elif bits == 4:
+        assert N % 2 == 0
+        q    = torch.empty(N // 2, dtype=torch.int8, device=x.device)
         grid = (triton.cdiv(N // 2, BLOCK_SIZE),)
         quantize_4bit_kernel[grid](
             x.view(-1), q, scale, zp, N, BLOCK=BLOCK_SIZE
     return q, scale, zp
+def dequantize_head(q, scale, zp, bits, original_shape):
     if bits == 8:
+        N   = q.numel()
         out = torch.empty(N, dtype=torch.float16, device=q.device)
         grid = (triton.cdiv(N, BLOCK_SIZE),)
         dequantize_8bit_kernel[grid](q, scale, zp, out, N, BLOCK=BLOCK_SIZE)
     elif bits == 4:
+        N   = q.numel() * 2
         out = torch.empty(N, dtype=torch.float16, device=q.device)
         grid = (triton.cdiv(q.numel(), BLOCK_SIZE),)
         dequantize_4bit_kernel[grid](q, scale, zp, out, N, BLOCK=BLOCK_SIZE)
     return out.view(original_shape)
+# ─── Cache Manager ────────────────────────────────────────────────
 class MixedPrecisionKVCache:
     """Holds the quantized K and V tensors of every head.

     Each head is quantized on its own with the bit width found at the same
     index of ``bit_alloc`` and is kept as a
     ``(q, scale, zp, shape, bits)`` tuple in ``k_cache`` / ``v_cache``.
     """

     def __init__(self, bit_alloc: list):
         """Remember the per-head bit widths and start with empty caches."""
         self.bit_alloc = bit_alloc
         self.k_cache = []
         self.v_cache = []

     def store(self, k: torch.Tensor, v: torch.Tensor):
         """Quantize every head of ``k`` and ``v`` and replace the cache.

         The head count is taken from ``k.shape[1]``.  Only index 0 of the
         leading dimension is read (``k[0, h]`` / ``v[0, h]``); any further
         entries along that dimension are not stored.

         Raises:
             IndexError: if ``bit_alloc`` has fewer entries than there are
                 heads.  The check runs before anything is quantized, so a
                 failed call leaves the previous cache contents untouched
                 (earlier the cache was wiped first and then left
                 half-filled when the lookup failed mid-loop).
         """
         num_heads = k.shape[1]
         if num_heads > len(self.bit_alloc):
             raise IndexError(
                 f"bit_alloc has {len(self.bit_alloc)} entries "
                 f"but the input has {num_heads} heads"
             )

         # Build into locals and publish at the end so the cache is never
         # observable in a partially written state.
         new_k, new_v = [], []
         for h in range(num_heads):
             bits = self.bit_alloc[h]
             k_head = k[0, h]
             v_head = v[0, h]
             kq, ks, kz = quantize_head(k_head, bits)
             vq, vs, vz = quantize_head(v_head, bits)
             new_k.append((kq, ks, kz, k_head.shape, bits))
             new_v.append((vq, vs, vz, v_head.shape, bits))
         self.k_cache = new_k
         self.v_cache = new_v

     def retrieve(self):
         """Dequantize every cached head and return ``(k, v)``.

         Heads are stacked along a new leading dimension and a further
         singleton dimension is added in front, so each result has the
         per-head shape prefixed by ``[1, num_heads]``.
         """
         k_heads = [
             dequantize_head(q, scale, zp, bits, shape)
             for q, scale, zp, shape, bits in self.k_cache
         ]
         v_heads = [
             dequantize_head(q, scale, zp, bits, shape)
             for q, scale, zp, shape, bits in self.v_cache
         ]
         k = torch.stack(k_heads, dim=0).unsqueeze(0)
         v = torch.stack(v_heads, dim=0).unsqueeze(0)
         return k, v

     def memory_bytes(self):
         """Return the bytes held by the cached codes, scales and offsets.

         Sizes come from the stored tensors themselves rather than from
         hard-coded constants, so the figure stays right if their dtypes
         change.
         """
         total = 0
         for q, scale, zp, _shape, _bits in self.k_cache + self.v_cache:
             for t in (q, scale, zp):
                 total += t.numel() * t.element_size()
         return total
+# ─── Test ─────────────────────────────────────────────────────────
 if __name__ == "__main__":
     # Round-trip smoke test: quantize random K/V with alternating 4/8-bit
     # heads, dequantize, then report mean absolute error and compression.
     print("Testing MixedPrecisionKVCache...")

     torch.manual_seed(42)
     dims = (1, 8, 512, 128)
     k = torch.randn(*dims, dtype=torch.float16, device="cuda")
     v = torch.randn(*dims, dtype=torch.float16, device="cuda")

     bit_alloc = [4, 8] * 4
     cache = MixedPrecisionKVCache(bit_alloc)
     cache.store(k, v)
     k_out, v_out = cache.retrieve()

     k_err = torch.mean(torch.abs(k - k_out)).item()
     v_err = torch.mean(torch.abs(v - v_out)).item()
     print(f"K reconstruction error: {k_err:.6f}")
     print(f"V reconstruction error: {v_err:.6f}")

     # K and V together, two bytes per fp16 element.
     fp16_bytes = 2 * (k.numel() * 2)
     quant_bytes = cache.memory_bytes()
     print(f"\nFP16 memory:  {fp16_bytes/1024:.1f} KB")
     print(f"Quant memory: {quant_bytes/1024:.1f} KB")
     print(f"Compression:  {fp16_bytes/quant_bytes:.2f}x")

     assert k_err < 0.1, f"K error too high: {k_err}"
     assert v_err < 0.1, f"V error too high: {v_err}"
     print("\n✅ All tests passed!")