harshithsaiv
/

kv-cache-compression

+"""
+Per-Head Mixed-Precision KV Cache
+----------------------------------
+Quantizes each attention head's K and V tensors
+to either 4-bit or 8-bit based on calibrated sensitivity.
+Layout per head:
+  - quantized data  (int8 tensor, packed for 4-bit)
+  - scale           (float32 scalar)
+  - zero_point      (float32 scalar)
+"""
+import torch
+import triton
+import triton.language as tl
+import json
+import os
+# ─── Triton Kernels ───────────────────────────────────────────────
@triton.jit
def quantize_8bit_kernel(
    x_ptr,       # input, addressed as N flat elements
    q_ptr,       # output N int8 codes; stored code = level - 128
    scale_ptr,   # output scalar: quantization step
    zp_ptr,      # output scalar: minimum of the whole tensor
    N,           # total number of elements
    BLOCK: tl.constexpr,
):
    """Asymmetric 8-bit quantization with one scale/zero-point per tensor.

    Program ``pid`` quantizes elements ``[pid * BLOCK, (pid + 1) * BLOCK)``.
    Every program first scans all N elements to obtain the tensor-wide
    min/max, so all blocks quantize with the same parameters that program 0
    writes to ``scale_ptr`` / ``zp_ptr``.

    Levels lie in [0, 255]; they are stored shifted by -128 so that they fit
    the signed int8 output. ``dequantize_8bit_kernel`` undoes the shift.
    """
    pid = tl.program_id(0)
    lane = tl.arange(0, BLOCK)

    # Pass 1: tensor-wide min/max. Masked lanes are replaced by +/-inf so the
    # padding of the last chunk cannot influence the range.
    # NOTE: every program re-reads the full tensor. This keeps the kernel
    # signature and launch grid unchanged at the cost of redundant loads.
    run_min = tl.full([BLOCK], float("inf"), tl.float32)
    run_max = tl.full([BLOCK], -float("inf"), tl.float32)
    for start in range(0, N, BLOCK):
        idx = start + lane
        valid = idx < N
        vals = tl.load(x_ptr + idx, mask=valid, other=0.0).to(tl.float32)
        run_min = tl.minimum(run_min, tl.where(valid, vals, float("inf")))
        run_max = tl.maximum(run_max, tl.where(valid, vals, -float("inf")))
    x_min = tl.min(run_min, axis=0)
    x_max = tl.max(run_max, axis=0)

    # Floor the step so a constant tensor does not divide by zero.
    scale = tl.maximum((x_max - x_min) / 255.0, 1e-8)
    zp = x_min

    # Pass 2: quantize this program's own block.
    offs = pid * BLOCK + lane
    mask = offs < N
    x = tl.load(x_ptr + offs, mask=mask, other=0.0).to(tl.float32)
    level = tl.extra.libdevice.round((x - zp) / scale)
    level = tl.minimum(tl.maximum(level, 0.0), 255.0)
    # Shift [0, 255] -> [-128, 127] before the cast to int8.
    tl.store(q_ptr + offs, (level - 128.0).to(tl.int8), mask=mask)

    # All programs computed identical parameters; one write is enough.
    if pid == 0:
        tl.store(scale_ptr, scale)
        tl.store(zp_ptr, zp)


@triton.jit
def dequantize_8bit_kernel(
    q_ptr,       # input N int8 codes (level - 128)
    scale_ptr,   # input scalar
    zp_ptr,      # input scalar
    out_ptr,     # output N float16 values
    N,
    BLOCK: tl.constexpr,
):
    """Invert ``quantize_8bit_kernel``: ``x = (code + 128) * scale + zp``."""
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < N

    code = tl.load(q_ptr + offs, mask=mask, other=0).to(tl.float32)
    scale = tl.load(scale_ptr).to(tl.float32)
    zp = tl.load(zp_ptr).to(tl.float32)

    # Undo the -128 storage shift to recover the level in [0, 255].
    x = (code + 128.0) * scale + zp
    tl.store(out_ptr + offs, x.to(tl.float16), mask=mask)
@triton.jit
def quantize_4bit_kernel(
    x_ptr,       # input, addressed as N flat elements
    q_ptr,       # output N // 2 int8 bytes (2 codes packed per byte)
    scale_ptr,   # output scalar: quantization step
    zp_ptr,      # output scalar: minimum of the whole tensor
    N,           # total elements (must be even)
    BLOCK: tl.constexpr,
):
    """Asymmetric 4-bit quantization with one scale/zero-point per tensor.

    Program ``pid`` produces output bytes ``[pid * BLOCK, (pid + 1) * BLOCK)``,
    i.e. it consumes ``2 * BLOCK`` input elements. Element ``2 * i`` goes to
    the low nibble of byte ``i`` and element ``2 * i + 1`` to the high nibble.

    Every program first scans all N elements for the tensor-wide min/max, so
    all blocks quantize with the same parameters that program 0 writes to
    ``scale_ptr`` / ``zp_ptr``.
    """
    pid = tl.program_id(0)
    lane = tl.arange(0, BLOCK)

    # Pass 1: tensor-wide min/max. Masked lanes are replaced by +/-inf so the
    # padding of the last chunk cannot influence the range.
    # NOTE: every program re-reads the full tensor. This keeps the kernel
    # signature and launch grid unchanged at the cost of redundant loads.
    run_min = tl.full([BLOCK], float("inf"), tl.float32)
    run_max = tl.full([BLOCK], -float("inf"), tl.float32)
    for start in range(0, N, BLOCK):
        idx = start + lane
        valid = idx < N
        vals = tl.load(x_ptr + idx, mask=valid, other=0.0).to(tl.float32)
        run_min = tl.minimum(run_min, tl.where(valid, vals, float("inf")))
        run_max = tl.maximum(run_max, tl.where(valid, vals, -float("inf")))
    x_min = tl.min(run_min, axis=0)
    x_max = tl.max(run_max, axis=0)

    # Floor the step so a constant tensor does not divide by zero.
    scale = tl.maximum((x_max - x_min) / 15.0, 1e-8)
    zp = x_min

    # Pass 2: quantize and pack this program's own pairs.
    offs_out = pid * BLOCK + lane
    offs_in = offs_out * 2
    # A byte is written only when both elements of its pair are in range.
    mask = offs_in + 1 < N
    x0 = tl.load(x_ptr + offs_in, mask=mask, other=0.0).to(tl.float32)
    x1 = tl.load(x_ptr + offs_in + 1, mask=mask, other=0.0).to(tl.float32)

    q0 = tl.extra.libdevice.round((x0 - zp) / scale)
    q1 = tl.extra.libdevice.round((x1 - zp) / scale)
    q0 = tl.minimum(tl.maximum(q0, 0.0), 15.0).to(tl.int8)
    q1 = tl.minimum(tl.maximum(q1, 0.0), 15.0).to(tl.int8)

    # Pack two 4-bit codes into one byte: low nibble = q0, high nibble = q1.
    packed = q0 | (q1 << 4)
    tl.store(q_ptr + offs_out, packed, mask=mask)

    # All programs computed identical parameters; one write is enough.
    if pid == 0:
        tl.store(scale_ptr, scale)
        tl.store(zp_ptr, zp)
@triton.jit
def dequantize_4bit_kernel(
    q_ptr,       # input packed bytes, two 4-bit codes each
    scale_ptr,   # input scalar
    zp_ptr,      # input scalar
    out_ptr,     # output N float16 values
    N,           # number of output elements (2 per packed byte)
    BLOCK: tl.constexpr,
):
    """Unpack 4-bit codes and map them back to float16.

    Byte ``i`` yields output element ``2 * i`` from its low nibble and
    element ``2 * i + 1`` from its high nibble, each computed as
    ``code * scale + zp``.
    """
    byte_idx = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    even_idx = byte_idx * 2
    odd_idx = even_idx + 1
    # Same predicate for the load and both stores: the whole pair is in range.
    in_range = odd_idx < N

    step = tl.load(scale_ptr).to(tl.float32)
    offset = tl.load(zp_ptr).to(tl.float32)
    byte = tl.load(q_ptr + byte_idx, mask=in_range, other=0).to(tl.int8)

    # Masking with 0x0F after the shift discards any sign-extension bits.
    low_code = (byte & 0x0F).to(tl.float32)
    high_code = ((byte >> 4) & 0x0F).to(tl.float32)

    even_val = low_code * step + offset
    odd_val = high_code * step + offset
    tl.store(out_ptr + even_idx, even_val.to(tl.float16), mask=in_range)
    tl.store(out_ptr + odd_idx, odd_val.to(tl.float16), mask=in_range)
+# ─── Python Wrappers ──────────────────────────────────────────────
# Number of output items (int8 codes or packed bytes) handled per program.
BLOCK_SIZE = 1024


def quantize_head(x: torch.Tensor, bits: int):
    """Quantize one head tensor with the matching Triton kernel.

    Args:
        x: Floating-point tensor (typically ``[seq_len, head_dim]`` float16).
            It is made contiguous and processed as a flat array.
        bits: Target width, 4 or 8.

    Returns:
        ``(q, scale, zp)`` where ``q`` is a flat int8 tensor holding
        ``x.numel()`` codes for 8-bit or ``x.numel() // 2`` packed bytes for
        4-bit, and ``scale`` / ``zp`` are 1-element float32 tensors on
        ``x``'s device.

    Raises:
        ValueError: If ``bits`` is not 4 or 8, or if 4-bit packing is
            requested for an odd number of elements.
    """
    # Validate with real exceptions: `assert` disappears under `python -O`.
    if bits not in (4, 8):
        raise ValueError(f"Unsupported bits: {bits}")

    flat = x.contiguous().view(-1)
    n_elems = flat.numel()
    if bits == 4 and n_elems % 2 != 0:
        raise ValueError(
            f"4-bit packing needs an even number of elements, got {n_elems}"
        )

    scale = torch.zeros(1, dtype=torch.float32, device=flat.device)
    zp = torch.zeros(1, dtype=torch.float32, device=flat.device)

    if bits == 8:
        q = torch.empty(n_elems, dtype=torch.int8, device=flat.device)
        grid = (triton.cdiv(n_elems, BLOCK_SIZE),)
        quantize_8bit_kernel[grid](
            flat, q, scale, zp, n_elems, BLOCK=BLOCK_SIZE
        )
    else:
        # Two codes per byte, so the grid is sized by output bytes.
        n_bytes = n_elems // 2
        q = torch.empty(n_bytes, dtype=torch.int8, device=flat.device)
        grid = (triton.cdiv(n_bytes, BLOCK_SIZE),)
        quantize_4bit_kernel[grid](
            flat, q, scale, zp, n_elems, BLOCK=BLOCK_SIZE
        )
    return q, scale, zp
def dequantize_head(q: torch.Tensor, scale: torch.Tensor,
                    zp: torch.Tensor, bits: int,
                    original_shape: tuple) -> torch.Tensor:
    """Dequantize one head back to float16.

    Args:
        q: int8 tensor of codes (8-bit) or packed bytes (4-bit).
        scale: 1-element tensor with the quantization step.
        zp: 1-element tensor with the zero point.
        bits: Width used when quantizing, 4 or 8.
        original_shape: Shape to give the result; it must describe
            ``q.numel()`` elements for 8-bit or ``2 * q.numel()`` for 4-bit.

    Returns:
        A float16 tensor of shape ``original_shape`` on ``q``'s device.

    Raises:
        ValueError: If ``bits`` is not 4 or 8.
    """
    if bits not in (4, 8):
        raise ValueError(f"Unsupported bits: {bits}")

    # The kernels index `q` with flat offsets, so a strided view (e.g. a slice
    # of a larger buffer) has to be materialised contiguously first.
    codes = q.contiguous().view(-1)
    n_codes = codes.numel()

    if bits == 8:
        n_elems = n_codes
        kernel = dequantize_8bit_kernel
    else:
        n_elems = n_codes * 2   # each packed byte expands to two values
        kernel = dequantize_4bit_kernel

    out = torch.empty(n_elems, dtype=torch.float16, device=codes.device)
    # One program per BLOCK_SIZE input codes/bytes in both modes.
    grid = (triton.cdiv(n_codes, BLOCK_SIZE),)
    kernel[grid](codes, scale, zp, out, n_elems, BLOCK=BLOCK_SIZE)
    return out.view(original_shape)
+# ─── Per-Layer Cache Manager ──────────────────────────────────────
class MixedPrecisionKVCache:
    """Quantized K and V storage for all heads of one layer.

    ``bit_alloc`` holds one bit width (4 or 8) per head. ``store`` quantizes
    every head with its own scale/zero-point and ``retrieve`` rebuilds
    float16 tensors from the stored codes.
    """

    def __init__(self, bit_alloc: list):
        self.bit_alloc = bit_alloc   # [num_heads], each entry 4 or 8
        # One (q, scale, zp, shape, bits) tuple per head.
        self.k_cache = []
        self.v_cache = []

    def store(self, k: torch.Tensor, v: torch.Tensor):
        """Quantize and store K and V, replacing any previous content.

        Args:
            k, v: ``[batch, num_heads, seq, head_dim]`` tensors. Each head is
                quantized independently across the whole batch, so no batch
                element is dropped.

        Raises:
            ValueError: If the inputs are not 4-D, disagree on batch/head
                counts, or have more heads than ``bit_alloc`` has entries.
        """
        if k.dim() != 4 or v.dim() != 4:
            raise ValueError(
                "k and v must be [batch, num_heads, seq, head_dim]; "
                f"got {tuple(k.shape)} and {tuple(v.shape)}"
            )
        if k.shape[:2] != v.shape[:2]:
            raise ValueError(
                "k and v disagree on batch/num_heads: "
                f"{tuple(k.shape[:2])} vs {tuple(v.shape[:2])}"
            )
        num_heads = k.shape[1]
        if num_heads > len(self.bit_alloc):
            raise ValueError(
                f"bit_alloc has {len(self.bit_alloc)} entries "
                f"but the input has {num_heads} heads"
            )

        # Drop the old cache before building the new one so both are never
        # held in memory at the same time.
        self.k_cache = []
        self.v_cache = []
        for h in range(num_heads):
            bits = self.bit_alloc[h]
            k_head = k[:, h]   # [batch, seq, head_dim]
            v_head = v[:, h]
            kq, ks, kz = quantize_head(k_head, bits)
            vq, vs, vz = quantize_head(v_head, bits)
            self.k_cache.append((kq, ks, kz, k_head.shape, bits))
            self.v_cache.append((vq, vs, vz, v_head.shape, bits))

    def retrieve(self) -> tuple:
        """Dequantize all heads and rebuild the full K and V tensors.

        Returns:
            ``(k, v)``, each float16 with the heads stacked on dim 1, i.e.
            ``[batch, num_heads, seq, head_dim]`` for 4-D inputs to ``store``.

        Raises:
            RuntimeError: If nothing has been stored yet.
        """
        if not self.k_cache or not self.v_cache:
            raise RuntimeError(
                "retrieve() called on an empty cache; call store() first"
            )
        ks = [
            dequantize_head(q, s, z, bits, shape)
            for (q, s, z, shape, bits) in self.k_cache
        ]
        vs = [
            dequantize_head(q, s, z, bits, shape)
            for (q, s, z, shape, bits) in self.v_cache
        ]
        # Each entry is [batch, seq, head_dim]; heads become dim 1.
        return torch.stack(ks, dim=1), torch.stack(vs, dim=1)

    def memory_bytes(self) -> int:
        """Return the bytes held by the stored codes, scales and zero points."""
        total = 0
        for entries in (self.k_cache, self.v_cache):
            for q, s, z, _shape, _bits in entries:
                # Use the real element sizes rather than assuming int8/float32.
                total += q.numel() * q.element_size()
                total += s.numel() * s.element_size()
                total += z.numel() * z.element_size()
        return total
+# ─── Quick Correctness Test ───────────────────────────────────────
if __name__ == "__main__":
    print("Testing MixedPrecisionKVCache...")

    # One simulated layer: batch=1, heads=8, seq=512, head_dim=128.
    torch.manual_seed(42)
    layer_shape = (1, 8, 512, 128)
    keys = torch.randn(*layer_shape, dtype=torch.float16, device="cuda")
    values = torch.randn(*layer_shape, dtype=torch.float16, device="cuda")

    # Heads alternate between 4-bit and 8-bit storage.
    cache = MixedPrecisionKVCache([4, 8] * 4)

    # Round trip through the quantized cache.
    cache.store(keys, values)
    keys_back, values_back = cache.retrieve()

    # Mean absolute reconstruction error over all heads.
    k_err = torch.mean(torch.abs(keys - keys_back)).item()
    v_err = torch.mean(torch.abs(values - values_back)).item()
    print(f"K reconstruction error: {k_err:.6f}")
    print(f"V reconstruction error: {v_err:.6f}")

    # Footprint comparison: K and V have the same size, hence the factor 2.
    fp16_bytes = 2 * keys.numel() * keys.element_size()
    quant_bytes = cache.memory_bytes()
    print(f"\nFP16 memory:  {fp16_bytes/1024:.1f} KB")
    print(f"Quant memory: {quant_bytes/1024:.1f} KB")
    print(f"Compression:  {fp16_bytes/quant_bytes:.2f}x")

    # Fail loudly when the reconstruction drifts too far from the input.
    assert k_err < 0.1, f"K error too high: {k_err}"
    assert v_err < 0.1, f"V error too high: {v_err}"
    print("\n✅ All tests passed!")
+EOF