File size: 4,003 Bytes

27871e7

"""
Explicit KV Cache management for efficient inference.
This is critical for Qualcomm deployment and agent control loops.
"""

import torch
from typing import Optional, Tuple
from dataclasses import dataclass


@dataclass
class KVCache:
    """Key-Value cache for transformer inference.

    Layout: [num_layers, batch_size, num_heads, max_seq_len, head_dim]

    This explicit cache enables:
    - Efficient autoregressive decoding
    - Cache offloading for memory management
    - Sliding window attention (future)
    - Agent control loops with cache manipulation
    """

    key_cache: torch.Tensor  # [num_layers, batch, heads, max_len, head_dim]
    value_cache: torch.Tensor  # [num_layers, batch, heads, max_len, head_dim]
    seq_len: int  # Current sequence length in cache

    @classmethod
    def create(
        cls,
        num_layers: int,
        batch_size: int,
        num_heads: int,
        max_seq_len: int,
        head_dim: int,
        dtype: torch.dtype = torch.float16,
        device: torch.device = None,
    ) -> "KVCache":
        """Create an empty KV cache.

        Args:
            num_layers: Number of transformer layers
            batch_size: Batch size
            num_heads: Number of attention heads
            max_seq_len: Maximum sequence length
            head_dim: Dimension per attention head
            dtype: Data type for cache tensors
            device: Device to create cache on

        Returns:
            Initialized KVCache with zero tensors
        """
        shape = (num_layers, batch_size, num_heads, max_seq_len, head_dim)

        key_cache = torch.zeros(shape, dtype=dtype, device=device)
        value_cache = torch.zeros(shape, dtype=dtype, device=device)

        return cls(key_cache=key_cache, value_cache=value_cache, seq_len=0)

    def update(
        self,
        layer_idx: int,
        key: torch.Tensor,
        value: torch.Tensor,
        position: int,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Update cache for a specific layer and return full K, V.

        Args:
            layer_idx: Index of the transformer layer
            key: New key tensor [batch, heads, seq_len, head_dim]
            value: New value tensor [batch, heads, seq_len, head_dim]
            position: Starting position for the new tokens

        Returns:
            Tuple of (full_key, full_value) including cached values
        """
        seq_len = key.shape[2]
        end_pos = position + seq_len

        # Store new keys and values
        self.key_cache[layer_idx, :, :, position:end_pos, :] = key
        self.value_cache[layer_idx, :, :, position:end_pos, :] = value

        # Update sequence length
        self.seq_len = max(self.seq_len, end_pos)

        # Return full K, V up to current position
        return (
            self.key_cache[layer_idx, :, :, :end_pos, :],
            self.value_cache[layer_idx, :, :, :end_pos, :],
        )

    def get(
        self,
        layer_idx: int,
        end_pos: Optional[int] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Get cached K, V for a specific layer.

        Args:
            layer_idx: Index of the transformer layer
            end_pos: End position (defaults to current seq_len)

        Returns:
            Tuple of (key, value) tensors
        """
        if end_pos is None:
            end_pos = self.seq_len

        return (
            self.key_cache[layer_idx, :, :, :end_pos, :],
            self.value_cache[layer_idx, :, :, :end_pos, :],
        )

    def reset(self):
        """Reset the cache to empty state."""
        self.key_cache.zero_()
        self.value_cache.zero_()
        self.seq_len = 0

    @property
    def memory_usage_mb(self) -> float:
        """Calculate memory usage in megabytes."""
        total_bytes = self.key_cache.numel() * self.key_cache.element_size()
        total_bytes += self.value_cache.numel() * self.value_cache.element_size()
        return total_bytes / (1024 * 1024)