File size: 9,628 Bytes
f62ec09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# Copyright 2025 NVIDIA CORPORATION & AFFILIATES
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from typing import Any, Optional

import torch
import transformers

# Export the class actually defined in this module; the previous value
# ("JetCache") named a non-existent attribute and broke `import *`.
__all__ = ["JetNemotronCache"]


class JetNemotronCache(transformers.cache_utils.Cache):
    """Per-layer cache for Jet-Nemotron hybrid models.

    Each layer owns a dict with up to four entries:

    - ``recurrent_state``: linear-attention recurrent state tensor.
    - ``attn_state``: ``(key, value)`` tuple for (sliding-window) attention.
    - ``conv_state``: tuple of convolution state tensors.
    - ``ffn_state``: FFN state tensor.

    Token counts are tracked *per layer* in ``_seen_tokens`` because layers
    may be updated independently during chunked prefill / decoding.
    """

    def __init__(
        self,
        seen_tokens: int = 0
    ) -> None:
        # One state dict per layer, appended lazily during prefill.
        self.states: list[dict[str, Any]] = []
        # Extra states shared across layers (populated by callers as needed).
        self.layer_wise_states: dict[str, Any] = {}

        # Starting token count assigned to each newly-seen layer.
        self._base_seen_tokens = seen_tokens
        # Per-layer tally of how many tokens the cache has seen.
        # Used in `generate` to keep track of decoding progress.
        self._seen_tokens: list[int] = []

    def __getitem__(self, layer_idx: int) -> dict[str, Any]:
        """Return the state dict for `layer_idx`, raising KeyError if absent."""
        if layer_idx < len(self):
            return self.states[layer_idx]
        else:
            raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")

    def __iter__(self):
        """Iterate over the per-layer state dicts in layer order."""
        for state in self.states:
            yield state

    def __len__(self):
        """Number of layers currently cached."""
        return len(self.states)

    def update(
        self,
        recurrent_state: Optional[torch.Tensor] = None,
        attn_state: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        conv_state: Optional[tuple[torch.Tensor]] = None,
        ffn_state: Optional[torch.Tensor] = None,
        layer_idx: int = 0,
        offset: Optional[int] = 1,
        increase_seen_tokens: bool = True,
        cache_kwargs: Optional[dict[str, Any]] = None,
    ) -> dict[str, Any]:
        """
        Updates the cache with the new `recurrent_state`/`attn_state`/`conv_state` for the layer `layer_idx`.

        Args:
            recurrent_state (`torch.Tensor`, `optional`):
                The new recurrent state to cache.
            attn_state (`Tuple[torch.Tensor, torch.Tensor]`, `optional`):
                The new attention key/value states to cache.
            conv_state (`Tuple[torch.Tensor]`, `optional`):
                The new convolution state to cache.
            ffn_state (`torch.Tensor`, `optional`):
                The new FFN state to cache.
            layer_idx (`int`, defaults to 0):
                The index of the layer to cache the states for.
            offset (`int`, `optional`, defaults to 1):
                The number of new tokens being processed.
            increase_seen_tokens (`bool`, defaults to True):
                Whether to bump this layer's seen-token counter by `offset`.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass (e.g. `window_size`
                for sliding-window attention).

        Return:
            Dictionary of the updated state.
        """
        # NOTE: `cache_kwargs` previously used a mutable default `{}`;
        # a None sentinel avoids the shared-mutable-default pitfall.
        if cache_kwargs is None:
            cache_kwargs = {}

        if len(self._seen_tokens) <= layer_idx:
            self._seen_tokens.append(self._base_seen_tokens)

        # Update the number of seen tokens
        if increase_seen_tokens:
            self.increase_seen_tokens(layer_idx, offset)

        window_size = None
        if attn_state is not None:
            # Validate the shape of `attn_state` BEFORE indexing into it, so
            # malformed input raises the intended ValueError rather than an
            # opaque TypeError/IndexError.
            if not isinstance(attn_state, tuple) or len(attn_state) != 2:
                raise ValueError("`attn_state` must be a tuple of two tensors for key/value states")
            input_size = attn_state[0].shape[-2]
            window_size = cache_kwargs.get('window_size', None)
        if len(self.states) <= layer_idx:
            # in prefilling stage
            state = dict(
                recurrent_state=recurrent_state,
                attn_state=attn_state,
                conv_state=conv_state,
                ffn_state=ffn_state
            )
            if attn_state is not None and window_size is not None:
                # in prefilling stage, the cached and returned key/value states are different
                # original key/value states are returned, but the cached states are the last `window_size` tokens
                _key_state = attn_state[0][..., -window_size:, :]
                _value_state = attn_state[1][..., -window_size:, :]

                _attn_state = (_key_state, _value_state)
                _state = dict(
                    recurrent_state=recurrent_state,
                    attn_state=_attn_state,
                    conv_state=conv_state,
                    ffn_state=ffn_state
                )
                self.states.append(_state)
            else:
                self.states.append(state)
        else:
            state = self.states[layer_idx]
            if recurrent_state is not None:
                state['recurrent_state'] = recurrent_state
            if attn_state is not None:
                key_state, value_state = state['attn_state']
                assert window_size is None or key_state.shape[-2] <= window_size
                if window_size is not None and key_state.shape[-2] == window_size and input_size == 1:
                    # DO NOT allocate new memory if the cache is full
                    # only works in decoding stage
                    # roll the key/value states to the left by `input_size`

                    key_state = key_state.roll(-input_size, -2)
                    value_state = value_state.roll(-input_size, -2)

                    # replace the last `input_size` tokens with the new key/value states
                    key_state[..., -input_size:, :] = attn_state[0]
                    value_state[..., -input_size:, :] = attn_state[1]

                    attn_state = (key_state, value_state)
                else:
                    # <= window_size or not sliding window or chunk-prefilling (input_size > 1)
                    attn_state = (torch.cat([key_state, attn_state[0]], -2),
                                  torch.cat([value_state, attn_state[1]], -2),)
                state['attn_state'] = attn_state
            if conv_state is not None:
                state['conv_state'] = conv_state
            if ffn_state is not None:
                state['ffn_state'] = ffn_state

        assert len(self.states) == len(self._seen_tokens)

        return state

    def trim_attn_state(self, layer_idx: int, window_size: int) -> None:
        """Trim a layer's cached key/value states to the last `window_size` tokens.

        Handles the case when the input length of SWA > 1 and a cache already
        exists — especially the chunk-prefilling case. This function is called
        after attention is done.
        """
        assert layer_idx < len(self.states), f"Layer index {layer_idx} out of range for states with length {len(self.states)}"
        state = self.states[layer_idx]
        assert state["attn_state"] is not None, f"Layer {layer_idx} does not have an attention state"
        key_state, value_state = state["attn_state"]
        if key_state.shape[-2] > window_size:
            state["attn_state"] = (
                key_state[..., -window_size:, :],
                value_state[..., -window_size:, :],
            )

    def increase_seen_tokens(self, layer_idx: int, offset: int = 1) -> None:
        """Increases the number of seen tokens for the layer `layer_idx` by `offset`."""
        self._seen_tokens[layer_idx] += offset

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        # Honor the Optional annotation: treat None as layer 0 instead of
        # raising TypeError on the `<=` comparison below.
        if layer_idx is None:
            layer_idx = 0
        if len(self._seen_tokens) <= layer_idx:
            return self._base_seen_tokens
        return self._seen_tokens[layer_idx]

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states. Cache does not have a maximum length."""
        return None

    def to_legacy_cache(self) -> tuple:
        """Return the per-layer states as a tuple (legacy HF cache format)."""
        return tuple(self.states)

    def print_kv_sizes(self) -> None:
        """Prints the size (in MB) of each layer's cached states."""
        for layer_idx, state in enumerate(self.states):
            if state.get("attn_state", None) is not None:
                key_state, value_state = state["attn_state"]
                # compute state size in MB
                key_size = key_state.element_size() * key_state.nelement() / (1024**2)
                value_size = value_state.element_size() * value_state.nelement() / (1024**2)
                print(key_state.shape, value_state.shape)
                print(f"Layer {layer_idx}: Attention. cache size: {key_size + value_size:.2f} MB")
            if state.get("conv_state", None) is not None:
                conv_state = state["conv_state"]
                # compute state size in MB
                conv_sizes = []
                for conv in conv_state:
                    conv_size = conv.element_size() * conv.nelement() / (1024**2)
                    conv_sizes.append(conv_size)
                conv_size = sum(conv_sizes)
                print(f"Layer {layer_idx}: Convolution. cache size: {conv_size:.2f} MB")
            if state.get("ffn_state", None) is not None:
                ffn_state = state["ffn_state"]
                # compute state size in MB
                ffn_size = ffn_state.element_size() * ffn_state.nelement() / (1024**2)
                print(f"Layer {layer_idx}: FFN. cache size: {ffn_size:.2f} MB")
            if state.get("recurrent_state", None) is not None:
                recurrent_state = state["recurrent_state"]
                # compute state size in MB
                recurrent_size = recurrent_state.element_size() * recurrent_state.nelement() / (1024**2)
                print(f"Layer {layer_idx}: Recurrent. cache size: {recurrent_size:.2f} MB")