Added CacheLayerMixin and DynamicLayer
Browse files- modeling_minicpm.py +141 -1
modeling_minicpm.py
CHANGED
|
@@ -16,6 +16,7 @@
|
|
| 16 |
import math
|
| 17 |
import re
|
| 18 |
import warnings
|
|
|
|
| 19 |
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 20 |
|
| 21 |
import torch
|
|
@@ -24,7 +25,7 @@ import torch.utils.checkpoint
|
|
| 24 |
from torch import nn
|
| 25 |
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
| 26 |
from transformers.activations import ACT2FN
|
| 27 |
-
from transformers.cache_utils import Cache, DynamicCache
|
| 28 |
from transformers.modeling_attn_mask_utils import (
|
| 29 |
AttentionMaskConverter,
|
| 30 |
_prepare_4d_attention_mask,
|
|
@@ -233,6 +234,145 @@ class CompressK(torch.nn.Module):
|
|
| 233 |
return compressed_k, cu_seqlens_compressed
|
| 234 |
|
| 235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
class InfLLMv2CacheLayer(DynamicLayer):
|
| 238 |
def __init__(self):
|
|
|
|
| 16 |
import math
|
| 17 |
import re
|
| 18 |
import warnings
|
| 19 |
+
from abc import ABC, abstractmethod
|
| 20 |
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 21 |
|
| 22 |
import torch
|
|
|
|
| 25 |
from torch import nn
|
| 26 |
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
| 27 |
from transformers.activations import ACT2FN
|
| 28 |
+
from transformers.cache_utils import Cache, DynamicCache
|
| 29 |
from transformers.modeling_attn_mask_utils import (
|
| 30 |
AttentionMaskConverter,
|
| 31 |
_prepare_4d_attention_mask,
|
|
|
|
| 234 |
return compressed_k, cu_seqlens_compressed
|
| 235 |
|
| 236 |
|
| 237 |
+
class CacheLayerMixin(ABC):
    """Abstract base for the key/value cache of a single attention layer.

    Subclasses decide how the key/value tensors are allocated and grown; this
    mixin only holds the tensors and supplies the device-movement, reset and
    beam-search helpers shared by every layer type.
    """

    # Set by subclasses that are safe to use under torch.compile.
    is_compileable = False

    def __init__(self):
        # Buffers are allocated lazily on the first `update` call.
        self.keys: torch.Tensor | None = None
        self.values: torch.Tensor | None = None
        self.is_initialized = False

    def __repr__(self):
        return f"{self.__class__.__name__}"

    @abstractmethod
    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None: ...

    @abstractmethod
    def update(
        self, key_states: torch.Tensor, value_states: torch.Tensor, *args, **kwargs
    ) -> tuple[torch.Tensor, torch.Tensor]: ...

    @abstractmethod
    def get_mask_sizes(self, query_length: int) -> tuple[int, int]: ...

    @abstractmethod
    def get_seq_length(self) -> int: ...

    @abstractmethod
    def get_max_cache_shape(self) -> int: ...

    def offload(self):
        """Offload this layer's data to CPU device."""
        if not self.is_initialized:
            return
        self.keys = self.keys.to("cpu", non_blocking=True)
        self.values = self.values.to("cpu", non_blocking=True)

    def prefetch(self):
        """In case of layer offloading, this allows to move the data back to the layer's device ahead of time."""
        if not self.is_initialized:
            return
        if self.keys.device == self.device:
            return
        self.keys = self.keys.to(self.device, non_blocking=True)
        self.values = self.values.to(self.device, non_blocking=True)

    def reset(self) -> None:
        """Resets the cache values while preserving the objects"""
        if self.is_initialized:
            self.keys.zero_()
            self.values.zero_()
        # Several layer types carry a `cumulative_length` counter as well.
        if hasattr(self, "cumulative_length"):
            # It is an int on dynamic layers and a tensor on static layers.
            if isinstance(self.cumulative_length, int):
                self.cumulative_length = 0
            else:
                self.cumulative_length.zero_()

    def reorder_cache(self, beam_idx: torch.LongTensor) -> None:
        """Reorders this layer's cache for beam search."""
        if self.get_seq_length() <= 0:
            return
        self.keys = self.keys.index_select(0, beam_idx.to(self.keys.device))
        self.values = self.values.index_select(0, beam_idx.to(self.values.device))
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
class DynamicLayer(CacheLayerMixin):
    """
    A cache layer that grows dynamically as more tokens are generated. This is the default for generative models.
    It stores the key and value states as tensors of shape `[batch_size, num_heads, seq_len, head_dim]`.
    """

    is_sliding = False

    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
        """Allocate empty buffers matching the dtype and device of the first states seen."""
        self.dtype = key_states.dtype
        self.device = key_states.device
        self.keys = torch.tensor([], dtype=self.dtype, device=self.device)
        self.values = torch.tensor([], dtype=self.dtype, device=self.device)
        self.is_initialized = True

    def update(
        self, key_states: torch.Tensor, value_states: torch.Tensor, *args, **kwargs
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Update the key and value caches in-place, and return the necessary keys and value states.

        Args:
            key_states (`torch.Tensor`): The new key states to cache.
            value_states (`torch.Tensor`): The new value states to cache.

        Returns:
            tuple[`torch.Tensor`, `torch.Tensor`]: The key and value states.
        """
        if not self.is_initialized:
            # First call: set up empty buffers on the right dtype/device.
            self.lazy_initialization(key_states, value_states)
        self.keys = torch.cat((self.keys, key_states), dim=-2)
        self.values = torch.cat((self.values, value_states), dim=-2)
        return self.keys, self.values

    def get_mask_sizes(self, query_length: int) -> tuple[int, int]:
        """Return the length and offset of the cache, used to generate the mask"""
        # Dynamic layers have no windowing, so the offset is always 0.
        return self.get_seq_length() + query_length, 0

    def get_seq_length(self) -> int:
        """Returns the sequence length of the cached states."""
        if self.is_initialized and self.keys.numel() != 0:
            return self.keys.shape[-2]
        return 0

    def get_max_cache_shape(self) -> int:
        """Returns the maximum sequence length of the cache object. DynamicLayer does not have a maximum length."""
        return -1

    def crop(self, max_length: int) -> None:
        """
        Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be negative
        to remove `max_length` tokens.
        """
        current_length = self.get_seq_length()
        if max_length < 0:
            # Negative values mean "drop that many tokens from the end".
            max_length = current_length + max_length
        if current_length > max_length:
            self.keys = self.keys[..., :max_length, :]
            self.values = self.values[..., :max_length, :]

    def batch_repeat_interleave(self, repeats: int) -> None:
        """Repeat the cache `repeats` times in the batch dimension."""
        if self.get_seq_length() == 0:
            return
        self.keys = self.keys.repeat_interleave(repeats, dim=0)
        self.values = self.values.repeat_interleave(repeats, dim=0)

    def batch_select_indices(self, indices: torch.Tensor) -> None:
        """Only keep the `indices` in the batch dimension of the cache."""
        if self.get_seq_length() == 0:
            return
        self.keys = self.keys[indices, ...]
        self.values = self.values[indices, ...]
|
| 375 |
+
|
| 376 |
|
| 377 |
class InfLLMv2CacheLayer(DynamicLayer):
|
| 378 |
def __init__(self):
|