Delete folder custom_generate/.ipynb_checkpoints with huggingface_hub

Browse files

Files changed (1) hide show

custom_generate/.ipynb_checkpoints/generate-checkpoint.py +0 -245

custom_generate/.ipynb_checkpoints/generate-checkpoint.py DELETED Viewed

@@ -1,245 +0,0 @@
-# Copyright 2025 China Merchants Bank. All rights reserved.
-#
-# Licensed under the MIT License (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://mit-license.org
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-from transformers.cache_utils import DynamicCache
-from typing import Any, Dict, List, Optional, Tuple
-class LagKVCache(DynamicCache):
-    """
-    A KV cache compression algorithm, as described in the [LagKV paper](https://arxiv.org/abs/2504.04704).
-    Like SinkCache, the algorithm keeps the attention sink tokens and a sliding window, but it additionally retains
-    selected tokens from the middle of the sequence.
-    It allows the model to generate with less memory and faster decoding speed.
-    The model keeps most of its information retrieval capability under compression, whereas SinkCache loses it
-    completely.
-    It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
-    `[batch_size, num_heads, seq_len, head_dim]`.
-    For the chunked prefilling, see https://github.com/AI-Lab-China-Merchants-Bank/LagKV.
-    Parameters:
-        _distributed_cache_data:
-            Inherited from DynamicCache.
-        ratio (`float`):
-            The retention ratio of tokens in the middle chunks.
-        sink_size (`int`):
-            The number of sink tokens.
-        lag_size (`int`):
-            The size of the partition. The subsequent partition serves as a reference for the prior one.
-        score_v_ratio (`float`):
-            The ratio multiplied to the score of Value states.
-        skip_layer_idx (`Optional[List[int]]`):
-            A list of layer indices that skip the compression.
-    Example:
-        ```python
-        >>> from transformers import AutoTokenizer, AutoModelForCausalLM
-        >>> # `LagKVCache` is the class defined in this module
-        >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
-        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
-        >>> inputs = tokenizer(text="My name is Qwen2", return_tensors="pt")
-        >>> # Prepare a cache class and pass it to model's forward
-        >>> past_key_values = LagKVCache(ratio=0.25, lag_size=128)
-        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
-        >>> outputs.past_key_values # access cache filled with key/values from generation
-        LagKVCache()
-        ```
-    """
-    def __init__(
-        self,
-        _distributed_cache_data=None,
-        ratio: float = 0.25,
-        sink_size: int = 16,
-        lag_size: int = 1024,
-        score_v_ratio: float = 1.0,
-        skip_layer_idx: Optional[List[int]] = None,
-    ):
-        super().__init__(_distributed_cache_data)
-        self.ratio = ratio
-        self.sink_size: int = sink_size
-        self.lag_size: int = lag_size
-        self.score_v_ratio: float = score_v_ratio
-        self.skip_layer_idx: List[int] = skip_layer_idx if skip_layer_idx is not None else []
-        # Per-layer length of the leading part of the cache that is already final: it starts at `sink_size` and
-        # grows by the number of selected tokens after every compression.
-        self._compressed_len: List[int] = []
-    def update(
-        self,
-        key_states: torch.Tensor,
-        value_states: torch.Tensor,
-        layer_idx: int,
-        cache_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
-        Parameters:
-            key_states (`torch.Tensor`):
-                The new key states to cache.
-            value_states (`torch.Tensor`):
-                The new value states to cache.
-            layer_idx (`int`):
-                The index of the layer to cache the states for.
-            cache_kwargs (`Dict[str, Any]`, `optional`):
-                Additional arguments for the cache subclass. This method does not read them.
-        Return:
-            A tuple containing the updated key and value states. For layers that are compressed, these are the
-            states as they were before the compression triggered by this call.
-        """
-        if key_states is not None:
-            # Update the number of seen tokens, once per step (on the first layer). This lives inside the `None`
-            # guard so that `key_states.shape` is never read when `key_states` is `None`.
-            if layer_idx == 0:
-                self._seen_tokens += key_states.shape[-2]
-            # Update the cache
-            if len(self.key_cache) <= layer_idx:
-                # There may be skipped layers, fill them with empty lists
-                for _ in range(len(self.key_cache), layer_idx):
-                    self.key_cache.append([])
-                    self.value_cache.append([])
-                    self._compressed_len.append(self.sink_size)
-                self.key_cache.append(key_states)
-                self.value_cache.append(value_states)
-                self._compressed_len.append(self.sink_size)
-            elif (
-                len(self.key_cache[layer_idx]) == 0
-            ):  # fills previously skipped layers; checking for tensor causes errors
-                self.key_cache[layer_idx] = key_states
-                self.value_cache[layer_idx] = value_states
-            else:
-                self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
-                self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
-            if layer_idx not in self.skip_layer_idx:
-                return self._compress_kv_by_lag(layer_idx)
-        return self.key_cache[layer_idx], self.value_cache[layer_idx]
-    def _get_states_score(self, base_len, in_size, end_idx, value):
-        """
-        Partition the states, then score every token of a partition against the partition that follows it.
-        Parameters:
-            base_len: Index along the sequence axis where the region to partition starts.
-            in_size: Size of the cache tensor; indices 0, 1 and -1 are used as batch, heads and head dimension.
-            end_idx: Exclusive end of the region; `end_idx - base_len` must be a multiple of `lag_size`.
-            value: The key or value states to score.
-        Return:
-            The scores, of shape `[batch_size, num_heads, partition_num - 1, lag_size]`.
-        """
-        # [batch_size, num_heads, seq_len, head_dim]
-        target_v = value[:, :, base_len:end_idx]
-        # [batch_size, num_heads, partition_num, lag_size, head_dim]
-        target_v = target_v.view(in_size[0], in_size[1], -1, self.lag_size, in_size[-1])
-        # Partition `p + 1` is the reference for partition `p`, so the last partition gets no score.
-        ref = target_v[:, :, 1:, :, :]
-        v = target_v[:, :, :-1, :, :]
-        # Per-channel min/max of the reference partition, broadcast back over the `lag_size` axis.
-        min_r = ref.min(dim=-2).values.unsqueeze(-2).expand(-1, -1, -1, self.lag_size, -1)
-        max_r = ref.max(dim=-2).values.unsqueeze(-2).expand(-1, -1, -1, self.lag_size, -1)
-        # NOTE(review): a channel that is constant across a reference partition gives `max_r == min_r` and a zero
-        # denominator here - confirm whether an epsilon is wanted.
-        score = ((v - min_r) / (max_r - min_r)).std(dim=-1).softmax(dim=-1)
-        return score
-    def _modify_kv(self, value, base_len, end_idx, selected_idx, tail_len):
-        """Rebuild `value` as its first `base_len` tokens + the selected middle tokens + its last `tail_len` tokens."""
-        # idx is offset by base_len
-        selected_value = torch.gather(value[:, :, base_len:end_idx], -2, selected_idx)
-        value = torch.cat((value[:, :, :base_len], selected_value, value[:, :, -tail_len:]), dim=-2)
-        return value
-    def _compress_algo(self, layer_idx, base_len):
-        """
-        Calculate the scores of KV tokens in each head and partition, then keep the top-scoring ones. See the paper.
-        The computation overhead of top-k is significantly reduced by partitioning.
-        """
-        in_size = self.key_cache[layer_idx].size()
-        # Only whole partitions are scored: `end_idx` is the end of the last complete partition.
-        end_idx = base_len + ((in_size[-2] - base_len) // self.lag_size) * self.lag_size
-        # [batch_size, num_heads, partition_num - 1, lag_size]
-        key_score = self._get_states_score(base_len, in_size, end_idx, self.key_cache[layer_idx])
-        value_score = self._get_states_score(base_len, in_size, end_idx, self.value_cache[layer_idx])
-        score = key_score + value_score * self.score_v_ratio
-        # you may need to sort the index for some cases
-        selected_idx = torch.topk(score, int(self.ratio * self.lag_size), dim=-1).indices
-        # The top-k indices are local to their partition: shift partition `p` by `p * lag_size` so that they index
-        # into the `base_len:end_idx` slice. A single broadcast add replaces a Python loop over the partitions.
-        partition_offsets = torch.arange(
-            selected_idx.size(2), device=selected_idx.device, dtype=selected_idx.dtype
-        ).view(1, 1, -1, 1)
-        selected_idx = selected_idx + partition_offsets * self.lag_size
-        selected_idx = selected_idx.reshape(in_size[0], in_size[1], -1).unsqueeze(-1).expand(-1, -1, -1, in_size[-1])
-        new_base_len = base_len + selected_idx.size()[-2]
-        # always keep the last window
-        tail_len = self.lag_size + in_size[-2] - end_idx
-        self.key_cache[layer_idx] = self._modify_kv(
-            self.key_cache[layer_idx], base_len, end_idx, selected_idx, tail_len
-        )
-        self.value_cache[layer_idx] = self._modify_kv(
-            self.value_cache[layer_idx], base_len, end_idx, selected_idx, tail_len
-        )
-        self._compressed_len[layer_idx] = new_base_len
-    def _compress_kv_by_lag(self, layer_idx):
-        """Return the current KV states of the layer, then compress the stored cache: it is used, then compressed."""
-        kv_size = self.key_cache[layer_idx].size()
-        base_len = self._compressed_len[layer_idx]
-        # Grab the references before compressing, so the caller still gets the full states for this step.
-        keys_to_return, values_to_return = self.key_cache[layer_idx], self.value_cache[layer_idx]
-        # At least two whole partitions are needed: one to compress and the next one as its reference.
-        if kv_size[-2] >= base_len + 2 * self.lag_size:
-            self._compress_algo(layer_idx, base_len)
-        return keys_to_return, values_to_return
-# Generation arguments that `generate` below rejects. The list mirrors the one of the sink_cache template this
-# function is derived from.
-# NOTE(review): confirm the entries against what LagKVCache actually supports.
-UNSUPPORTED_GENERATION_ARGS = [
-    "cache_implementation",
-    "cache_config",
-    "return_legacy_cache",
-    "num_beams",
-    "compile_config",
-    "assistant_model",
-]
-def generate(model, lag_ratio=0.5, lag_sink_size=16, lag_size=128, **kwargs):
-    """Custom generate function for LagKVCache.
-    (template from https://huggingface.co/transformers-community/sink_cache)
-    Args:
-        model (`PreTrainedModel`):
-            The model to generate from.
-        lag_ratio (`float`):
-            The retention ratio of tokens in the middle chunks.
-        lag_sink_size (`int`):
-            The number of sink tokens.
-        lag_size (`int`):
-            The size of the partition. See the original paper for more information.
-        **kwargs:
-            Forwarded to `model.generate`. `past_key_values`, if given, must be a `LagKVCache` instance.
-    Returns:
-        Whatever `model.generate` returns.
-    Raises:
-        ValueError: If an unsupported generation argument is set, if the model is an encoder-decoder model, or if
-            `past_key_values` is not a `LagKVCache` instance.
-    """
-    # The file does not import `GenerationConfig` at module level, so it is imported here.
-    from transformers import GenerationConfig
-    # 1. General sanity checks
-    # 1.a. A few arguments are not allowed, especially arguments that control caches.
-    generation_config = kwargs.get("generation_config")
-    default_global_generation_config = GenerationConfig()
-    default_model_generation_config = model.generation_config
-    for arg in UNSUPPORTED_GENERATION_ARGS:
-        # `getattr` defaults to `None` so that an argument which is not a generation config attribute is simply
-        # treated as "not customized" instead of raising `AttributeError`.
-        has_custom_gen_config_arg = (
-            generation_config is not None
-            # = and not (match global default or match model-specific default)
-            and not (
-                getattr(default_model_generation_config, arg, None) == getattr(generation_config, arg, None)
-                or getattr(default_global_generation_config, arg, None) == getattr(generation_config, arg, None)
-            )
-        )
-        kwargs_has_arg = arg in kwargs and kwargs[arg] is not None
-        if kwargs_has_arg or has_custom_gen_config_arg:
-            raise ValueError(
-                f"`{arg}` is set, but it's not supported in this custom generate function. List of "
-                f"unsupported arguments: {UNSUPPORTED_GENERATION_ARGS}"
-            )
-    # 1.b. The model must be decoder-only
-    if model.config.is_encoder_decoder:
-        raise ValueError("This custom generate function only works with decoder-only models")
-    # 1.c. compatibility with transformers 4.52: we must pop `custom_generate` from kwargs, otherwise it will result
-    # in an infinite loop when we call `model.generate`. This is solved in transformers 4.53.
-    kwargs.pop("custom_generate", None)
-    # 2. Generate with LagKVCache
-    # 2.a. prepare the cache, if it was not passed.
-    past_key_values = kwargs.pop("past_key_values", None)
-    if past_key_values is None:
-        past_key_values = LagKVCache(ratio=lag_ratio, sink_size=lag_sink_size, lag_size=lag_size)
-    elif not isinstance(past_key_values, LagKVCache):
-        raise ValueError(f"`past_key_values` must be a `LagKVCache` instance, got a {type(past_key_values)} instance")
-    # 2.b. generate with the cache
-    generation_outputs = model.generate(**kwargs, past_key_values=past_key_values, use_cache=True)
-    return generation_outputs