Delete functions_2_patch.py
Browse files- functions_2_patch.py +0 -221
functions_2_patch.py
DELETED
|
@@ -1,221 +0,0 @@
|
|
| 1 |
-
import torch
|
| 2 |
-
import inspect
|
| 3 |
-
import importlib
|
| 4 |
-
|
| 5 |
-
from typing import Callable, Optional, Union, Any, List
|
| 6 |
-
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
|
| 7 |
-
from transformers.cache_utils import Cache
|
| 8 |
-
from transformers.processing_utils import Unpack
|
| 9 |
-
|
| 10 |
-
from .sep_cache_utils import SepCache
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
def truncate_input_ids_4_autoregression(input_ids, key_states):
    """Align `input_ids` with the key sequence length for autoregressive decoding.

    During generation the current `key_states` may cover fewer positions than the
    full `input_ids` (e.g. only the newly decoded tokens). In that case, keep only
    the trailing slice of `input_ids` matching the key sequence length.

    Args:
        input_ids: token id tensor; last dim is the sequence length.
        key_states: key tensor; second-to-last dim is the key sequence length.

    Returns:
        `input_ids` unchanged when lengths already match, otherwise the last
        `key_states.shape[-2]` positions of `input_ids`.
    """
    kv_len = key_states.shape[-2]
    ids_len = input_ids.shape[-1]
    if ids_len == kv_len:
        # Lengths already agree; nothing to trim.
        return input_ids
    # input_ids must cover at least as many positions as the keys.
    assert ids_len >= kv_len
    return input_ids[..., -kv_len:]
|
| 21 |
-
|
| 22 |
-
def llama_atten_forward(
    self,
    hidden_states: torch.Tensor,
    position_embeddings: tuple[torch.Tensor, torch.Tensor],
    attention_mask: Optional[torch.Tensor],
    past_key_value: Optional[Cache] = None,
    cache_position: Optional[torch.LongTensor] = None,
    **kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
    """Monkey-patch replacement for a Llama-style attention `forward` that routes
    the KV cache through `SepCache` instead of the default `Cache.update` path.

    Requires `past_key_value` to be a `SepCache` (asserted below). The helper
    functions (`apply_rotary_pos_emb`, `eager_attention_forward`, ...) are pulled
    from the module that defines the patched attention class at call time, so the
    same function can patch several model families.
    """
    # Everything except the hidden size, i.e. (batch, seq_len).
    input_shape = hidden_states.shape[:-1]

    # Different transformers versions/models name the per-head size differently.
    # NOTE(review): if neither attribute exists, `head_dim` is unbound and the
    # next line raises NameError — presumably all patched models have one of them.
    if hasattr(self, "head_dim"):
        head_dim = self.head_dim
    elif hasattr(self, "head_size"):
        head_dim = self.head_size

    hidden_shape = (*input_shape, -1, head_dim)

    # Project to Q/K/V and move the head axis in front of the sequence axis:
    # (batch, num_heads, seq_len, head_dim).
    query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
    key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
    value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

    ###########################SepCache########################
    # SepCache is mandatory here; its flags decide where positional encoding
    # (PE) shifting happens (outside vs. inside the cache update).
    assert isinstance(past_key_value, SepCache), f"`past_key_value` must be of the type: `SepCache`."
    APPLY_PE_SHIFT = past_key_value.APPLY_PE_SHIFT
    APPLY_PES_INSIDE = past_key_value.APPLY_PES_INSIDE
    ###########################################################

    ########################Monkey Patching####################
    # Resolve RoPE / attention helpers from the module that defines the class
    # being patched, so this forward works across model implementations.
    module = importlib.import_module(self.__module__)

    apply_rotary_pos_emb = module.apply_rotary_pos_emb
    rotate_half = module.rotate_half
    eager_attention_forward = module.eager_attention_forward
    ALL_ATTENTION_FUNCTIONS = module.ALL_ATTENTION_FUNCTIONS
    ###########################################################

    # When the cache does NOT shift PEs, apply standard RoPE here.
    if not APPLY_PE_SHIFT:
        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

    if past_key_value is not None:
        # ##################################################Default#########################################################
        # sin and cos are specific to RoPE models; cache_position needed for the static cache
        # cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
        # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
        # ##################################################################################################################

        ##################################################SepCache#########################################################
        # sin and cos are specific to RoPE models; position_ids needed for the static cache
        if APPLY_PE_SHIFT and (not APPLY_PES_INSIDE):
            ### At least the shifted `sin` and `cos` should be properly provided (not `None`).
            # NOTE(review): in this branch `sin`/`cos` were never unpacked (the
            # `not APPLY_PE_SHIFT` block above was skipped) and `cos_q`/`sin_q`
            # are never assigned anywhere in this function — this line looks
            # like it raises NameError unless the caller injects these names;
            # confirm against the SepCache integration docs.
            cache_kwargs = {"sin": sin, "cos": cos, "cos_q": cos_q, "sin_q": sin_q, "cache_position": cache_position, "partial_rotation_size": None }
        else:
            cache_kwargs = {}

        # `kwargs` is a parameter so it always exists in locals(); the
        # `flash_attn_kwargs` fallback covers patched signatures that use that
        # name instead. NOTE(review): the locals() probing is fragile — it
        # depends on exact parameter naming of the patched forward.
        if "kwargs" in locals():
            pass
        elif "flash_attn_kwargs" in locals():
            kwargs = flash_attn_kwargs
        else:
            raise NameError("`kwargs` or `flash_attn_kwargs` should be given and they need to contain `sepllm_kwargs` (which contains `input_ids`) and `position_ids`.")

        # SepCache needs the raw token ids to locate separator tokens; they are
        # smuggled in via kwargs either directly or inside `sepllm_kwargs`.
        if "input_ids" not in locals():
            if "input_ids" in kwargs:
                input_ids = kwargs.get("input_ids", None)
            else:
                sepllm_kwargs = kwargs.get("sepllm_kwargs", None)
                assert sepllm_kwargs is not None, f"`sepllm_kwargs` must be provided when `input_ids` is not given."
                input_ids = sepllm_kwargs.get("input_ids", None)

        assert input_ids is not None, f"`input_ids` must be properly provided directly or through `sepllm_kwargs` when calling `update()` in `SepCache`."

        if "position_ids" not in locals():
            position_ids = kwargs.get("position_ids")

        assert input_ids is not None, f"`input_ids` must be properly provided when calling `update()` in `SepCache`."
        # q_len > 1 distinguishes the prefilling pass from single-token decoding.
        bsz, q_len, _ = hidden_states.size()

        # Trim input_ids down to the key sequence length during decoding.
        input_ids = truncate_input_ids_4_autoregression(input_ids = input_ids, key_states = key_states )

        if APPLY_PE_SHIFT:
            # PE shifting inside the cache also rewrites the query states.
            key_states, value_states, query_states = past_key_value.update(
                                                        key_states = key_states,
                                                        value_states = value_states,
                                                        query_states = query_states,
                                                        input_ids = input_ids,
                                                        layer_idx = self.layer_idx,
                                                        position_ids = position_ids,
                                                        PREFILLING_FLAG = q_len > 1,
                                                        cache_kwargs = cache_kwargs )
        else:
            key_states, value_states = past_key_value.update(
                                            key_states = key_states,
                                            value_states = value_states,
                                            input_ids = input_ids,
                                            layer_idx = self.layer_idx,
                                            position_ids = position_ids,
                                            PREFILLING_FLAG = q_len > 1,
                                            cache_kwargs = cache_kwargs )

        # SepCache may have evicted positions; shrink the mask to match.
        seq_len = past_key_value.get_usable_length(self.layer_idx)

        if attention_mask is not None:
            attention_mask = attention_mask[..., :seq_len]
        ##################################################################################################################

    # Pick the attention kernel registered for the configured implementation
    # (eager / sdpa / flash-attention ...).
    attention_interface: Callable = eager_attention_forward
    if self.config._attn_implementation != "eager":
        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

    attn_output, attn_weights = attention_interface(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        dropout=0.0 if not self.training else self.attention_dropout,
        scaling=self.scaling,
        **kwargs,
    )

    # Merge heads back: (batch, seq_len, num_heads * head_dim), then project out.
    attn_output = attn_output.reshape(*input_shape, -1).contiguous()
    attn_output = self.o_proj(attn_output)
    return attn_output, attn_weights
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
def _validate_model_kwargs(self, model_kwargs: dict[str, Any]):
    """Validates model kwargs for generation. Generate argument typos will also be caught here.

    Patched copy of `GenerationMixin._validate_model_kwargs`: the only change
    (marked "SepCache" below) is that kwargs whose key contains "sep" are
    exempted from the unused-argument check, so SepCache-specific kwargs
    (e.g. `sepllm_kwargs`) can flow through `generate()` without raising.
    """
    # If a `Cache` instance is passed, checks whether the model is compatible with it
    if isinstance(model_kwargs.get("past_key_values", None), Cache) and not self._supports_cache_class:
        raise ValueError(
            f"{self.__class__.__name__} does not support an instance of `Cache` as `past_key_values`. Please "
            "check the model documentation for supported cache formats."
        )

    # Excludes arguments that are handled before calling any model function
    if self.config.is_encoder_decoder:
        for key in ["decoder_input_ids"]:
            model_kwargs.pop(key, None)

    unused_model_args = []
    # Anything `prepare_inputs_for_generation` accepts is considered "used".
    model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
    # `kwargs`/`model_kwargs` is often used to handle optional forward pass inputs like `attention_mask`. If
    # `prepare_inputs_for_generation` doesn't accept them, then a stricter check can be made ;)
    if "kwargs" in model_args or "model_kwargs" in model_args:
        model_args |= set(inspect.signature(self.forward).parameters)

    # Encoder-Decoder models may also need Encoder arguments from `model_kwargs`
    if self.config.is_encoder_decoder:
        base_model = getattr(self, self.base_model_prefix, None)

        # allow encoder kwargs
        encoder = getattr(self, "encoder", None)
        # `MusicgenForConditionalGeneration` has `text_encoder` and `audio_encoder`.
        # Also, it has `base_model_prefix = "encoder_decoder"` but there is no `self.encoder_decoder`
        # TODO: A better way to handle this.
        if encoder is None and base_model is not None:
            encoder = getattr(base_model, "encoder", None)

        if encoder is not None:
            encoder_model_args = set(inspect.signature(encoder.forward).parameters)
            model_args |= encoder_model_args

        # allow decoder kwargs
        decoder = getattr(self, "decoder", None)
        if decoder is None and base_model is not None:
            decoder = getattr(base_model, "decoder", None)

        if decoder is not None:
            decoder_model_args = set(inspect.signature(decoder.forward).parameters)
            # Decoder params are exposed to callers with a `decoder_` prefix.
            model_args |= {f"decoder_{x}" for x in decoder_model_args}

    for key, value in model_kwargs.items():
        # #############################Default###########################
        # if value is not None and key not in model_args:
        #     unused_model_args.append(key)
        # ###############################################################

        ###############################SepCache###########################
        # Same check as upstream, but additionally whitelist any kwarg whose
        # key contains "sep" (case-insensitive) so SepCache kwargs pass through.
        if (value is not None) and (key not in model_args) and ("sep" not in str(key).lower()):
            unused_model_args.append(key)
        ###################################################################

    if unused_model_args:
        raise ValueError(
            f"The following `model_kwargs` are not used by the model: {unused_model_args} (note: typos in the"
            " generate arguments will also show up in this list)"
        )
|
| 220 |
-
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|