import torch
import inspect
import importlib

from typing import Callable, Optional, Union, Any, List
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.cache_utils import Cache
from transformers.processing_utils import Unpack

from .sep_cache_utils import SepCache

def truncate_input_ids_4_autoregression(input_ids, key_states):
    """Keep only the trailing `input_ids` that correspond to the current `key_states` length."""
    if input_ids.shape[-1] != key_states.shape[-2]:
        assert input_ids.shape[-1] >= key_states.shape[-2]
        truncated_input_ids = input_ids[..., -key_states.shape[-2]:]
        return truncated_input_ids
    else:
        return input_ids

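# Illustrative example (not part of the original code): during single-token decoding the full
# prompt ids are longer than the one new key position, so only the last id is kept. Shapes
# below are hypothetical.
#
#   ids = torch.arange(10).view(1, 10)            # (bsz=1, prompt_len=10)
#   k = torch.zeros(1, 8, 1, 64)                  # (bsz, num_heads, q_len=1, head_dim)
#   truncate_input_ids_4_autoregression(ids, k)   # -> tensor([[9]]), shape (1, 1)
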
def llama_atten_forward(
    self,
    hidden_states: torch.Tensor,
    position_embeddings: tuple[torch.Tensor, torch.Tensor],
    attention_mask: Optional[torch.Tensor],
    past_key_value: Optional[Cache] = None,
    cache_position: Optional[torch.LongTensor] = None,
    **kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
    input_shape = hidden_states.shape[:-1]

    # Different transformers versions expose the per-head dimension under different attribute names.
    if hasattr(self, "head_dim"):
        head_dim = self.head_dim
    elif hasattr(self, "head_size"):
        head_dim = self.head_size
    else:
        raise AttributeError("The attention module must expose either `head_dim` or `head_size`.")

    hidden_shape = (*input_shape, -1, head_dim)

    query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
    key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
    value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
    assert isinstance(past_key_value, SepCache), "`past_key_value` must be of the type: `SepCache`."
    APPLY_PE_SHIFT = past_key_value.APPLY_PE_SHIFT
    APPLY_PES_INSIDE = past_key_value.APPLY_PES_INSIDE

    # Fetch the rotary-embedding and attention helpers from the model's own module so that this
    # patched forward stays consistent with the installed transformers version.
    module = importlib.import_module(self.__module__)
    apply_rotary_pos_emb = module.apply_rotary_pos_emb
    rotate_half = module.rotate_half
    eager_attention_forward = module.eager_attention_forward
    ALL_ATTENTION_FUNCTIONS = module.ALL_ATTENTION_FUNCTIONS
    if not APPLY_PE_SHIFT:
        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

    if past_key_value is not None:
        if APPLY_PE_SHIFT and (not APPLY_PES_INSIDE):
            # NOTE (assumption): when the positional-encoding shift is applied outside SepCache,
            # `cos`/`sin` and the query-side `cos_q`/`sin_q` are expected to have been computed
            # by the PE-shift path before this point.
            cache_kwargs = {"sin": sin, "cos": cos, "cos_q": cos_q, "sin_q": sin_q, "cache_position": cache_position, "partial_rotation_size": None}
        else:
            cache_kwargs = {}
        # Depending on the transformers version, the extra forward arguments may arrive either as
        # `kwargs` or as `flash_attn_kwargs`.
        if "kwargs" in locals():
            pass
        elif "flash_attn_kwargs" in locals():
            kwargs = flash_attn_kwargs
        else:
            raise NameError("`kwargs` or `flash_attn_kwargs` should be given and they need to contain `sepllm_kwargs` (which contains `input_ids`) and `position_ids`.")

        # `SepCache.update()` needs the token ids to locate separator tokens, so recover `input_ids`
        # either directly from the kwargs or from the nested `sepllm_kwargs`.
        if "input_ids" not in locals():
            if "input_ids" in kwargs:
                input_ids = kwargs.get("input_ids", None)
            else:
                sepllm_kwargs = kwargs.get("sepllm_kwargs", None)
                assert sepllm_kwargs is not None, "`sepllm_kwargs` must be provided when `input_ids` is not given."
                input_ids = sepllm_kwargs.get("input_ids", None)

        assert input_ids is not None, "`input_ids` must be properly provided directly or through `sepllm_kwargs` when calling `update()` in `SepCache`."

        if "position_ids" not in locals():
            position_ids = kwargs.get("position_ids")

        assert input_ids is not None, "`input_ids` must be properly provided when calling `update()` in `SepCache`."
        bsz, q_len, _ = hidden_states.size()

        # During decoding, keep only the trailing ids that match the current key length.
        input_ids = truncate_input_ids_4_autoregression(input_ids=input_ids, key_states=key_states)
        if APPLY_PE_SHIFT:
            # When SepCache handles the positional-encoding shift, it also returns the (shifted) query states.
            key_states, value_states, query_states = past_key_value.update(
                key_states=key_states,
                value_states=value_states,
                query_states=query_states,
                input_ids=input_ids,
                layer_idx=self.layer_idx,
                position_ids=position_ids,
                PREFILLING_FLAG=q_len > 1,
                cache_kwargs=cache_kwargs,
            )
        else:
            key_states, value_states = past_key_value.update(
                key_states=key_states,
                value_states=value_states,
                input_ids=input_ids,
                layer_idx=self.layer_idx,
                position_ids=position_ids,
                PREFILLING_FLAG=q_len > 1,
                cache_kwargs=cache_kwargs,
            )

        seq_len = past_key_value.get_usable_length(self.layer_idx)

        # The compressed cache may hold fewer positions than the raw sequence, so trim the mask to match.
        if attention_mask is not None:
            attention_mask = attention_mask[..., :seq_len]
    attention_interface: Callable = eager_attention_forward
    if self.config._attn_implementation != "eager":
        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

    attn_output, attn_weights = attention_interface(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        dropout=0.0 if not self.training else self.attention_dropout,
        scaling=self.scaling,
        **kwargs,
    )

    attn_output = attn_output.reshape(*input_shape, -1).contiguous()
    attn_output = self.o_proj(attn_output)
    return attn_output, attn_weights
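# --------------------------------------------------------------------------------------------
# Minimal usage sketch (illustrative, assumption-labeled): one way to install the patched
# forward on a Llama model and decode with a `SepCache`. The checkpoint name is a placeholder,
# the `SepCache(...)` constructor arguments are defined in `sep_cache_utils`, and how
# `sepllm_kwargs` is propagated down to the attention layers depends on the installed
# transformers version. The `_validate_model_kwargs` override below must also be bound so that
# `sepllm_kwargs` passes generation-argument validation.
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   from transformers.models.llama.modeling_llama import LlamaAttention
#
#   LlamaAttention.forward = llama_atten_forward                       # patch every attention layer
#
#   model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")   # placeholder checkpoint
#   tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
#
#   inputs = tokenizer("A long prompt ...", return_tensors="pt")
#   past_key_values = SepCache(...)                                    # see `sep_cache_utils` for arguments
#   out = model.generate(**inputs, past_key_values=past_key_values,
#                        sepllm_kwargs={"input_ids": inputs["input_ids"]})
# --------------------------------------------------------------------------------------------
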
def _validate_model_kwargs(self, model_kwargs: dict[str, Any]):
    """Validates model kwargs for generation. Generate argument typos will also be caught here."""
    # If a `Cache` instance is passed, check that the model actually supports the new cache format.
    if isinstance(model_kwargs.get("past_key_values", None), Cache) and not self._supports_cache_class:
        raise ValueError(
            f"{self.__class__.__name__} does not support an instance of `Cache` as `past_key_values`. Please "
            "check the model documentation for supported cache formats."
        )

    # Exclude arguments that are handled before calling any model function.
    if self.config.is_encoder_decoder:
        for key in ["decoder_input_ids"]:
            model_kwargs.pop(key, None)

    unused_model_args = []
    model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
    # If `prepare_inputs_for_generation` accepts a catch-all argument, also allow anything that the
    # model's `forward` accepts.
    if "kwargs" in model_args or "model_kwargs" in model_args:
        model_args |= set(inspect.signature(self.forward).parameters)

    # Encoder-decoder models may also accept encoder/decoder arguments through `model_kwargs`.
    if self.config.is_encoder_decoder:
        base_model = getattr(self, self.base_model_prefix, None)

        encoder = getattr(self, "encoder", None)
        # Some composite models only expose the encoder on their base model.
        if encoder is None and base_model is not None:
            encoder = getattr(base_model, "encoder", None)

        if encoder is not None:
            encoder_model_args = set(inspect.signature(encoder.forward).parameters)
            model_args |= encoder_model_args

        decoder = getattr(self, "decoder", None)
        if decoder is None and base_model is not None:
            decoder = getattr(base_model, "decoder", None)

        if decoder is not None:
            decoder_model_args = set(inspect.signature(decoder.forward).parameters)
            model_args |= {f"decoder_{x}" for x in decoder_model_args}

    for key, value in model_kwargs.items():
        # SepLLM change: kwargs whose names contain "sep" (e.g. `sepllm_kwargs`) are allowed through
        # even though they are not declared in the model's signatures.
        if (value is not None) and (key not in model_args) and ("sep" not in str(key).lower()):
            unused_model_args.append(key)

    if unused_model_args:
        raise ValueError(
            f"The following `model_kwargs` are not used by the model: {unused_model_args} (note: typos in the"
            " generate arguments will also show up in this list)"
        )
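# --------------------------------------------------------------------------------------------
# Minimal usage sketch (illustrative, assumption-labeled): binding the relaxed
# `_validate_model_kwargs` onto a loaded model instance so that `generate(..., sepllm_kwargs=...)`
# is not rejected as an unused argument. `model`, `inputs`, and `past_key_values` refer to the
# objects from the sketch above.
#
#   import types
#
#   model._validate_model_kwargs = types.MethodType(_validate_model_kwargs, model)
#   out = model.generate(**inputs, past_key_values=past_key_values,
#                        sepllm_kwargs={"input_ids": inputs["input_ids"]})
# --------------------------------------------------------------------------------------------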