AFM-4.5B-Base-KDA-Only / modeling_arcee_kda.py

Upload ArceeKDAForCausalLM

98777af verified 4 months ago

23.1 kB

	# coding=utf-8
	# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
	#
	# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
	# and OPT implementations in this library. It has been modified from its
	# original forms to accommodate minor architectural differences compared
	# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import math
	from typing import Callable, List, Optional, Tuple, Union, Any

	import torch
	from torch import nn
	from torch.nn import functional as F
	from einops import rearrange

	from transformers.activations import ACT2FN
	from transformers.generation import GenerationMixin
	from transformers.modeling_attn_mask_utils import AttentionMaskConverter
	from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
	from transformers.modeling_outputs import (
	BaseModelOutputWithPast,
	CausalLMOutputWithPast,
	)
	from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
	from transformers.processing_utils import Unpack
	from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
	from transformers.utils import (
	TransformersKwargs,
	add_start_docstrings,
	logging,
	replace_return_docstrings,
	auto_docstring,
	)
	from transformers.utils.generic import check_model_inputs
	from .configuration_arcee_kda import ArceeKDAConfig


	try:
	from fla.layers.kda import KimiDeltaAttention
	from fla.models.utils import Cache
	except ImportError as e:
	print(e)
	raise ImportError("Plese run `pip install -U flash-linear-attention fla-core`")

	logger = logging.get_logger(__name__)

	_CHECKPOINT_FOR_DOC = "arcee-train/Arcee-4B-Base"
	_CONFIG_FOR_DOC = "ArceeKDAConfig"


	class ArceeRMSNorm(nn.Module):
	def __init__(self, hidden_size, eps=1e-6):
	"""
	ArceeRMSNorm is equivalent to T5LayerNorm
	"""
	super().__init__()
	self.weight = nn.Parameter(torch.ones(hidden_size))
	self.variance_epsilon = eps

	def forward(self, hidden_states):
	input_dtype = hidden_states.dtype
	hidden_states = hidden_states.to(torch.float32)
	variance = hidden_states.pow(2).mean(-1, keepdim=True)
	hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
	return self.weight * hidden_states.to(input_dtype)

	def extra_repr(self):
	return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


	ALL_LAYERNORM_LAYERS.append(ArceeRMSNorm)


	class ArceeMLP(nn.Module):
	def __init__(self, config):
	super().__init__()
	self.config = config
	self.hidden_size = config.hidden_size
	self.intermediate_size = config.intermediate_size
	self.up_proj = nn.Linear(
	self.hidden_size, self.intermediate_size, bias=config.mlp_bias
	)
	self.down_proj = nn.Linear(
	self.intermediate_size, self.hidden_size, bias=config.mlp_bias
	)
	self.act_fn = ACT2FN[config.hidden_act]

	def forward(self, x):
	down_proj = self.down_proj(self.act_fn(self.up_proj(x)))
	return down_proj


	def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
	"""
	This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
	num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
	"""
	batch, num_key_value_heads, slen, head_dim = hidden_states.shape
	if n_rep == 1:
	return hidden_states
	hidden_states = hidden_states[:, :, None, :, :].expand(
	batch, num_key_value_heads, n_rep, slen, head_dim
	)
	return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


	def eager_attention_forward(
	module: nn.Module,
	query: torch.Tensor,
	key: torch.Tensor,
	value: torch.Tensor,
	attention_mask: Optional[torch.Tensor],
	scaling: float,
	dropout: float = 0.0,
	**kwargs,
	):
	key_states = repeat_kv(key, module.num_key_value_groups)
	value_states = repeat_kv(value, module.num_key_value_groups)

	attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
	if attention_mask is not None:
	causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
	attn_weights = attn_weights + causal_mask

	attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(
	query.dtype
	)
	attn_weights = nn.functional.dropout(
	attn_weights, p=dropout, training=module.training
	)
	attn_output = torch.matmul(attn_weights, value_states)
	attn_output = attn_output.transpose(1, 2).contiguous()

	return attn_output, attn_weights


	class ArceeNopeAttention(nn.Module):
	def __init__(self, config: ArceeKDAConfig, layer_idx: int):
	super().__init__()
	self.config = config
	self.layer_idx = layer_idx
	self.head_dim = getattr(
	config, "head_dim", config.hidden_size // config.num_attention_heads
	)
	self.num_key_value_groups = (
	config.num_attention_heads // config.num_key_value_heads
	)
	self.scaling = self.head_dim**-0.5
	self.attention_dropout = config.attention_dropout
	self.is_causal = True

	self.q_proj = nn.Linear(
	config.hidden_size,
	config.num_attention_heads * self.head_dim,
	bias=config.attention_bias,
	)
	self.k_proj = nn.Linear(
	config.hidden_size,
	config.num_key_value_heads * self.head_dim,
	bias=config.attention_bias,
	)
	self.v_proj = nn.Linear(
	config.hidden_size,
	config.num_key_value_heads * self.head_dim,
	bias=config.attention_bias,
	)
	self.o_proj = nn.Linear(
	config.num_attention_heads * self.head_dim,
	config.hidden_size,
	bias=config.attention_bias,
	)

	self.gate_proj = nn.Linear(
	config.hidden_size, config.num_attention_heads * self.head_dim, bias=False
	)

	def forward(
	self,
	hidden_states: torch.Tensor,
	attention_mask: Optional[torch.Tensor],
	past_key_values: Optional[Cache] = None,
	**kwargs: Unpack[FlashAttentionKwargs],
	) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
	input_shape = hidden_states.shape[:-1]
	hidden_shape = (*input_shape, -1, self.head_dim)

	query_states = self.q_proj(hidden_states).view(hidden_shape)
	key_states = self.k_proj(hidden_states).view(hidden_shape)
	value_states = self.v_proj(hidden_states).view(hidden_shape)
	gate_states = self.gate_proj(hidden_states)

	if past_key_values is not None:
	cache_has_content = past_key_values.get_seq_length(self.layer_idx) > 0
	k_cached, v_cached = past_key_values.update(
	attn_state=(key_states.flatten(-2, -1), value_states.flatten(-2, -1)),
	layer_idx=self.layer_idx,
	offset=hidden_states.shape[1],
	cache_kwargs=dict(window_size=None),
	)["attn_state"]
	if cache_has_content:
	batch_size = key_states.shape[0]
	key_states = k_cached.view(
	batch_size, -1, self.config.num_key_value_heads, self.head_dim
	)
	value_states = v_cached.view(
	batch_size, -1, self.config.num_key_value_heads, self.head_dim
	)

	attention_interface: Callable = eager_attention_forward
	if self.config._attn_implementation != "eager":
	attention_interface = ALL_ATTENTION_FUNCTIONS[
	self.config._attn_implementation
	]

	attn_output, attn_weights = attention_interface(
	self,
	query_states.transpose(1, 2),
	key_states.transpose(1, 2),
	value_states.transpose(1, 2),
	attention_mask,
	dropout=0.0 if not self.training else self.attention_dropout,
	scaling=self.scaling,
	**kwargs,
	)

	attn_output = attn_output.reshape(*input_shape, -1).contiguous()
	attn_output = attn_output * F.sigmoid(gate_states)
	attn_output = self.o_proj(attn_output)
	return attn_output, attn_weights, past_key_values


	class ArceeKDADecoderLayer(nn.Module):
	def __init__(self, config: ArceeKDAConfig, layer_idx: int):
	super().__init__()
	self.hidden_size = config.hidden_size

	if config.is_kda_layer(layer_idx):
	self.self_attn = KimiDeltaAttention(
	layer_idx=layer_idx,
	hidden_size=config.hidden_size,
	**config.linear_attn_config,
	)
	self.is_linear_attn = True
	else:
	self.self_attn = ArceeNopeAttention(config=config, layer_idx=layer_idx)
	self.is_linear_attn = False

	self.mlp = ArceeMLP(config)
	self.input_layernorm = ArceeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
	self.post_attention_layernorm = ArceeRMSNorm(
	config.hidden_size, eps=config.rms_norm_eps
	)

	def forward(
	self,
	hidden_states: torch.Tensor,
	attention_mask: Optional[torch.Tensor] = None,
	past_key_values: Optional[Cache] = None,
	use_cache: Optional[bool] = False,
	**kwargs: Unpack[FlashAttentionKwargs],
	) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
	residual = hidden_states

	hidden_states = self.input_layernorm(hidden_states)

	# Self Attention
	hidden_states, attn_weights, past_key_values = self.self_attn(
	hidden_states=hidden_states,
	attention_mask=attention_mask,
	past_key_values=past_key_values,
	use_cache=use_cache,
	**kwargs,
	)
	hidden_states = residual + hidden_states

	# Fully Connected
	residual = hidden_states
	hidden_states = self.post_attention_layernorm(hidden_states)
	hidden_states = self.mlp(hidden_states)
	hidden_states = residual + hidden_states

	return (hidden_states, attn_weights, past_key_values)


	Arcee_START_DOCSTRING = r"""
	This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
	library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
	etc.)

	This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
	Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
	and behavior.

	Parameters:
	config ([`ArceeKDAConfig`]):
	Model configuration class with all the parameters of the model. Initializing with a config file does not
	load the weights associated with the model, only the configuration. Check out the
	[`~PreTrainedModel.from_pretrained`] method to load the model weights.
	"""


	@add_start_docstrings(
	"The bare Arcee-KDA Model outputting raw hidden-states without any specific head on top.",
	Arcee_START_DOCSTRING,
	)
	class ArceeKDAPreTrainedModel(PreTrainedModel):
	config_class = ArceeKDAConfig
	base_model_prefix = "model"
	supports_gradient_checkpointing = True
	_no_split_modules = ["ArceeKDADecoderLayer"]
	_skip_keys_device_placement = ["past_key_values"]
	_supports_flash_attn_2 = True
	_supports_sdpa = True
	_supports_flex_attn = True
	_supports_attention_backend = True
	_can_record_outputs = {
	"hidden_states": ArceeKDADecoderLayer,
	"attentions": ArceeNopeAttention,
	}
	_is_stateful = True
	_supports_cache_class = True

	def _init_weights(self, module):
	std = self.config.initializer_range
	if (
	isinstance(module, KimiDeltaAttention)
	and next(module.parameters()).device.type != "meta"
	):
	with torch.no_grad():
	module.A_log.copy_(nn.init.uniform_(module.A_log, a=1, b=16).log())
	dt = torch.exp(
	nn.init.uniform_(module.dt_bias) * (math.log(0.1) - math.log(0.001))
	+ math.log(0.001),
	).clamp(min=1e-4)
	inv_dt = dt + torch.log(-torch.expm1(-dt))
	module.dt_bias.copy_(inv_dt)
	module.dt_bias._is_hf_initialized = True
	if isinstance(module, (nn.Linear, nn.Conv1d)):
	# Slightly different from the TF version which uses truncated_normal for initialization
	# cf https://github.com/pytorch/pytorch/pull/5617
	nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
	if module.bias is not None and not getattr(
	module.bias, "_is_hf_initialized", False
	):
	nn.init.zeros_(module.bias)
	elif isinstance(module, nn.Embedding):
	nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
	elif hasattr(module, "reset_parameters"):
	module.reset_parameters()


	@add_start_docstrings(
	"The bare Arcee-KDA Model outputting raw hidden-states without any specific head on top.",
	Arcee_START_DOCSTRING,
	)
	class ArceeKDAModel(ArceeKDAPreTrainedModel):
	"""
	Transformer decoder consisting of config.num_hidden_layers layers. Each layer is a [`ArceeKDADecoderLayer`]

	Args:
	config: ArceeKDAConfig
	"""

	def __init__(self, config: ArceeKDAConfig):
	super().__init__(config)
	self.padding_idx = config.pad_token_id
	self.vocab_size = config.vocab_size

	self.embed_tokens = nn.Embedding(
	config.vocab_size, config.hidden_size, self.padding_idx
	)
	self.layers = nn.ModuleList(
	[
	ArceeKDADecoderLayer(config, layer_idx)
	for layer_idx in range(config.num_hidden_layers)
	]
	)
	self.norm = ArceeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
	self.gradient_checkpointing = False

	# Initialize weights and apply final processing
	self.post_init()

	def get_input_embeddings(self):
	return self.embed_tokens

	def set_input_embeddings(self, value):
	self.embed_tokens = value

	@auto_docstring
	def forward(
	self,
	input_ids: Optional[torch.LongTensor] = None,
	attention_mask: Optional[torch.Tensor] = None,
	past_key_values: Optional[Cache] = None,
	inputs_embeds: Optional[torch.FloatTensor] = None,
	use_cache: Optional[bool] = None,
	output_attentions: Optional[bool] = None,
	output_hidden_states: Optional[bool] = None,
	return_dict: Optional[bool] = None,
	**kwargs: Unpack[dict],
	) -> Union[Tuple, BaseModelOutputWithPast]:
	output_attentions = (
	output_attentions
	if output_attentions is not None
	else self.config.output_attentions
	)
	output_hidden_states = (
	output_hidden_states
	if output_hidden_states is not None
	else self.config.output_hidden_states
	)
	use_cache = use_cache if use_cache is not None else self.config.use_cache
	return_dict = (
	return_dict if return_dict is not None else self.config.use_return_dict
	)

	if (input_ids is None) ^ (inputs_embeds is not None):
	raise ValueError(
	"You must specify exactly one of input_ids or inputs_embeds"
	)

	if self.gradient_checkpointing and self.training and use_cache:
	logger.warning_once(
	"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
	)
	use_cache = False

	if inputs_embeds is None:
	inputs_embeds = self.embed_tokens(input_ids)

	if use_cache and not isinstance(past_key_values, Cache):
	past_key_values = Cache.from_legacy_cache(past_key_values)

	hidden_states = inputs_embeds

	# decoder layers
	all_hidden_states = () if output_hidden_states else None
	all_self_attns = () if output_attentions else None

	for decoder_layer in self.layers[: self.config.num_hidden_layers]:
	if output_hidden_states:
	all_hidden_states += (hidden_states,)

	if self.gradient_checkpointing and self.training:
	layer_outputs = self._gradient_checkpointing_func(
	decoder_layer.__call__,
	hidden_states,
	attention_mask,
	past_key_values,
	use_cache,
	)
	else:
	layer_outputs = decoder_layer(
	hidden_states,
	attention_mask=attention_mask,
	past_key_values=past_key_values,
	use_cache=use_cache,
	**kwargs,
	)

	hidden_states, attn_weights, past_key_values = layer_outputs

	if output_attentions:
	all_self_attns += (attn_weights,)

	hidden_states = self.norm(hidden_states)

	# add hidden states from the last decoder layer
	if output_hidden_states:
	all_hidden_states += (hidden_states,)

	output = BaseModelOutputWithPast(
	last_hidden_state=hidden_states,
	past_key_values=past_key_values,
	hidden_states=all_hidden_states,
	attentions=all_self_attns,
	)
	return output if return_dict else output.to_tuple()


	class ArceeKDAForCausalLM(ArceeKDAPreTrainedModel, GenerationMixin):
	_tied_weights_keys = ["lm_head.weight"]
	_tp_plan = {"lm_head": "colwise_rep"}
	_pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

	def __init__(self, config):
	super().__init__(config)
	self.model = ArceeKDAModel(config)
	self.vocab_size = config.vocab_size
	self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

	# Initialize weights and apply final processing
	self.post_init()

	def get_input_embeddings(self):
	return self.model.embed_tokens

	def set_input_embeddings(self, value):
	self.model.embed_tokens = value

	def get_output_embeddings(self):
	return self.lm_head

	def set_output_embeddings(self, new_embeddings):
	self.lm_head = new_embeddings

	def set_decoder(self, decoder):
	self.model = decoder

	def get_decoder(self):
	return self.model

	@auto_docstring
	@replace_return_docstrings(
	output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
	)
	def forward(
	self,
	input_ids: torch.LongTensor = None,
	attention_mask: Optional[torch.Tensor] = None,
	position_ids: Optional[torch.LongTensor] = None,
	past_key_values: Optional[List[torch.FloatTensor]] = None,
	inputs_embeds: Optional[torch.FloatTensor] = None,
	labels: Optional[torch.LongTensor] = None,
	use_cache: Optional[bool] = None,
	output_attentions: Optional[bool] = None,
	output_hidden_states: Optional[bool] = None,
	return_dict: Optional[bool] = None,
	**kwargs: Unpack[TransformersKwargs],
	) -> Union[Tuple, CausalLMOutputWithPast]:
	r"""
	labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, optional):
	Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
	config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
	(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

	Returns:

	Example:

	```python
	>>> from transformers import AutoTokenizer, ArceeForCausalLM

	>>> model = ArceeForCausalLM.from_pretrained("arcee-train/Arcee-4B-Base")
	>>> tokenizer = AutoTokenizer.from_pretrained("arcee-train/Arcee-4B-Base")

	>>> prompt = "Hey, are you conscious? Can you talk to me?"
	>>> inputs = tokenizer(prompt, return_tensors="pt")

	>>> # Generate
	>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
	>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
	"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
	```"""
	output_attentions = (
	output_attentions
	if output_attentions is not None
	else self.config.output_attentions
	)
	output_hidden_states = (
	output_hidden_states
	if output_hidden_states is not None
	else self.config.output_hidden_states
	)
	return_dict = (
	return_dict if return_dict is not None else self.config.use_return_dict
	)

	# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
	outputs = self.model(
	input_ids=input_ids,
	attention_mask=attention_mask,
	position_ids=position_ids,
	past_key_values=past_key_values,
	inputs_embeds=inputs_embeds,
	use_cache=use_cache,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	return_dict=return_dict,
	**kwargs,
	)

	hidden_states = outputs[0]
	logits = self.lm_head(hidden_states)

	loss = None
	if labels is not None:
	loss = self.loss_function(
	logits=logits,
	labels=labels,
	vocab_size=self.config.vocab_size,
	**kwargs,
	)

	if not return_dict:
	output = (logits,) + outputs[1:]
	return (loss,) + output if loss is not None else output

	return CausalLMOutputWithPast(
	loss=loss,
	logits=logits,
	past_key_values=outputs.past_key_values,
	hidden_states=outputs.hidden_states,
	attentions=outputs.attentions,
	)


	__all__ = [
	"ArceeKDAForCausalLM",
	"ArceeKDAModel",
	"ArceeKDAPreTrainedModel",
	]