import copy
from typing import Optional, List, Union, Tuple
from transformers import MBartForCausalLM, MBartConfig
from torch import nn
from transformers.activations import ACT2FN
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions, BaseModelOutputWithPastAndCrossAttentions
from transformers.models.mbart.modeling_mbart import MBartPreTrainedModel, MBartDecoder
from .config import MBartMoEConfig
import torch
import math
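

# This module implements an MBart-based causal decoder with grouped-query attention
# and per-language mixture-of-experts (MoE) feed-forward layers.
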
class MBartLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
        # MBart is set up so that if padding_idx is specified, the embedding ids are offset
        # by 2 and num_embeddings is adjusted accordingly. Other models don't have this hack.
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
"""`input_ids' shape is expected to be [bsz x seqlen]."""
bsz, seq_len = input_ids.shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
).expand(bsz, -1)
return super().forward(positions + self.offset)
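

# A single expert MLP: a gated feed-forward block (w2(act_fn(w1(x)) * w3(x))), as in
# Mixtral. The expert width comes from config.d_expert, with larger variants
# (d_expert_lg / d_expert_xl) for selected languages.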
class MBartExpertMLP(nn.Module):
def __init__(self, config: MBartConfig, is_lg=False, is_xl=False):
super().__init__()
self.ffn_dim = config.d_expert
if is_lg:
self.ffn_dim = config.d_expert_lg
if is_xl:
self.ffn_dim = config.d_expert_xl
self.hidden_dim = config.d_model
self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
self.dropout = nn.Dropout(config.activation_dropout)
self.act_fn = ACT2FN[config.activation_function]
def forward(self, hidden_states):
current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
current_hidden_states = self.w2(current_hidden_states)
return current_hidden_states
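

# Routes each sample to the experts for the language codes present in `langs`.
# Every language code has its own expert MLP; when a sample contains several codes,
# the expert outputs are averaged (each weighted by 1 / number of codes).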
class MBartExpertLayer(nn.Module):
    # Adapted from Mixtral, with modifications
def __init__(self, config):
super().__init__()
self.dropout = nn.Dropout(config.activation_dropout)
self.hidden_dim = config.d_model
self.lg_lang_codes = sorted(config.lg_langs.values()) if hasattr(config, "lg_langs") else []
self.xl_lang_codes = sorted(config.xl_langs.values()) if hasattr(config, "xl_langs") else []
self.lang_codes = sorted(config.langs.values())
self.num_experts = len(self.lang_codes)
self.experts = nn.ModuleDict({str(lang): MBartExpertMLP(config, is_lg=(lang in self.lg_lang_codes), is_xl=(lang in self.xl_lang_codes)) for lang in self.lang_codes})
def forward(self, hidden_states: torch.Tensor, langs: torch.LongTensor) -> torch.Tensor:
batch_size, sequence_length, hidden_dim = hidden_states.shape
final_hidden_states = torch.zeros(
(batch_size, sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
)
        # Weight each expert's output by 1 / (number of language codes in the input)
        routing_weights = 1 / ((langs > 3).sum(dim=-1))
# Set weights to 1 if zero experts activated
routing_weights[torch.isinf(routing_weights)] = 1
unique_langs = langs.unique(dim=None, sorted=True)
        unique_langs = unique_langs[unique_langs > 3]  # Drop special token ids (<= 3), which have no expert
# Loop over all available experts in the model and perform the computation on each expert
for expert_lang in unique_langs:
# Check which samples match with this expert
lang_match = (langs == expert_lang).any(dim=-1)
idx = torch.nonzero(lang_match, as_tuple=True)[0]
if idx.shape[0] == 0:
continue
expert_layer = self.experts[str(expert_lang.item())]
current_state = hidden_states[idx]
current_hidden_states = expert_layer(current_state.view(-1, hidden_dim))
current_hidden_states = current_hidden_states.view(-1, sequence_length, hidden_dim)
# Weight by number of languages in the input
selected_routing_weights = routing_weights[idx].view(-1, 1, 1)
current_hidden_states *= selected_routing_weights
final_hidden_states.index_add_(0, idx, current_hidden_states)
return final_hidden_states
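

# Grouped-query attention: `num_kv_heads` key/value heads are shared across `num_heads`
# query heads by repeating each key/value head num_heads // num_kv_heads times before
# computing attention.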
class MBartGQAttention(nn.Module):
def __init__(
self,
embed_dim: int,
num_heads: int,
num_kv_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[MBartConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.num_kv_heads = num_kv_heads
self.num_kv_groups = self.num_heads // self.num_kv_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, self.num_kv_heads * self.head_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, self.num_kv_heads * self.head_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def _shape_key_value(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
is_prefill: Optional[bool] = False,
attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel"""
# if key_value_states are provided this layer is used as a cross-attention layer
# for the decoder
is_cross_attention = key_value_states is not None
bsz, tgt_len, _ = hidden_states.size()
# get query proj
query_states = self.q_proj(hidden_states) * self.scaling
# get key, value proj
        # For cross-attention, the projected encoder key/value states are computed once
        # during prefill and reused from the cache on subsequent decoding steps.
if is_cross_attention:
if is_prefill:
# cross_attentions
key_states = self._shape_key_value(self.k_proj(key_value_states), -1, bsz)
value_states = self._shape_key_value(self.v_proj(key_value_states), -1, bsz)
past_key_value = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0)
else:
# reuse k,v, cross_attentions
key_states = past_key_value[0]
value_states = past_key_value[1]
past_key_value = None
# Self-attention
else:
if is_prefill:
# initial prompt
key_states = self._shape_key_value(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape_key_value(self.v_proj(hidden_states), -1, bsz)
past_key_value = torch.cat([key_states[:, :, -tgt_len:].unsqueeze(0), value_states[:, :, -tgt_len:].unsqueeze(0)], dim=0)
else:
# reuse k, v, self_attention
key_states = self._shape_key_value(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape_key_value(self.v_proj(hidden_states), -1, bsz)
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
past_key_value = torch.cat([key_states[:, :, -tgt_len:].unsqueeze(0), value_states[:, :, -tgt_len:].unsqueeze(0)], dim=0)
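                # Only the newest `tgt_len` positions are returned as the cache entry here;
                # the full-length cache is presumably maintained by the caller across steps.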
proj_shape = (bsz * self.num_heads, -1, self.head_dim)
query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
# Expand kv heads, then match query shape
key_states = key_states.repeat_interleave(self.num_kv_groups, dim=1).reshape(*proj_shape)
value_states = value_states.repeat_interleave(self.num_kv_groups, dim=1).reshape(*proj_shape)
src_len = key_states.size(1)
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
if not is_cross_attention:
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
attn_output = torch.bmm(attn_weights, value_states).view(bsz, self.num_heads, tgt_len, self.head_dim).transpose(1,2)
# Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
# partitioned across GPUs when using tensor-parallelism.
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, past_key_value
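

# Pre-norm decoder layer: causal self-attention, cross-attention over the encoder states,
# then either a dense feed-forward block or, when `has_moe` is set, the language-routed
# expert layer.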
class MBartMoEDecoderLayer(nn.Module):
def __init__(self, config: MBartConfig, has_moe=False):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = MBartGQAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
num_kv_heads=config.kv_heads,
dropout=config.attention_dropout,
is_decoder=True,
is_causal=True,
config=config,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = MBartGQAttention(
self.embed_dim,
config.decoder_attention_heads,
num_kv_heads=config.kv_heads,
dropout=config.attention_dropout,
is_decoder=True,
config=config,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.has_moe = has_moe
if has_moe:
self.moe = MBartExpertLayer(config)
else:
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.LongTensor] = None,
self_kv_cache: Optional[torch.Tensor] = None,
cross_kv_cache: Optional[torch.Tensor] = None,
is_prefill: Optional[bool] = False,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = True,
    ) -> Tuple:
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
# Self Attention
        # The updated self-attention key/value cache for this layer is returned as `present_key_value`
hidden_states, present_key_value = self.self_attn(
hidden_states=hidden_states,
past_key_value=self_kv_cache,
is_prefill=is_prefill,
attention_mask=attention_mask,
)
hidden_states = residual + hidden_states
# Cross-Attention Block
if encoder_hidden_states is not None:
residual = hidden_states
hidden_states = self.encoder_attn_layer_norm(hidden_states)
            # The cross-attention key/value cache is built once at prefill and reused afterwards
hidden_states, cross_attn_present_key_value = self.encoder_attn(
hidden_states=hidden_states,
key_value_states=encoder_hidden_states,
is_prefill=is_prefill,
attention_mask=encoder_attention_mask,
past_key_value=cross_kv_cache,
)
hidden_states = residual + hidden_states
            # Pair the self-attention and cross-attention caches for this layer
present_key_value = (present_key_value, cross_attn_present_key_value)
# Fully Connected
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if self.has_moe:
hidden_states = self.moe(hidden_states, langs)
else:
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = self.fc2(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if use_cache:
outputs += (present_key_value,)
return outputs
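

# Decoder stack. Self- and cross-attention key/value caches are passed in explicitly
# (`self_kv_cache`, `cross_kv_cache`) along with `past_token_count`, instead of the
# usual `past_key_values` argument.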
class MBartMoEDecoder(MBartDecoder):
def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = None):
MBartPreTrainedModel.__init__(self, config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = MBartLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
)
        # Language-specific MoE layers replace the dense FFN at the indices in config.moe_layers
        # (e.g. the second and second-to-last decoder layers)
self.layers = nn.ModuleList([MBartMoEDecoderLayer(config, has_moe=(i in config.moe_layers) and config.use_moe) for i in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
self_kv_cache: Optional[torch.Tensor] = None,
cross_kv_cache: Optional[torch.Tensor] = None,
past_token_count: Optional[int] = None,
langs: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
use_cache = True
return_dict = True
input = input_ids
input_shape = input.size()
input_ids = input_ids.view(-1, input_shape[-1])
# past_key_values_length
past_key_values_length = past_token_count if self_kv_cache is not None else 0
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
# embed positions
positions = self.embed_positions(input, past_key_values_length)
hidden_states = inputs_embeds + positions
hidden_states = self.layernorm_embedding(hidden_states)
# decoder layers
all_hidden_states = None
all_self_attns = None
all_cross_attentions = None
next_decoder_cache = () if use_cache else None
for idx, decoder_layer in enumerate(self.layers):
is_prefill = past_token_count == 0
layer_self_kv_cache = self_kv_cache[idx] if self_kv_cache is not None else None
layer_cross_kv_cache = cross_kv_cache[idx] if cross_kv_cache is not None else None
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
langs=langs,
self_kv_cache=layer_self_kv_cache,
cross_kv_cache=layer_cross_kv_cache,
is_prefill=is_prefill,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=None,
use_cache=use_cache,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[1],)
hidden_states = self.layer_norm(hidden_states)
next_cache = next_decoder_cache if use_cache else None
if not return_dict:
return tuple(
v
for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
cross_attentions=all_cross_attentions,
)
class MBartMoEDecoderWrapper(MBartPreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
super().__init__(config)
self.decoder = MBartMoEDecoder(config)
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
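

# Causal LM head on top of the MoE decoder, mirroring MBartForCausalLM but driving the
# decoder through the explicit KV-cache arguments above.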
class MBartMoE(MBartForCausalLM):
config_class = MBartMoEConfig
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config, **kwargs):
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
MBartPreTrainedModel.__init__(self, config)
self.model = MBartMoEDecoderWrapper(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
self_kv_cache: Optional[torch.FloatTensor] = None,
cross_kv_cache: Optional[torch.FloatTensor] = None,
past_token_count: Optional[int] = None,
langs: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model.decoder(
input_ids=input_ids,
attention_mask=attention_mask,
self_kv_cache=self_kv_cache,
cross_kv_cache=cross_kv_cache,
past_token_count=past_token_count,
langs=langs,
encoder_hidden_states=encoder_hidden_states,
)
logits = self.lm_head(outputs[0])
if not return_dict:
output = (logits,) + outputs[1:]
return output
return CausalLMOutputWithCrossAttentions(
loss=None,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
def prune_moe_experts(self, keep_keys: List[int]):
# Remove experts not specified in keep_keys
str_keep_keys = [str(key) for key in keep_keys]
for layer in self.model.decoder.layers:
if not layer.has_moe:
continue
lang_keys = list(layer.moe.experts.keys())
for lang in lang_keys:
if lang not in str_keep_keys:
layer.moe.experts.pop(lang)
            # Keep the expert layer's language codes consistent with the remaining experts
            layer.moe.lang_codes = keep_keys