|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| """ PyTorch OPT model."""
|
| import random
|
| from typing import List, Optional, Tuple, Union
|
|
|
| import torch
|
| import torch.utils.checkpoint
|
| from torch import nn
|
| from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
|
|
| from transformers.activations import ACT2FN
|
| from transformers.modeling_outputs import (
|
| BaseModelOutputWithPast,
|
| CausalLMOutputWithPast,
|
| )
|
| from transformers.modeling_utils import PreTrainedModel
|
| from transformers.utils import (
|
| add_code_sample_docstrings,
|
| add_start_docstrings,
|
| add_start_docstrings_to_model_forward,
|
| logging,
|
| replace_return_docstrings,
|
| )
|
| from transformers.models.opt.configuration_opt import OPTConfig
|
|
|
|
|
| logger = logging.get_logger(__name__)
|
|
|
| _CHECKPOINT_FOR_DOC = "facebook/opt-350m"
|
| _CONFIG_FOR_DOC = "OPTConfig"
|
| _TOKENIZER_FOR_DOC = "GPT2Tokenizer"
|
|
|
|
|
| _EXPECTED_OUTPUT_SHAPE = [1, 8, 1024]
|
|
|
|
|
| _CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "ArthurZ/opt-350m-dummy-sc"
|
| _SEQ_CLASS_EXPECTED_LOSS = 1.71
|
| _SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_0'"
|
|
|
|
|
| _QA_EXPECTED_OUTPUT = "'a nice puppet'"
|
| _QA_EXPECTED_LOSS = 7.41
|
| _QA_TARGET_START_INDEX = 14
|
| _QA_TARGET_END_INDEX = 15
|
|
|
| OPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
| "facebook/opt-125m",
|
| "facebook/opt-350m",
|
| "facebook/opt-1.3b",
|
| "facebook/opt-2.7b",
|
| "facebook/opt-6.7b",
|
| "facebook/opt-13b",
|
| "facebook/opt-30b",
|
|
|
| ]
|
|
|
|
|
| def _make_causal_mask(
|
| input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0
|
| ):
|
| """
|
| Make causal mask used for bi-directional self-attention.
|
| """
|
| bsz, tgt_len = input_ids_shape
|
| mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
|
| mask_cond = torch.arange(mask.size(-1))
|
| mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
|
| mask = mask.to(dtype)
|
|
|
| if past_key_values_length > 0:
|
| mask = torch.cat(
|
| [torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1
|
| )
|
| return mask[None, None, :, :].expand(
|
| bsz, 1, tgt_len, tgt_len + past_key_values_length
|
| )
|
|
|
|
|
| def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
|
| """
|
| Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
|
| """
|
| bsz, src_len = mask.size()
|
| tgt_len = tgt_len if tgt_len is not None else src_len
|
|
|
| expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
|
|
|
| inverted_mask = 1.0 - expanded_mask
|
|
|
| return inverted_mask.masked_fill(
|
| inverted_mask.to(torch.bool), torch.finfo(dtype).min
|
| )
|
|
|
|
|
| class OPTLearnedPositionalEmbedding(nn.Embedding):
|
| """
|
| This module learns positional embeddings up to a fixed maximum size.
|
| """
|
|
|
| def __init__(self, num_embeddings: int, embedding_dim: int):
|
|
|
|
|
| self.offset = 2
|
| super().__init__(num_embeddings + self.offset, embedding_dim)
|
|
|
| def forward(
|
| self, attention_mask: torch.LongTensor, past_key_values_length: int = 0
|
| ):
|
| """`input_ids_shape` is expected to be [bsz x seqlen]."""
|
| attention_mask = attention_mask.long()
|
|
|
|
|
| positions = (
|
| torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask
|
| ).long() - 1
|
|
|
|
|
| positions = positions[:, past_key_values_length:]
|
|
|
| return super().forward(positions + self.offset)
|
|
|
|
|
| class OPTAttention(nn.Module):
|
| """Multi-headed attention from 'Attention Is All You Need' paper"""
|
|
|
| def __init__(
|
| self,
|
| embed_dim: int,
|
| num_heads: int,
|
| dropout: float = 0.0,
|
| is_decoder: bool = False,
|
| bias: bool = True,
|
| ):
|
| super().__init__()
|
| self.embed_dim = embed_dim
|
| self.num_heads = num_heads
|
| self.dropout = dropout
|
| self.head_dim = embed_dim // num_heads
|
|
|
| if (self.head_dim * num_heads) != self.embed_dim:
|
| raise ValueError(
|
| f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
|
| f" and `num_heads`: {num_heads})."
|
| )
|
| self.scaling = self.head_dim**-0.5
|
| self.is_decoder = is_decoder
|
|
|
| self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
| self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
| self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
| self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
|
|
| def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
|
| return (
|
| tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
|
| .transpose(1, 2)
|
| .contiguous()
|
| )
|
|
|
| def forward(
|
| self,
|
| hidden_states: torch.Tensor,
|
| key_value_states: Optional[torch.Tensor] = None,
|
| past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
| attention_mask: Optional[torch.Tensor] = None,
|
| layer_head_mask: Optional[torch.Tensor] = None,
|
| output_attentions: bool = False,
|
| ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
| """Input shape: Batch x Time x Channel"""
|
|
|
|
|
|
|
| is_cross_attention = key_value_states is not None
|
|
|
| bsz, tgt_len, _ = hidden_states.size()
|
|
|
|
|
| query_states = self.q_proj(hidden_states) * self.scaling
|
|
|
| if is_cross_attention and past_key_value is not None:
|
|
|
| key_states = past_key_value[0]
|
| value_states = past_key_value[1]
|
| elif is_cross_attention:
|
|
|
| key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
|
| value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
|
| elif past_key_value is not None:
|
|
|
| key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
|
| value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
|
| key_states = torch.cat([past_key_value[0], key_states], dim=2)
|
| value_states = torch.cat([past_key_value[1], value_states], dim=2)
|
| else:
|
|
|
| key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
|
| value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
|
|
|
| if self.is_decoder:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| past_key_value = (key_states, value_states)
|
|
|
| proj_shape = (bsz * self.num_heads, -1, self.head_dim)
|
| query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
|
| key_states = key_states.view(*proj_shape)
|
| value_states = value_states.view(*proj_shape)
|
|
|
| src_len = key_states.size(1)
|
| attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
|
|
|
| if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
|
| raise ValueError(
|
| f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
|
| f" {attn_weights.size()}"
|
| )
|
|
|
| if attention_mask is not None:
|
| if attention_mask.size() != (bsz, 1, tgt_len, src_len):
|
| raise ValueError(
|
| f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
|
| )
|
| attn_weights = (
|
| attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
|
| + attention_mask
|
| )
|
| attn_weights = torch.max(
|
| attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
|
| )
|
| attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
|
|
|
|
| if attn_weights.dtype == torch.float16:
|
| attn_weights = nn.functional.softmax(
|
| attn_weights, dim=-1, dtype=torch.float32
|
| ).to(torch.float16)
|
| else:
|
| attn_weights = nn.functional.softmax(attn_weights, dim=-1)
|
|
|
| if layer_head_mask is not None:
|
| if layer_head_mask.size() != (self.num_heads,):
|
| raise ValueError(
|
| f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
|
| f" {layer_head_mask.size()}"
|
| )
|
| attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
|
| bsz, self.num_heads, tgt_len, src_len
|
| )
|
| attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
|
|
| if output_attentions:
|
|
|
|
|
|
|
|
|
| attn_weights_reshaped = attn_weights.view(
|
| bsz, self.num_heads, tgt_len, src_len
|
| )
|
| attn_weights = attn_weights_reshaped.view(
|
| bsz * self.num_heads, tgt_len, src_len
|
| )
|
| else:
|
| attn_weights_reshaped = None
|
|
|
| attn_probs = nn.functional.dropout(
|
| attn_weights, p=self.dropout, training=self.training
|
| )
|
|
|
| attn_output = torch.bmm(attn_probs, value_states)
|
|
|
| if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
|
| raise ValueError(
|
| f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
|
| f" {attn_output.size()}"
|
| )
|
|
|
| attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
|
| attn_output = attn_output.transpose(1, 2)
|
|
|
|
|
|
|
| attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
|
|
|
| attn_output = self.out_proj(attn_output)
|
|
|
| return attn_output, attn_weights_reshaped, past_key_value
|
|
|
|
|
| class OPTDecoderLayer(nn.Module):
|
| def __init__(self, config: OPTConfig):
|
| super().__init__()
|
| self.embed_dim = config.hidden_size
|
| self.self_attn = OPTAttention(
|
| embed_dim=self.embed_dim,
|
| num_heads=config.num_attention_heads,
|
| dropout=config.attention_dropout,
|
| is_decoder=True,
|
| )
|
| self.do_layer_norm_before = config.do_layer_norm_before
|
| self.dropout = config.dropout
|
| self.activation_fn = ACT2FN[config.activation_function]
|
|
|
| self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
| self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim)
|
| self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim)
|
| self.final_layer_norm = nn.LayerNorm(self.embed_dim)
|
|
|
| def forward(
|
| self,
|
| hidden_states: torch.Tensor,
|
| attention_mask: Optional[torch.Tensor] = None,
|
| layer_head_mask: Optional[torch.Tensor] = None,
|
| output_attentions: Optional[bool] = False,
|
| use_cache: Optional[bool] = False,
|
| past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
| ) -> Tuple[
|
| torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
|
| ]:
|
| """
|
| Args:
|
| hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
|
| attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
|
| `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
|
| layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
|
| `(encoder_attention_heads,)`.
|
| output_attentions (`bool`, *optional*):
|
| Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
| returned tensors for more detail.
|
| use_cache (`bool`, *optional*):
|
| If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
|
| (see `past_key_values`).
|
| past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
|
| """
|
|
|
| residual = hidden_states
|
|
|
|
|
| if self.do_layer_norm_before:
|
| hidden_states = self.self_attn_layer_norm(hidden_states)
|
|
|
|
|
| hidden_states, self_attn_weights, present_key_value = self.self_attn(
|
| hidden_states=hidden_states,
|
| past_key_value=past_key_value,
|
| attention_mask=attention_mask,
|
| layer_head_mask=layer_head_mask,
|
| output_attentions=output_attentions,
|
| )
|
| hidden_states = nn.functional.dropout(
|
| hidden_states, p=self.dropout, training=self.training
|
| )
|
| hidden_states = residual + hidden_states
|
|
|
|
|
| if not self.do_layer_norm_before:
|
| hidden_states = self.self_attn_layer_norm(hidden_states)
|
|
|
|
|
| hidden_states_shape = hidden_states.shape
|
| hidden_states = hidden_states.reshape(-1, hidden_states.size(-1))
|
| residual = hidden_states
|
|
|
|
|
| if self.do_layer_norm_before:
|
| hidden_states = self.final_layer_norm(hidden_states)
|
|
|
| hidden_states = self.fc1(hidden_states)
|
| hidden_states = self.activation_fn(hidden_states)
|
|
|
| hidden_states = self.fc2(hidden_states)
|
| hidden_states = nn.functional.dropout(
|
| hidden_states, p=self.dropout, training=self.training
|
| )
|
|
|
| hidden_states = (residual + hidden_states).view(hidden_states_shape)
|
|
|
|
|
| if not self.do_layer_norm_before:
|
| hidden_states = self.final_layer_norm(hidden_states)
|
|
|
| outputs = (hidden_states,)
|
|
|
| if output_attentions:
|
| outputs += (self_attn_weights,)
|
|
|
| if use_cache:
|
| outputs += (present_key_value,)
|
|
|
| return outputs
|
|
|
|
|
| OPT_START_DOCSTRING = r"""
|
| This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
| library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
| etc.)
|
|
|
| This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
| Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
| and behavior.
|
|
|
| Parameters:
|
| config ([`OPTConfig`]):
|
| Model configuration class with all the parameters of the model. Initializing with a config file does not
|
| load the weights associated with the model, only the configuration. Check out the
|
| [`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
| """
|
|
|
|
|
| @add_start_docstrings(
|
| "The bare OPT Model outputting raw hidden-states without any specific head on top.",
|
| OPT_START_DOCSTRING,
|
| )
|
| class OPTPreTrainedModel(PreTrainedModel):
|
|
|
| config_class = OPTConfig
|
| base_model_prefix = "model"
|
| supports_gradient_checkpointing = True
|
| _no_split_modules = ["OPTDecoderLayer"]
|
| _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
|
|
|
| def _init_weights(self, module):
|
| std = self.config.init_std
|
| if isinstance(module, nn.Linear):
|
| module.weight.data.normal_(mean=0.0, std=std)
|
| if module.bias is not None:
|
| module.bias.data.zero_()
|
| elif isinstance(module, nn.Embedding):
|
| module.weight.data.normal_(mean=0.0, std=std)
|
| if module.padding_idx is not None:
|
| module.weight.data[module.padding_idx].zero_()
|
|
|
| def _set_gradient_checkpointing(self, module, value=False):
|
| if isinstance(module, (OPTDecoder)):
|
| module.gradient_checkpointing = value
|
|
|
|
|
| OPT_INPUTS_DOCSTRING = r"""
|
| Args:
|
| input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
| it.
|
|
|
| Indices can be obtained using [`GPT2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
| [`PreTrainedTokenizer.__call__`] for details.
|
|
|
| [What are input IDs?](../glossary#input-ids)
|
| attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
|
|
| - 1 for tokens that are **not masked**,
|
| - 0 for tokens that are **masked**.
|
|
|
| [What are attention masks?](../glossary#attention-mask)
|
|
|
| Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
| [`PreTrainedTokenizer.__call__`] for details.
|
|
|
| If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
|
| `past_key_values`).
|
|
|
| If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
|
| and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
|
| information on the default strategy.
|
| head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
|
| Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
|
|
|
| - 1 indicates the head is **not masked**,
|
| - 0 indicates the head is **masked**.
|
|
|
| past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
| Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
| `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
|
| `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
|
|
| Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
| blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
|
|
| If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
|
| don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
|
| `decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
| inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
| Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
| is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
|
| model's internal embedding lookup matrix.
|
| use_cache (`bool`, *optional*):
|
| If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
| `past_key_values`).
|
| output_attentions (`bool`, *optional*):
|
| Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
| tensors for more detail.
|
| output_hidden_states (`bool`, *optional*):
|
| Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
| more detail.
|
| return_dict (`bool`, *optional*):
|
| Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
| """
|
|
|
|
|
| class OPTDecoder(OPTPreTrainedModel):
|
| """
|
| Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]
|
|
|
| Args:
|
| config: OPTConfig
|
| """
|
|
|
| def __init__(self, config: OPTConfig):
|
| super().__init__(config)
|
| self.dropout = config.dropout
|
| self.layerdrop = config.layerdrop
|
| self.padding_idx = config.pad_token_id
|
| self.max_target_positions = config.max_position_embeddings
|
| self.vocab_size = config.vocab_size
|
|
|
| self.embed_tokens = nn.Embedding(
|
| config.vocab_size, config.word_embed_proj_dim, self.padding_idx
|
| )
|
| self.embed_positions = OPTLearnedPositionalEmbedding(
|
| config.max_position_embeddings, config.hidden_size
|
| )
|
|
|
| if config.word_embed_proj_dim != config.hidden_size:
|
| self.project_out = nn.Linear(
|
| config.hidden_size, config.word_embed_proj_dim, bias=False
|
| )
|
| else:
|
| self.project_out = None
|
|
|
| if config.word_embed_proj_dim != config.hidden_size:
|
| self.project_in = nn.Linear(
|
| config.word_embed_proj_dim, config.hidden_size, bias=False
|
| )
|
| else:
|
| self.project_in = None
|
|
|
|
|
|
|
|
|
| if config.do_layer_norm_before and not config._remove_final_layer_norm:
|
| self.final_layer_norm = nn.LayerNorm(config.hidden_size)
|
| else:
|
| self.final_layer_norm = None
|
|
|
| self.layers = nn.ModuleList(
|
| [OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)]
|
| )
|
|
|
| self.gradient_checkpointing = False
|
|
|
| self.post_init()
|
|
|
| def get_input_embeddings(self):
|
| return self.embed_tokens
|
|
|
| def set_input_embeddings(self, value):
|
| self.embed_tokens = value
|
|
|
|
|
| def _prepare_decoder_attention_mask(
|
| self, attention_mask, input_shape, inputs_embeds, past_key_values_length
|
| ):
|
|
|
|
|
| combined_attention_mask = None
|
| if input_shape[-1] > 1:
|
| combined_attention_mask = _make_causal_mask(
|
| input_shape,
|
| inputs_embeds.dtype,
|
| past_key_values_length=past_key_values_length,
|
| ).to(inputs_embeds.device)
|
|
|
| if attention_mask is not None:
|
|
|
| expanded_attn_mask = _expand_mask(
|
| attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
|
| ).to(inputs_embeds.device)
|
| combined_attention_mask = (
|
| expanded_attn_mask
|
| if combined_attention_mask is None
|
| else expanded_attn_mask + combined_attention_mask
|
| )
|
|
|
| return combined_attention_mask
|
|
|
| def forward(
|
| self,
|
| input_ids: torch.LongTensor = None,
|
| attention_mask: Optional[torch.Tensor] = None,
|
| head_mask: Optional[torch.Tensor] = None,
|
| past_key_values: Optional[List[torch.FloatTensor]] = None,
|
| inputs_embeds: Optional[torch.FloatTensor] = None,
|
| query_embeds: Optional[torch.FloatTensor] = None,
|
| use_cache: Optional[bool] = None,
|
| output_attentions: Optional[bool] = None,
|
| output_hidden_states: Optional[bool] = None,
|
| return_dict: Optional[bool] = None,
|
| ) -> Union[Tuple, BaseModelOutputWithPast]:
|
| r"""
|
| Args:
|
| input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
|
| provide it.
|
|
|
| Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
| [`PreTrainedTokenizer.__call__`] for details.
|
|
|
| [What are input IDs?](../glossary#input-ids)
|
| attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
|
|
| - 1 for tokens that are **not masked**,
|
| - 0 for tokens that are **masked**.
|
|
|
| [What are attention masks?](../glossary#attention-mask)
|
| head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
|
| Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
|
|
|
| - 1 indicates the head is **not masked**,
|
| - 0 indicates the head is **masked**.
|
|
|
| past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
| Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
|
| shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
|
|
|
| Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
|
| cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
|
|
| If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
|
| that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
|
| all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
|
|
| inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
| Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
|
| This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
| than the model's internal embedding lookup matrix.
|
| output_attentions (`bool`, *optional*):
|
| Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
| returned tensors for more detail.
|
| output_hidden_states (`bool`, *optional*):
|
| Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
| for more detail.
|
| return_dict (`bool`, *optional*):
|
| Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
| """
|
| output_attentions = (
|
| output_attentions
|
| if output_attentions is not None
|
| else self.config.output_attentions
|
| )
|
| output_hidden_states = (
|
| output_hidden_states
|
| if output_hidden_states is not None
|
| else self.config.output_hidden_states
|
| )
|
| use_cache = use_cache if use_cache is not None else self.config.use_cache
|
|
|
| return_dict = (
|
| return_dict if return_dict is not None else self.config.use_return_dict
|
| )
|
|
|
|
|
| if input_ids is not None and inputs_embeds is not None:
|
| raise ValueError(
|
| "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
|
| )
|
| elif input_ids is not None:
|
| input_shape = input_ids.size()
|
| input_ids = input_ids.view(-1, input_shape[-1])
|
| elif inputs_embeds is not None:
|
| input_shape = inputs_embeds.size()[:-1]
|
| else:
|
| raise ValueError(
|
| "You have to specify either decoder_input_ids or decoder_inputs_embeds"
|
| )
|
|
|
| past_key_values_length = (
|
| past_key_values[0][0].shape[2] if past_key_values is not None else 0
|
| )
|
|
|
| if inputs_embeds is None:
|
| inputs_embeds = self.embed_tokens(input_ids)
|
|
|
| if query_embeds is not None:
|
| inputs_embeds = torch.cat([query_embeds, inputs_embeds], dim=1)
|
| input_shape = inputs_embeds.size()[:-1]
|
|
|
|
|
| if attention_mask is None:
|
| attention_mask = torch.ones(
|
| inputs_embeds.shape[:2], dtype=torch.bool, device=inputs_embeds.device
|
| )
|
| pos_embeds = self.embed_positions(attention_mask, past_key_values_length)
|
|
|
| attention_mask = self._prepare_decoder_attention_mask(
|
| attention_mask, input_shape, inputs_embeds, past_key_values_length
|
| )
|
|
|
| if self.project_in is not None:
|
| inputs_embeds = self.project_in(inputs_embeds)
|
|
|
| hidden_states = inputs_embeds + pos_embeds
|
|
|
|
|
| all_hidden_states = () if output_hidden_states else None
|
| all_self_attns = () if output_attentions else None
|
| next_decoder_cache = () if use_cache else None
|
|
|
|
|
| for attn_mask, mask_name in zip([head_mask], ["head_mask"]):
|
| if attn_mask is not None:
|
| if attn_mask.size()[0] != (len(self.layers)):
|
| raise ValueError(
|
| f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
|
| f" {head_mask.size()[0]}."
|
| )
|
|
|
| for idx, decoder_layer in enumerate(self.layers):
|
|
|
| if output_hidden_states:
|
| all_hidden_states += (hidden_states,)
|
|
|
| dropout_probability = random.uniform(0, 1)
|
| if self.training and (dropout_probability < self.layerdrop):
|
| continue
|
|
|
| past_key_value = (
|
| past_key_values[idx] if past_key_values is not None else None
|
| )
|
|
|
| if self.gradient_checkpointing and self.training:
|
|
|
| if use_cache:
|
| logger.warning(
|
| "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
| )
|
| use_cache = False
|
|
|
| def create_custom_forward(module):
|
| def custom_forward(*inputs):
|
|
|
| return module(*inputs, output_attentions, None)
|
|
|
| return custom_forward
|
|
|
| layer_outputs = torch.utils.checkpoint.checkpoint(
|
| create_custom_forward(decoder_layer),
|
| hidden_states,
|
| attention_mask,
|
| head_mask[idx] if head_mask is not None else None,
|
| None,
|
| )
|
| else:
|
|
|
| layer_outputs = decoder_layer(
|
| hidden_states,
|
| attention_mask=attention_mask,
|
| layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
| past_key_value=past_key_value,
|
| output_attentions=output_attentions,
|
| use_cache=use_cache,
|
| )
|
|
|
| hidden_states = layer_outputs[0]
|
|
|
| if use_cache:
|
| next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
|
|
|
| if output_attentions:
|
| all_self_attns += (layer_outputs[1],)
|
|
|
| if self.final_layer_norm is not None:
|
| hidden_states = self.final_layer_norm(hidden_states)
|
|
|
| if self.project_out is not None:
|
| hidden_states = self.project_out(hidden_states)
|
|
|
|
|
| if output_hidden_states:
|
| all_hidden_states += (hidden_states,)
|
|
|
| next_cache = next_decoder_cache if use_cache else None
|
| if not return_dict:
|
| return tuple(
|
| v
|
| for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
|
| if v is not None
|
| )
|
| return BaseModelOutputWithPast(
|
| last_hidden_state=hidden_states,
|
| past_key_values=next_cache,
|
| hidden_states=all_hidden_states,
|
| attentions=all_self_attns,
|
| )
|
|
|
|
|
| @add_start_docstrings(
|
| "The bare OPT Model outputting raw hidden-states without any specific head on top.",
|
| OPT_START_DOCSTRING,
|
| )
|
| class OPTModel(OPTPreTrainedModel):
|
| def __init__(self, config: OPTConfig):
|
| super().__init__(config)
|
| self.decoder = OPTDecoder(config)
|
|
|
| self.post_init()
|
|
|
| def get_input_embeddings(self):
|
| return self.decoder.embed_tokens
|
|
|
| def set_input_embeddings(self, value):
|
| self.decoder.embed_tokens = value
|
|
|
| def get_decoder(self):
|
| return self.decoder
|
|
|
| @add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING)
|
| @add_code_sample_docstrings(
|
| processor_class=_TOKENIZER_FOR_DOC,
|
| checkpoint=_CHECKPOINT_FOR_DOC,
|
| output_type=BaseModelOutputWithPast,
|
| config_class=_CONFIG_FOR_DOC,
|
| expected_output=_EXPECTED_OUTPUT_SHAPE,
|
| )
|
| def forward(
|
| self,
|
| input_ids: torch.LongTensor = None,
|
| attention_mask: Optional[torch.Tensor] = None,
|
| head_mask: Optional[torch.Tensor] = None,
|
| past_key_values: Optional[List[torch.FloatTensor]] = None,
|
| inputs_embeds: Optional[torch.FloatTensor] = None,
|
| query_embeds: Optional[torch.FloatTensor] = None,
|
| use_cache: Optional[bool] = None,
|
| output_attentions: Optional[bool] = None,
|
| output_hidden_states: Optional[bool] = None,
|
| return_dict: Optional[bool] = None,
|
| ) -> Union[Tuple, BaseModelOutputWithPast]:
|
|
|
| output_attentions = (
|
| output_attentions
|
| if output_attentions is not None
|
| else self.config.output_attentions
|
| )
|
| output_hidden_states = (
|
| output_hidden_states
|
| if output_hidden_states is not None
|
| else self.config.output_hidden_states
|
| )
|
| use_cache = use_cache if use_cache is not None else self.config.use_cache
|
| return_dict = (
|
| return_dict if return_dict is not None else self.config.use_return_dict
|
| )
|
|
|
|
|
| decoder_outputs = self.decoder(
|
| input_ids=input_ids,
|
| attention_mask=attention_mask,
|
| head_mask=head_mask,
|
| past_key_values=past_key_values,
|
| inputs_embeds=inputs_embeds,
|
| query_embeds=query_embeds,
|
| use_cache=use_cache,
|
| output_attentions=output_attentions,
|
| output_hidden_states=output_hidden_states,
|
| return_dict=return_dict,
|
| )
|
|
|
| if not return_dict:
|
| return decoder_outputs
|
|
|
| return BaseModelOutputWithPast(
|
| last_hidden_state=decoder_outputs.last_hidden_state,
|
| past_key_values=decoder_outputs.past_key_values,
|
| hidden_states=decoder_outputs.hidden_states,
|
| attentions=decoder_outputs.attentions,
|
| )
|
|
|
|
|
| class OPTForCausalLM(OPTPreTrainedModel):
|
| _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
|
|
|
| def __init__(self, config):
|
| super().__init__(config)
|
| self.model = OPTModel(config)
|
|
|
|
|
| self.lm_head = nn.Linear(
|
| config.word_embed_proj_dim, config.vocab_size, bias=False
|
| )
|
|
|
|
|
| self.post_init()
|
|
|
| def get_input_embeddings(self):
|
| return self.model.decoder.embed_tokens
|
|
|
| def set_input_embeddings(self, value):
|
| self.model.decoder.embed_tokens = value
|
|
|
| def get_output_embeddings(self):
|
| return self.lm_head
|
|
|
| def set_output_embeddings(self, new_embeddings):
|
| self.lm_head = new_embeddings
|
|
|
| def set_decoder(self, decoder):
|
| self.model.decoder = decoder
|
|
|
| def get_decoder(self):
|
| return self.model.decoder
|
|
|
| @replace_return_docstrings(
|
| output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
| )
|
| def forward(
|
| self,
|
| input_ids: torch.LongTensor = None,
|
| attention_mask: Optional[torch.Tensor] = None,
|
| head_mask: Optional[torch.Tensor] = None,
|
| past_key_values: Optional[List[torch.FloatTensor]] = None,
|
| inputs_embeds: Optional[torch.FloatTensor] = None,
|
| query_embeds: Optional[torch.FloatTensor] = None,
|
| labels: Optional[torch.LongTensor] = None,
|
| use_cache: Optional[bool] = None,
|
| output_attentions: Optional[bool] = None,
|
| output_hidden_states: Optional[bool] = None,
|
| return_dict: Optional[bool] = None,
|
| reduction: Optional[str] = "mean",
|
| ) -> Union[Tuple, CausalLMOutputWithPast]:
|
| r"""
|
| Args:
|
| input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
|
| provide it.
|
|
|
| Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
| [`PreTrainedTokenizer.__call__`] for details.
|
|
|
| [What are input IDs?](../glossary#input-ids)
|
| attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
|
|
| - 1 for tokens that are **not masked**,
|
| - 0 for tokens that are **masked**.
|
|
|
| [What are attention masks?](../glossary#attention-mask)
|
| head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
|
| Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
|
|
|
| - 1 indicates the head is **not masked**,
|
| - 0 indicates the head is **masked**.
|
|
|
| past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
| Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
|
| shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
|
| shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
|
| tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
|
|
|
| Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
|
| cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
|
|
| If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
|
| that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
|
| all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
| inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
| Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
|
| This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
| than the model's internal embedding lookup matrix.
|
| labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
| config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
| (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
| use_cache (`bool`, *optional*):
|
| If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
|
| (see `past_key_values`).
|
| output_attentions (`bool`, *optional*):
|
| Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
| returned tensors for more detail.
|
| output_hidden_states (`bool`, *optional*):
|
| Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
| for more detail.
|
| return_dict (`bool`, *optional*):
|
| Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
|
|
| Returns:
|
|
|
| Example:
|
|
|
| ```python
|
| >>> from transformers import GPT2Tokenizer, OPTForCausalLM
|
|
|
| >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
|
| >>> tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m")
|
|
|
| >>> prompt = "Hey, are you consciours? Can you talk to me?"
|
| >>> inputs = tokenizer(prompt, return_tensors="pt")
|
|
|
| >>> # Generate
|
| >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
|
| >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
| "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
|
| ```"""
|
|
|
| output_attentions = (
|
| output_attentions
|
| if output_attentions is not None
|
| else self.config.output_attentions
|
| )
|
| output_hidden_states = (
|
| output_hidden_states
|
| if output_hidden_states is not None
|
| else self.config.output_hidden_states
|
| )
|
| return_dict = (
|
| return_dict if return_dict is not None else self.config.use_return_dict
|
| )
|
|
|
|
|
| outputs = self.model.decoder(
|
| input_ids=input_ids,
|
| attention_mask=attention_mask,
|
| head_mask=head_mask,
|
| past_key_values=past_key_values,
|
| inputs_embeds=inputs_embeds,
|
| query_embeds=query_embeds,
|
| use_cache=use_cache,
|
| output_attentions=output_attentions,
|
| output_hidden_states=output_hidden_states,
|
| return_dict=return_dict,
|
| )
|
|
|
| logits = self.lm_head(outputs[0]).contiguous()
|
|
|
| loss = None
|
| if labels is not None:
|
| logits = logits[:, -labels.size(1) :, :]
|
|
|
|
|
| shift_logits = logits[..., :-1, :].contiguous()
|
| shift_labels = labels[..., 1:].contiguous()
|
|
|
| loss_fct = CrossEntropyLoss(reduction=reduction)
|
| loss = loss_fct(
|
| shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1)
|
| )
|
| if reduction == "none":
|
| loss = loss.view(shift_logits.size(0), -1).sum(1)
|
|
|
| if not return_dict:
|
| output = (logits,) + outputs[1:]
|
| return (loss,) + output if loss is not None else output
|
|
|
| return CausalLMOutputWithPast(
|
| loss=loss,
|
| logits=logits,
|
| past_key_values=outputs.past_key_values,
|
| hidden_states=outputs.hidden_states,
|
| attentions=outputs.attentions,
|
| )
|
|
|
| def prepare_inputs_for_generation(
|
| self,
|
| input_ids=None,
|
| query_embeds=None,
|
| past=None,
|
| attention_mask=None,
|
| use_cache=None,
|
| **kwargs,
|
| ):
|
|
|
| if attention_mask is None:
|
| if input_ids is not None:
|
| attention_mask = input_ids.new_ones(input_ids.shape)
|
| if past:
|
| input_ids = input_ids[:, -1:]
|
| query_embeds = None
|
|
|
| return {
|
| "input_ids": input_ids,
|
| "query_embeds": query_embeds,
|
| "attention_mask": attention_mask,
|
| "past_key_values": past,
|
| "use_cache": use_cache,
|
| }
|
|
|
| @staticmethod
|
| def _reorder_cache(past, beam_idx):
|
| reordered_past = ()
|
| for layer_past in past:
|
| reordered_past += (
|
| tuple(
|
| past_state.index_select(0, beam_idx) for past_state in layer_past
|
| ),
|
| )
|
| return reordered_past
|
|
|