Update modeling_ltgbert.py
modeling_ltgbert.py CHANGED (+210 -56)
@@ -1,4 +1,20 @@
- …
+# coding=utf-8
+# Copyright 2023 Language Technology Group from University of Oslo and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" PyTorch LTG-BERT model."""
+

 import math
 from typing import List, Optional, Tuple, Union
@@ -6,10 +22,9 @@ from typing import List, Optional, Tuple, Union
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch import _softmax_backward_data as _softmax_backward_data
 from torch.utils import checkpoint

-from configuration_ltgbert import …
+from configuration_ltgbert import LtgBertConfig
 from transformers.modeling_utils import PreTrainedModel
 from transformers.activations import gelu_new
 from transformers.modeling_outputs import (
@@ -20,6 +35,36 @@ from transformers.modeling_outputs import (
     TokenClassifierOutput,
     BaseModelOutput
 )
+from transformers.pytorch_utils import softmax_backward_data
+from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward
+
+
+_CHECKPOINT_FOR_DOC = "ltg/bnc-bert-span"
+_CONFIG_FOR_DOC = "LtgBertConfig"
+
+
+LTG_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "bnc-bert-span",
+    "bnc-bert-span-2x",
+    "bnc-bert-span-0.5x",
+    "bnc-bert-span-0.25x",
+    "bnc-bert-span-order",
+    "bnc-bert-span-document",
+    "bnc-bert-span-word",
+    "bnc-bert-span-subword",
+
+    "norbert3-xs",
+    "norbert3-small",
+    "norbert3-base",
+    "norbert3-large",
+
+    "norbert3-oversampled-base",
+    "norbert3-ncc-base",
+    "norbert3-nak-base",
+    "norbert3-nb-base",
+    "norbert3-wiki-base",
+    "norbert3-c4-base"
+]


 class Encoder(nn.Module):
@@ -130,8 +175,8 @@ class MaskedSoftmax(torch.autograd.Function):
     @staticmethod
     def backward(self, grad_output):
         output, = self.saved_tensors
-        …
-        return …
+        input_grad = softmax_backward_data(self, grad_output, output, self.dim, output)
+        return input_grad, None, None


 class Attention(nn.Module):
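For reference, the quantity that the backward pass computes here (whether through the old private torch import or the transformers.pytorch_utils.softmax_backward_data helper it switches to) is the ordinary softmax Jacobian-vector product. A minimal standalone check of that identity against autograd; the helper name softmax_grad exists only for this sketch and is not part of the file:

import torch

def softmax_grad(grad_output, output, dim):
    # dL/dx = y * (dL/dy - sum_j y_j * dL/dy_j), where y = softmax(x)
    return output * (grad_output - (output * grad_output).sum(dim, keepdim=True))

x = torch.randn(2, 4, requires_grad=True)
y = torch.softmax(x, dim=-1)
g = torch.randn_like(y)
y.backward(g)
assert torch.allclose(x.grad, softmax_grad(g, y.detach(), dim=-1), atol=1e-6)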
@@ -195,25 +240,21 @@ class Attention(nn.Module):
         hidden_states = self.pre_layer_norm(hidden_states)

         query, key = self.in_proj_qk(hidden_states).chunk(2, dim=2)  # shape: [T, B, D]
-        key = key * self.scale
         value = self.in_proj_v(hidden_states)  # shape: [T, B, D]

         query = query.reshape(query_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)
         key = key.reshape(key_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)
         value = value.view(key_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)

-        attention_scores = torch.bmm(query, key.transpose(1, 2))
+        attention_scores = torch.bmm(query, key.transpose(1, 2) * self.scale)

         pos = self.in_proj_qk(self.dropout(relative_embedding))  # shape: [2T-1, 2D]
-        …
-        query_pos, key_pos = pos.chunk(2, dim=2)
-        key_pos = key_pos * self.scale
-
+        query_pos, key_pos = pos.view(-1, self.num_heads, 2*self.head_size).chunk(2, dim=2)
         query = query.view(batch_size, self.num_heads, query_len, self.head_size)
-        key = key.view(batch_size, self.num_heads, …
+        key = key.view(batch_size, self.num_heads, query_len, self.head_size)

-        attention_c_p = torch.einsum("bhqd,khd->bhqk", query, key_pos)
-        attention_p_c = torch.einsum("bhkd,qhd->bhqk", key, query_pos)
+        attention_c_p = torch.einsum("bhqd,khd->bhqk", query, key_pos.squeeze(1) * self.scale)
+        attention_p_c = torch.einsum("bhkd,qhd->bhqk", key * self.scale, query_pos.squeeze(1))

         position_indices = self.position_indices[:query_len, :key_len].expand(batch_size, self.num_heads, -1, -1)
         attention_c_p = attention_c_p.gather(3, position_indices)
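The scaling refactor above folds the 1/sqrt(head_size) factor into the matmul and einsums instead of scaling the key tensor in place beforehand; the scores are unchanged because elementwise scaling commutes with the transpose and the product. A quick standalone check with made-up shapes:

import torch

batch_heads, seq_len, head_size = 6, 5, 8
scale = head_size ** -0.5
query = torch.randn(batch_heads, seq_len, head_size)
key = torch.randn(batch_heads, seq_len, head_size)

old_scores = torch.bmm(query, (key * scale).transpose(1, 2))  # pre-scaled key, as in the removed line
new_scores = torch.bmm(query, key.transpose(1, 2) * scale)    # scaling inside the matmul, as added
assert torch.allclose(old_scores, new_scores, atol=1e-6)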
@@ -223,7 +264,7 @@ class Attention(nn.Module):
         attention_scores.add_(attention_c_p)
         attention_scores.add_(attention_p_c)

-        return attention_scores, …
+        return attention_scores, value

     def compute_output(self, attention_probs, value):
         attention_probs = self.dropout(attention_probs)
@@ -269,20 +310,65 @@ class Embedding(nn.Module):
 # HuggingFace wrappers
 #

-class LTGBertPreTrainedModel(PreTrainedModel):
- …
- …
+class LtgBertPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = LtgBertConfig
+    base_model_prefix = "bnc-bert"
     supports_gradient_checkpointing = True

     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, Encoder):
             module.activation_checkpointing = value

-    def _init_weights(self, …
+    def _init_weights(self, _):
         pass  # everything is already initialized


-class LTGBertModel(LTGBertPreTrainedModel):
+LTG_BERT_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`LtgBertConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+LTG_BERT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare LTG-BERT transformer outputting raw hidden-states without any specific head on top.",
+    LTG_BERT_START_DOCSTRING,
+)
+class LtgBertModel(LtgBertPreTrainedModel):
     def __init__(self, config, add_mlm_layer=False):
         super().__init__(config)
         self.config = config
@@ -326,31 +412,40 @@ class LTGBertModel(LTGBertPreTrainedModel):
         ]
         return last_layer, contextualized_embeddings, attention_probs

+    @add_start_docstrings_to_model_forward(LTG_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple[torch.Tensor], BaseModelOutput]:
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

         sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)

         if not return_dict:
-            return …
+            return (
+                sequence_output,
+                *([contextualized_embeddings] if output_hidden_states else []),
+                *([attention_probs] if output_attentions else [])
+            )

         return BaseModelOutput(
             last_hidden_state=sequence_output,
-            hidden_states=contextualized_embeddings,
-            attentions=attention_probs
+            hidden_states=contextualized_embeddings if output_hidden_states else None,
+            attentions=attention_probs if output_attentions else None
         )


-class LTGBertForMaskedLM(LTGBertModel):
+@add_start_docstrings("""LTG-BERT model with a `language modeling` head on top.""", LTG_BERT_START_DOCSTRING)
+class LtgBertForMaskedLM(LtgBertModel):
     _keys_to_ignore_on_load_unexpected = ["head"]

     def __init__(self, config):
@@ -362,36 +457,44 @@ class LTGBertForMaskedLM(LTGBertModel):
     def set_output_embeddings(self, new_embeddings):
         self.classifier.nonlinearity[-1].weight = new_embeddings

+    @add_start_docstrings_to_model_forward(LTG_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.LongTensor] = None,
     ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

         sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
         subword_prediction = self.classifier(sequence_output)
-        subword_prediction[:, :, :106+1] = float("-inf")

         masked_lm_loss = None
         if labels is not None:
             masked_lm_loss = F.cross_entropy(subword_prediction.flatten(0, 1), labels.flatten())

         if not return_dict:
-            output = (…
+            output = (
+                subword_prediction,
+                *([contextualized_embeddings] if output_hidden_states else []),
+                *([attention_probs] if output_attentions else [])
+            )
             return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

         return MaskedLMOutput(
             loss=masked_lm_loss,
             logits=subword_prediction,
-            hidden_states=contextualized_embeddings,
-            attentions=attention_probs
+            hidden_states=contextualized_embeddings if output_hidden_states else None,
+            attentions=attention_probs if output_attentions else None
         )

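Note that the removed line forced the logits of the first 107 vocabulary entries (indices 0 to 106, presumably the special tokens) to -inf inside forward, so after this change the MLM head returns unfiltered scores. A caller that still wants the old behaviour can reapply the mask outside the model; a sketch with illustrative shapes:

import torch

num_special_tokens = 106 + 1           # the range masked by the removed line
logits = torch.randn(1, 6, 16384)      # (batch, sequence, vocab) from the MLM head; shapes are made up
logits[:, :, :num_special_tokens] = float("-inf")
predictions = logits.argmax(dim=-1)    # special tokens can no longer be predicted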
@@ -399,8 +502,7 @@ class Classifier(nn.Module):
     def __init__(self, config, num_labels: int):
         super().__init__()

-        drop_out = getattr(config, "…
-        drop_out = config.hidden_dropout_prob if drop_out is None else drop_out
+        drop_out = getattr(config, "classifier_dropout", config.hidden_dropout_prob)

         self.nonlinearity = nn.Sequential(
             nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False),
@@ -424,7 +526,14 @@ class Classifier(nn.Module):
         return x


-class LTGBertForSequenceClassification(LTGBertModel):
+@add_start_docstrings(
+    """
+    LTG-BERT model with a sequence classification/regression head on top (a linear layer on top of the pooled
+    output) e.g. for GLUE tasks.
+    """,
+    LTG_BERT_START_DOCSTRING,
+)
+class LtgBertForSequenceClassification(LtgBertModel):
     _keys_to_ignore_on_load_unexpected = ["classifier"]
     _keys_to_ignore_on_load_missing = ["head"]

@@ -434,17 +543,22 @@ class LTGBertForSequenceClassification(LTGBertModel):
         self.num_labels = config.num_labels
         self.head = Classifier(config, self.num_labels)

+    @add_start_docstrings_to_model_forward(LTG_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.LongTensor] = None,
     ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

         sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
@@ -474,18 +588,29 @@ class LTGBertForSequenceClassification(LTGBertModel):
                 loss = loss_fct(logits, labels)

         if not return_dict:
-            output = (…
+            output = (
+                logits,
+                *([contextualized_embeddings] if output_hidden_states else []),
+                *([attention_probs] if output_attentions else [])
+            )
             return ((loss,) + output) if loss is not None else output

         return SequenceClassifierOutput(
             loss=loss,
             logits=logits,
-            hidden_states=contextualized_embeddings,
-            attentions=attention_probs
+            hidden_states=contextualized_embeddings if output_hidden_states else None,
+            attentions=attention_probs if output_attentions else None
         )


-class LTGBertForTokenClassification(LTGBertModel):
+@add_start_docstrings(
+    """
+    LTG-BERT model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    LTG_BERT_START_DOCSTRING,
+)
+class LtgBertForTokenClassification(LtgBertModel):
     _keys_to_ignore_on_load_unexpected = ["classifier"]
     _keys_to_ignore_on_load_missing = ["head"]

@@ -495,6 +620,7 @@ class LTGBertForTokenClassification(LTGBertModel):
         self.num_labels = config.num_labels
         self.head = Classifier(config, self.num_labels)

+    @add_start_docstrings_to_model_forward(LTG_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -517,18 +643,29 @@ class LTGBertForTokenClassification(LTGBertModel):
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

         if not return_dict:
-            output = (…
+            output = (
+                logits,
+                *([contextualized_embeddings] if output_hidden_states else []),
+                *([attention_probs] if output_attentions else [])
+            )
             return ((loss,) + output) if loss is not None else output

         return TokenClassifierOutput(
             loss=loss,
             logits=logits,
-            hidden_states=contextualized_embeddings,
-            attentions=attention_probs
+            hidden_states=contextualized_embeddings if output_hidden_states else None,
+            attentions=attention_probs if output_attentions else None
         )


-class LTGBertForQuestionAnswering(LTGBertModel):
+@add_start_docstrings(
+    """
+    LTG-BERT model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    LTG_BERT_START_DOCSTRING,
+)
+class LtgBertForQuestionAnswering(LtgBertModel):
     _keys_to_ignore_on_load_unexpected = ["classifier"]
     _keys_to_ignore_on_load_missing = ["head"]

@@ -538,6 +675,7 @@ class LTGBertForQuestionAnswering(LTGBertModel):
         self.num_labels = config.num_labels
         self.head = Classifier(config, self.num_labels)

+    @add_start_docstrings_to_model_forward(LTG_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -578,19 +716,31 @@ class LTGBertForQuestionAnswering(LTGBertModel):
             total_loss = (start_loss + end_loss) / 2

         if not return_dict:
-            output = …
+            output = (
+                start_logits,
+                end_logits,
+                *([contextualized_embeddings] if output_hidden_states else []),
+                *([attention_probs] if output_attentions else [])
+            )
             return ((total_loss,) + output) if total_loss is not None else output

         return QuestionAnsweringModelOutput(
             loss=total_loss,
             start_logits=start_logits,
             end_logits=end_logits,
-            hidden_states=contextualized_embeddings,
-            attentions=attention_probs
+            hidden_states=contextualized_embeddings if output_hidden_states else None,
+            attentions=attention_probs if output_attentions else None
         )


-class LTGBertForMultipleChoice(LTGBertModel):
+@add_start_docstrings(
+    """
+    LTG-BERT model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    LTG_BERT_START_DOCSTRING,
+)
+class LtgBertForMultipleChoice(LtgBertModel):
     _keys_to_ignore_on_load_unexpected = ["classifier"]
     _keys_to_ignore_on_load_missing = ["head"]

@@ -600,6 +750,7 @@ class LTGBertForMultipleChoice(LTGBertModel):
         self.num_labels = getattr(config, "num_labels", 2)
         self.head = Classifier(config, self.num_labels)

+    @add_start_docstrings_to_model_forward(LTG_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -607,9 +758,9 @@ class LTGBertForMultipleChoice(LTGBertModel):
         token_type_ids: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
         labels: Optional[torch.Tensor] = None,
-        …
-        …
-        …
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None
     ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         num_choices = input_ids.shape[1]
@@ -627,13 +778,16 @@ class LTGBertForMultipleChoice(LTGBertModel):
             loss = loss_fct(reshaped_logits, labels)

         if not return_dict:
-            output = (…
+            output = (
+                reshaped_logits,
+                *([contextualized_embeddings] if output_hidden_states else []),
+                *([attention_probs] if output_attentions else [])
+            )
             return ((loss,) + output) if loss is not None else output

         return MultipleChoiceModelOutput(
             loss=loss,
             logits=reshaped_logits,
-            hidden_states=contextualized_embeddings,
-            attentions=attention_probs
+            hidden_states=contextualized_embeddings if output_hidden_states else None,
+            attentions=attention_probs if output_attentions else None
         )
- …
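A usage sketch for the renamed classes, assuming the repository's config.json maps them to the Auto classes via auto_map and that [MASK] is the tokenizer's mask token; ltg/bnc-bert-span is the checkpoint named in _CHECKPOINT_FOR_DOC above:

from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("ltg/bnc-bert-span")
model = AutoModelForMaskedLM.from_pretrained("ltg/bnc-bert-span", trust_remote_code=True)

inputs = tokenizer("The capital of Norway is [MASK].", return_tensors="pt")
outputs = model(**inputs)
print(outputs.logits.shape)  # (batch_size, sequence_length, vocab_size)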