from functools import partial
from typing import Optional

import torch
import torch.nn as nn

from .cache_utils import Cache
from .modeling_outputs import (
    BaseModelOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from .models.auto import AutoModel
from .processing_utils import Unpack
from .utils import TransformersKwargs, auto_docstring, can_return_tuple, logging


logger = logging.get_logger(__name__)

class GradientCheckpointingLayer(nn.Module):
    """Base class for layers with gradient checkpointing.

    This class enables gradient checkpointing functionality for a layer. By default, gradient checkpointing is disabled
    (`gradient_checkpointing = False`). When `model.set_gradient_checkpointing()` is called, gradient checkpointing is
    enabled by setting `gradient_checkpointing = True` and assigning a checkpointing function to
    `_gradient_checkpointing_func`.

    Important:

        When using gradient checkpointing with `use_reentrant=True`, inputs that require gradients (e.g. hidden states)
        must be passed as positional arguments (`*args`) rather than keyword arguments to properly propagate gradients.

        Example:

            ```python
            >>> # Correct - hidden_states passed as positional arg
            >>> out = self.layer(hidden_states, attention_mask=attention_mask)

            >>> # Incorrect - hidden_states passed as keyword arg
            >>> out = self.layer(hidden_states=hidden_states, attention_mask=attention_mask)
            ```
    """

    gradient_checkpointing = False

    def __call__(self, *args, **kwargs):
        if self.gradient_checkpointing and self.training:
            do_warn = False
            layer_name = self.__class__.__name__
            message = f"Caching is incompatible with gradient checkpointing in {layer_name}. Setting"

            if "use_cache" in kwargs and kwargs["use_cache"]:
                kwargs["use_cache"] = False
                message += " `use_cache=False`,"
                do_warn = True

            if "past_key_value" in kwargs and kwargs["past_key_value"] is not None:
                kwargs["past_key_value"] = None
                message += " `past_key_value=None`,"
                do_warn = True

            if "past_key_values" in kwargs and kwargs["past_key_values"] is not None:
                kwargs["past_key_values"] = None
                message += " `past_key_values=None`,"
                do_warn = True

            if "layer_past" in kwargs and kwargs["layer_past"] is not None:
                kwargs["layer_past"] = None
                message += " `layer_past=None`,"
                do_warn = True

            if do_warn:
                message = message.rstrip(",") + "."
                logger.warning_once(message)
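            # Keyword arguments are bound with `partial` so that only the positional tensors (which may require
            # gradients) are passed through the checkpointing function, as described in the class docstring.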
            return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args)
        return super().__call__(*args, **kwargs)
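
# Illustrative sketch (hypothetical names, not part of this module's API): a decoder layer would typically subclass
# GradientCheckpointingLayer and define its usual forward. The sub-modules `MyAttention` and `MyMLP` below are
# placeholders assumed only for illustration.
#
#     class MyDecoderLayer(GradientCheckpointingLayer):
#         def __init__(self, config):
#             super().__init__()
#             self.self_attn = MyAttention(config)
#             self.mlp = MyMLP(config)
#
#         def forward(self, hidden_states, attention_mask=None, past_key_values=None, use_cache=False, **kwargs):
#             ...
#
# Once `model.set_gradient_checkpointing()` has enabled checkpointing and the model is in training mode, calling
# `layer(hidden_states, attention_mask=mask, use_cache=True)` re-routes through `_gradient_checkpointing_func` and
# disables cache-related kwargs with a one-time warning; `hidden_states` must stay positional (see docstring above).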

@auto_docstring
class GenericForSequenceClassification:
    base_model_prefix = "model"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        setattr(self, self.base_model_prefix, AutoModel.from_config(config))
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> SequenceClassifierOutputWithPast:
        transformer_outputs: BaseModelOutputWithPast = getattr(self, self.base_model_prefix)(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )
        hidden_states = transformer_outputs.last_hidden_state
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
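            # To handle both left- and right-padding, take the rightmost token that is not the pad token.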
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds`."
            )
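        # Pool by gathering, for each sequence in the batch, the logits at its last non-pad position.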
        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
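
# Illustrative usage (an assumption with hypothetical names; the concrete classes are not defined in this file): the
# Generic* heads are written as mixins, so a model would typically combine one with its model-specific pretrained base
# class and inherit the mixin's `__init__` and `forward` unchanged.
#
#     class MyModelForSequenceClassification(GenericForSequenceClassification, MyModelPreTrainedModel):
#         pass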

@auto_docstring
class GenericForQuestionAnswering:
    base_model_prefix = "model"

    def __init__(self, config):
        super().__init__(config)
        setattr(self, self.base_model_prefix, AutoModel.from_config(config))
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        self.post_init()

    def get_input_embeddings(self):
        return getattr(self, self.base_model_prefix).embed_tokens

    def set_input_embeddings(self, value):
        getattr(self, self.base_model_prefix).embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> QuestionAnsweringModelOutput:
        outputs: BaseModelOutputWithPast = getattr(self, self.base_model_prefix)(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )

        sequence_output = outputs.last_hidden_state
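        # One linear head produces two scores per token; split them into start- and end-position logits.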
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        loss = None
        if start_positions is not None and end_positions is not None:
            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)

        return QuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@auto_docstring
class GenericForTokenClassification:
    base_model_prefix = "model"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        setattr(self, self.base_model_prefix, AutoModel.from_config(config))
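        # Prefer an explicit `classifier_dropout`, fall back to the model's `hidden_dropout`, and default to 0.1.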
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> TokenClassifierOutput:
        outputs: BaseModelOutputWithPast = getattr(self, self.base_model_prefix)(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )
        sequence_output = outputs.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.config)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )