from functools import partial
from typing import Optional

import torch
import torch.nn as nn

from .cache_utils import Cache
from .modeling_outputs import (
    BaseModelOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from .models.auto import AutoModel
from .processing_utils import Unpack
from .utils import TransformersKwargs, auto_docstring, can_return_tuple, logging


logger = logging.get_logger(__name__)

class GradientCheckpointingLayer(nn.Module):
    """Base class for layers with gradient checkpointing.

    This class enables gradient checkpointing functionality for a layer. By default, gradient checkpointing is disabled
    (`gradient_checkpointing = False`). When `model.set_gradient_checkpointing()` is called, gradient checkpointing is
    enabled by setting `gradient_checkpointing = True` and assigning a checkpointing function to
    `_gradient_checkpointing_func`.

    Important:

        When using gradient checkpointing with `use_reentrant=True`, inputs that require gradients (e.g. hidden states)
        must be passed as positional arguments (`*args`) rather than keyword arguments to properly propagate gradients.

        Example:

            ```python
            >>> # Correct - hidden_states passed as positional arg
            >>> out = self.layer(hidden_states, attention_mask=attention_mask)

            >>> # Incorrect - hidden_states passed as keyword arg
            >>> out = self.layer(hidden_states=hidden_states, attention_mask=attention_mask)
            ```
    """

    gradient_checkpointing = False

    def __call__(self, *args, **kwargs):
        if self.gradient_checkpointing and self.training:
            do_warn = False
            layer_name = self.__class__.__name__
            message = f"Caching is incompatible with gradient checkpointing in {layer_name}. Setting"

            if "use_cache" in kwargs and kwargs["use_cache"]:
                kwargs["use_cache"] = False
                message += " `use_cache=False`,"
                do_warn = True

            if "past_key_value" in kwargs and kwargs["past_key_value"] is not None:
                kwargs["past_key_value"] = None
                message += " `past_key_value=None`,"
                do_warn = True

            if "past_key_values" in kwargs and kwargs["past_key_values"] is not None:
                kwargs["past_key_values"] = None
                message += " `past_key_values=None`,"
                do_warn = True

            if "layer_past" in kwargs and kwargs["layer_past"] is not None:
                kwargs["layer_past"] = None
                message += " `layer_past=None`,"
                do_warn = True

            if do_warn:
                message = message.rstrip(",") + "."
                logger.warning_once(message)
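            # Keyword arguments are bound with `partial` so that only the positional tensors (which may require
            # gradients) are passed through the checkpointing function, as described in the class docstring.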
            return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args)
        return super().__call__(*args, **kwargs)
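
# Illustrative sketch (hypothetical names, not part of this module's API): a decoder layer would typically subclass
# GradientCheckpointingLayer and define its usual forward. The sub-modules `MyAttention` and `MyMLP` below are
# placeholders assumed only for illustration.
#
#     class MyDecoderLayer(GradientCheckpointingLayer):
#         def __init__(self, config):
#             super().__init__()
#             self.self_attn = MyAttention(config)
#             self.mlp = MyMLP(config)
#
#         def forward(self, hidden_states, attention_mask=None, past_key_values=None, use_cache=False, **kwargs):
#             ...
#
# Once `model.set_gradient_checkpointing()` has enabled checkpointing and the model is in training mode, calling
# `layer(hidden_states, attention_mask=mask, use_cache=True)` re-routes through `_gradient_checkpointing_func` and
# disables cache-related kwargs with a one-time warning; `hidden_states` must stay positional (see docstring above).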

@auto_docstring
class GenericForSequenceClassification:
    base_model_prefix = "model"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        setattr(self, self.base_model_prefix, AutoModel.from_config(config))
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> SequenceClassifierOutputWithPast:
        transformer_outputs: BaseModelOutputWithPast = getattr(self, self.base_model_prefix)(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )
        hidden_states = transformer_outputs.last_hidden_state
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
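            # To handle both left- and right-padding, take the rightmost token that is not the pad token.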
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds`."
            )
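        # Pool by gathering, for each sequence in the batch, the logits at its last non-pad position.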
        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
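
# Illustrative usage (an assumption with hypothetical names; the concrete classes are not defined in this file): the
# Generic* heads are written as mixins, so a model would typically combine one with its model-specific pretrained base
# class and inherit the mixin's `__init__` and `forward` unchanged.
#
#     class MyModelForSequenceClassification(GenericForSequenceClassification, MyModelPreTrainedModel):
#         pass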

@auto_docstring
class GenericForQuestionAnswering:
    base_model_prefix = "model"

    def __init__(self, config):
        super().__init__(config)
        setattr(self, self.base_model_prefix, AutoModel.from_config(config))
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        self.post_init()

    def get_input_embeddings(self):
        return getattr(self, self.base_model_prefix).embed_tokens

    def set_input_embeddings(self, value):
        getattr(self, self.base_model_prefix).embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> QuestionAnsweringModelOutput:
        outputs: BaseModelOutputWithPast = getattr(self, self.base_model_prefix)(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )

        sequence_output = outputs.last_hidden_state
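        # One linear head produces two scores per token; split them into start- and end-position logits.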
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        loss = None
        if start_positions is not None and end_positions is not None:
            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)

        return QuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@auto_docstring
class GenericForTokenClassification:
    base_model_prefix = "model"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        setattr(self, self.base_model_prefix, AutoModel.from_config(config))
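        # Prefer an explicit `classifier_dropout`, fall back to the model's `hidden_dropout`, and default to 0.1.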
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> TokenClassifierOutput:
        outputs: BaseModelOutputWithPast = getattr(self, self.base_model_prefix)(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )
        sequence_output = outputs.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.config)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )