Add files using upload-large-folder tool

6f0b660 verified 5 months ago

110 kB

	# Copyright 2020 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import warnings
	from dataclasses import dataclass
	from typing import Optional

	import torch

	from .cache_utils import Cache, EncoderDecoderCache
	from .utils import ModelOutput


	@dataclass
	class BaseModelOutput(ModelOutput):
	    """
	    Minimal container for a model's outputs: the final hidden states and, optionally, the per-layer hidden states
	    and attention maps.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Hidden states produced by the last layer of the model.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            One `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)` per layer output, preceded
	            by the embedding output when the model has an embedding layer.

	            Hidden states of the model after every layer, plus the optional initial embedding output.
	        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            One `torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)` per layer.

	            Attention weights taken after the attention softmax; they are the weights of the weighted average
	            computed in the self-attention heads.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class BaseModelOutputWithNoAttention(ModelOutput):
	    """
	    Container for the outputs of a model that exposes hidden states but no attention maps.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
	            Hidden states produced by the last layer of the model.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            One `torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` per layer output, preceded
	            by the embedding output when the model has an embedding layer.

	            Hidden states of the model after every layer, plus the optional initial embedding output.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class BaseModelOutputWithPooling(ModelOutput):
	    """
	    Container for a model's outputs that additionally carries a pooled version of the last hidden states.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Hidden states produced by the last layer of the model.
	        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
	            Last-layer hidden state of the first token of the sequence (the classification token) after further
	            processing through the layers used for the auxiliary pretraining task. For BERT-family models, for
	            example, this is the classification token passed through a linear layer and a tanh activation function;
	            the weights of that linear layer are trained with the next sentence prediction (classification)
	            objective during pretraining.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            One `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)` per layer output, preceded
	            by the embedding output when the model has an embedding layer.

	            Hidden states of the model after every layer, plus the optional initial embedding output.
	        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            One `torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)` per layer.

	            Attention weights taken after the attention softmax; they are the weights of the weighted average
	            computed in the self-attention heads.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    pooler_output: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class BaseModelOutputWithPoolingAndNoAttention(ModelOutput):
	    """
	    Container for a model's outputs that carries a pooled version of the last hidden states and no attention maps.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
	            Hidden states produced by the last layer of the model.
	        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
	            Last-layer hidden state after a pooling operation over the spatial dimensions.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            One `torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` per layer output, preceded
	            by the embedding output when the model has an embedding layer.

	            Hidden states of the model after every layer, plus the optional initial embedding output.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    pooler_output: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class BaseModelOutputWithPast(ModelOutput):
	    """
	    Container for a model's outputs that may also hold past key/values (used to speed up sequential decoding).

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Hidden states produced by the last layer of the model.

	            When `past_key_values` is used, only the last hidden state of each sequence is output, with shape
	            `(batch_size, 1, hidden_size)`.
	        past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            A [`~cache_utils.Cache`] instance. See the [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache)
	            for more details.

	            Holds pre-computed hidden states (the keys and values of the self-attention blocks and, if
	            `config.is_encoder_decoder=True`, optionally those of the cross-attention blocks) that can be passed
	            back through the `past_key_values` input to speed up sequential decoding.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            One `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)` per layer output, preceded
	            by the embedding output when the model has an embedding layer.

	            Hidden states of the model after every layer, plus the optional initial embedding output.
	        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            One `torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)` per layer.

	            Attention weights taken after the attention softmax; they are the weights of the weighted average
	            computed in the self-attention heads.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[Cache] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class BaseModelOutputWithCrossAttentions(ModelOutput):
	    """
	    Container for a model's outputs: the final hidden states and, optionally, the per-layer hidden states,
	    self-attention maps and cross-attention maps.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Hidden states produced by the last layer of the model.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            One `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)` per layer output, preceded
	            by the embedding output when the model has an embedding layer.

	            Hidden states of the model after every layer, plus the optional initial embedding output.
	        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            One `torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)` per layer.

	            Attention weights taken after the attention softmax; they are the weights of the weighted average
	            computed in the self-attention heads.
	        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
	            One `torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)` per layer.

	            Attention weights of the decoder's cross-attention layer, taken after the attention softmax; they are
	            the weights of the weighted average computed in the cross-attention heads.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
	    """
	    Base class for model's outputs that also contains a pooling of the last hidden states.

	    The arguments below are listed in the order the fields are declared, which is also the positional order of
	    the dataclass-generated `__init__`.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Sequence of hidden-states at the output of the last layer of the model.
	        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
	            Last layer hidden-state of the first token of the sequence (classification token) after further processing
	            through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
	            the classification token after processing through a linear layer and a tanh activation function. The linear
	            layer weights are trained from the next sentence prediction (classification) objective during pretraining.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
	            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
	            input) to speed up sequential decoding.
	        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
	            weighted average in the cross-attention heads.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    pooler_output: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    # NOTE: `past_key_values` sits between `hidden_states` and `attentions`; the declaration order is part of
	    # the positional interface, so it is kept as-is.
	    past_key_values: Optional[Cache] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class BaseModelOutputWithPastAndCrossAttentions(ModelOutput):
	    """
	    Container for a model's outputs that may also hold past key/values (used to speed up sequential decoding),
	    together with optional self-attention and cross-attention maps.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Hidden states produced by the last layer of the model.

	            When `past_key_values` is used, only the last hidden state of each sequence is output, with shape
	            `(batch_size, 1, hidden_size)`.
	        past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            A [`~cache_utils.Cache`] instance. See the [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache)
	            for more details.

	            Holds pre-computed hidden states (the keys and values of the self-attention blocks and, if
	            `config.is_encoder_decoder=True`, optionally those of the cross-attention blocks) that can be passed
	            back through the `past_key_values` input to speed up sequential decoding.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            One `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)` per layer output, preceded
	            by the embedding output when the model has an embedding layer.

	            Hidden states of the model after every layer, plus the optional initial embedding output.
	        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            One `torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)` per layer.

	            Attention weights taken after the attention softmax; they are the weights of the weighted average
	            computed in the self-attention heads.
	        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
	            One `torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)` per layer.

	            Attention weights of the decoder's cross-attention layer, taken after the attention softmax; they are
	            the weights of the weighted average computed in the cross-attention heads.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[Cache] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class MoECausalLMOutputWithPast(ModelOutput):
	    """
	    Base class for causal language model (or autoregressive) outputs as well as Mixture of Expert's router hidden
	    states terms, to train a MoE model.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
	            Language modeling loss (for next-token prediction).
	        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
	            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
	        past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
	            `past_key_values` input) to speed up sequential decoding.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	        z_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
	            z_loss for the sparse modules.
	        aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
	            aux_loss for the sparse modules.
	        router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

	            Router logits of the encoder model, useful to compute the auxiliary loss and the z_loss for the sparse
	            modules.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[Cache] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    z_loss: Optional[torch.FloatTensor] = None
	    aux_loss: Optional[torch.FloatTensor] = None
	    # Variable-length tuple (one tensor per layer), annotated like the other per-layer fields.
	    router_logits: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class MoEModelOutput(ModelOutput):
	    """
	    Base class for model's outputs, with potential hidden states and attentions, as well as the router
	    probabilities of a Mixture of Experts model.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Sequence of hidden-states at the output of the last layer of the model.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	        router_probs (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

	            Raw router probabilities that are computed by MoE routers, these terms are used to compute the auxiliary
	            loss and the z_loss for Mixture of Experts models.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    # Variable-length tuple (one tensor per layer), annotated like the other per-layer fields.
	    router_probs: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class MoeModelOutputWithPast(ModelOutput):
	    """
	    Base class for model's outputs, with potential hidden states and attentions, that may also contain a past
	    key/values (to speed up sequential decoding) and the router logits of a Mixture of Experts model.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Sequence of hidden-states at the output of the last layer of the model.
	        past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
	            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
	            input) to speed up sequential decoding.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	        router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

	            Raw router logits (post-softmax) that are computed by MoE routers, these terms are used to compute the
	            auxiliary loss for Mixture of Experts models.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[Cache] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    # Variable-length tuple (one tensor per layer), annotated like the other per-layer fields.
	    router_logits: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class MoeCausalLMOutputWithPast(ModelOutput):
	    """
	    Base class for causal language model (or autoregressive) with mixture of experts outputs.

	    The arguments below are listed in the order the fields are declared, which is also the positional order of
	    the dataclass-generated `__init__`.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
	            Language modeling loss (for next-token prediction).
	        aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
	            aux_loss for the sparse modules.
	        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
	            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
	        past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
	            `past_key_values` input) to speed up sequential decoding.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	        router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

	            Raw router logits (post-softmax) that are computed by MoE routers, these terms are used to compute the
	            auxiliary loss for Mixture of Experts models.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    # NOTE: `aux_loss` is declared before `logits`; the declaration order is part of the positional interface,
	    # so it is kept as-is.
	    aux_loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[Cache] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    # Variable-length tuple (one tensor per layer), annotated like the other per-layer fields.
	    router_logits: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class MoEModelOutputWithPastAndCrossAttentions(ModelOutput):
	    """
	    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding) as well as
	    Mixture of Expert's router hidden states terms, to train a MoE model.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Sequence of hidden-states at the output of the last layer of the model.

	            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
	            hidden_size)` is output.
	        past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
	            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
	            input) to speed up sequential decoding.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
	            weighted average in the cross-attention heads.
	        router_probs (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

	            Raw router probabilities that are computed by MoE routers, these terms are used to compute the auxiliary
	            loss and the z_loss for Mixture of Experts models.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[Cache] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    # Variable-length tuple (one tensor per layer), annotated like the other per-layer fields.
	    router_probs: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class Seq2SeqModelOutput(ModelOutput):
	    """
	    Container for the outputs of an encoder-decoder model, which also holds pre-computed hidden states that can
	    speed up sequential decoding.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Hidden states produced by the last layer of the model's decoder.

	            When `past_key_values` is used, only the last hidden state of each sequence is output, with shape
	            `(batch_size, 1, hidden_size)`.
	        past_key_values (`EncoderDecoderCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            A [`~cache_utils.EncoderDecoderCache`] instance. See the [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache)
	            for more details.

	            Holds pre-computed hidden states (the keys and values of both the self-attention blocks and the
	            cross-attention blocks) that can be passed back through the `past_key_values` input to speed up
	            sequential decoding.
	        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            One `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)` per layer output, preceded
	            by the embedding output when the model has an embedding layer.

	            Hidden states of the decoder after every layer, plus the optional initial embedding output.
	        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            One `torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)` per layer.

	            Attention weights of the decoder, taken after the attention softmax; they are the weights of the
	            weighted average computed in the self-attention heads.
	        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            One `torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)` per layer.

	            Attention weights of the decoder's cross-attention layer, taken after the attention softmax; they are
	            the weights of the weighted average computed in the cross-attention heads.
	        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
	            Hidden states produced by the last layer of the model's encoder.
	        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            One `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)` per layer output, preceded
	            by the embedding output when the model has an embedding layer.

	            Hidden states of the encoder after every layer, plus the optional initial embedding output.
	        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            One `torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)` per layer.

	            Attention weights of the encoder, taken after the attention softmax; they are the weights of the
	            weighted average computed in the self-attention heads.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[EncoderDecoderCache] = None
	    decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
	    encoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class Seq2SeqMoEModelOutput(ModelOutput):
	    """
	    Base class for the outputs of sequence-to-sequence Mixture of Experts models. Besides the decoder and encoder
	    hidden-states, attentions and router logits, it can carry pre-computed hidden states that speed up sequential
	    decoding.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Sequence of hidden-states at the output of the last layer of the decoder of the model.

	            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
	            hidden_size)` is output.
	        past_key_values (`EncoderDecoderCache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.EncoderDecoderCache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
	            cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
	        decoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer,
	            + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
	        decoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in
	            the self-attention heads.
	        decoder_router_logits (`tuple(torch.FloatTensor)`, optional, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

	            Router logits of the decoder model, useful to compute the auxiliary loss for Mixture of Experts models.
	        cross_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute
	            the weighted average in the cross-attention heads.
	        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, optional):
	            Sequence of hidden-states at the output of the last layer of the encoder of the model.
	        encoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer,
	            + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
	        encoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in
	            the self-attention heads.
	        encoder_router_logits (`tuple(torch.FloatTensor)`, optional, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

	            Router logits of the encoder model, useful to compute the auxiliary loss and the z_loss for the sparse
	            modules.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[EncoderDecoderCache] = None
	    decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    # One tensor per layer, hence a variable-length tuple (`, ...`) like the other per-layer fields.
	    decoder_router_logits: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
	    encoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    # One tensor per layer, hence a variable-length tuple (`, ...`) like the other per-layer fields.
	    encoder_router_logits: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class CausalLMOutput(ModelOutput):
	    """
	    Base container for the outputs of causal (autoregressive) language models.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Next-token prediction (language modeling) loss.
	        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
	            Scores produced by the language modeling head: one score per vocabulary token, before SoftMax.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor`, each of shape `(batch_size, sequence_length, hidden_size)`: one entry for
	            the embedding output (if the model has an embedding layer) plus one entry for the output of each layer.

	            These are the model's hidden-states at the output of every layer, together with the optional initial
	            embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights; they are what the self-attention heads use to compute the weighted
	            average.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class CausalLMOutputWithPast(ModelOutput):
	    """
	    Base container for the outputs of causal (autoregressive) language models that may also return a key/value
	    cache.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Next-token prediction (language modeling) loss.
	        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
	            Scores produced by the language modeling head: one score per vocabulary token, before SoftMax.
	        past_key_values (`Cache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            A [`~cache_utils.Cache`] instance; the [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache)
	            has more details.

	            Holds pre-computed hidden-states (keys and values of the self-attention blocks) that can be passed back
	            through the `past_key_values` input to speed up sequential decoding.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor`, each of shape `(batch_size, sequence_length, hidden_size)`: one entry for
	            the embedding output (if the model has an embedding layer) plus one entry for the output of each layer.

	            These are the model's hidden-states at the output of every layer, together with the optional initial
	            embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights; they are what the self-attention heads use to compute the weighted
	            average.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[Cache] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class CausalLMOutputWithCrossAttentions(ModelOutput):
	    """
	    Base container for the outputs of causal (autoregressive) language models that may also return
	    cross-attention weights and a key/value cache.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Next-token prediction (language modeling) loss.
	        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
	            Scores produced by the language modeling head: one score per vocabulary token, before SoftMax.
	        past_key_values (`Cache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            A [`~cache_utils.Cache`] instance; the [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache)
	            has more details.

	            Holds pre-computed hidden-states (keys and values of the attention blocks) that can be passed back
	            through the `past_key_values` input to speed up sequential decoding.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor`, each of shape `(batch_size, sequence_length, hidden_size)`: one entry for
	            the embedding output (if the model has an embedding layer) plus one entry for the output of each layer.

	            These are the model's hidden-states at the output of every layer, together with the optional initial
	            embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights; they are what the self-attention heads use to compute the weighted
	            average.
	        cross_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax cross-attention weights; they are what the cross-attention heads use to compute the
	            weighted average.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[Cache] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class SequenceClassifierOutputWithPast(ModelOutput):
	    """
	    Base container for the outputs of sentence classification models that may also return a key/value cache.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Classification loss (a regression loss when config.num_labels==1).
	        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
	            Classification scores before SoftMax (regression scores when config.num_labels==1).
	        past_key_values (`Cache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            A [`~cache_utils.Cache`] instance; the [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache)
	            has more details.

	            Holds pre-computed hidden-states (keys and values of the self-attention blocks) that can be passed back
	            through the `past_key_values` input to speed up sequential decoding.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor`, each of shape `(batch_size, sequence_length, hidden_size)`: one entry for
	            the embedding output (if the model has an embedding layer) plus one entry for the output of each layer.

	            These are the model's hidden-states at the output of every layer, together with the optional initial
	            embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights; they are what the self-attention heads use to compute the weighted
	            average.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[Cache] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class MaskedLMOutput(ModelOutput):
	    """
	    Base container for the outputs of masked language models.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Masked language modeling (MLM) loss.
	        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
	            Scores produced by the language modeling head: one score per vocabulary token, before SoftMax.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor`, each of shape `(batch_size, sequence_length, hidden_size)`: one entry for
	            the embedding output (if the model has an embedding layer) plus one entry for the output of each layer.

	            These are the model's hidden-states at the output of every layer, together with the optional initial
	            embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights; they are what the self-attention heads use to compute the weighted
	            average.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class Seq2SeqLMOutput(ModelOutput):
	    """
	    Base container for the outputs of sequence-to-sequence language models.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Language modeling loss.
	        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
	            Scores produced by the language modeling head: one score per vocabulary token, before SoftMax.
	        past_key_values (`EncoderDecoderCache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            A [`~cache_utils.EncoderDecoderCache`] instance; the [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache)
	            has more details.

	            Holds pre-computed hidden-states (keys and values of both the self-attention blocks and the
	            cross-attention blocks) that can be passed back through the `past_key_values` input to speed up
	            sequential decoding.
	        decoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor`, each of shape `(batch_size, sequence_length, hidden_size)`: one entry for
	            the embedding output (if the model has an embedding layer) plus one entry for the output of each layer.

	            These are the decoder's hidden-states at the output of every layer, together with the initial embedding
	            outputs.
	        decoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights of the decoder; they are what its self-attention heads use to compute
	            the weighted average.
	        cross_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights of the decoder's cross-attention layer; they are what the
	            cross-attention heads use to compute the weighted average.
	        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, optional):
	            Hidden-states sequence at the output of the last encoder layer of the model.
	        encoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor`, each of shape `(batch_size, sequence_length, hidden_size)`: one entry for
	            the embedding output (if the model has an embedding layer) plus one entry for the output of each layer.

	            These are the encoder's hidden-states at the output of every layer, together with the initial embedding
	            outputs.
	        encoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights of the encoder; they are what its self-attention heads use to compute
	            the weighted average.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[EncoderDecoderCache] = None
	    decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
	    encoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class Seq2SeqMoEOutput(ModelOutput):
	    """
	    Base class for the outputs of sequence-to-sequence Mixture of Experts language models.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Language modeling loss.
	        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
	            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
	        encoder_z_loss (`torch.FloatTensor`, optional):
	            z_loss term for the encoder. Presumably derived from `encoder_router_logits`; the exact shape and the
	            condition under which it is returned are not defined in this class and should be confirmed against
	            the model that fills it in.
	        decoder_z_loss (`torch.FloatTensor`, optional):
	            z_loss term for the decoder. Presumably derived from `decoder_router_logits`; the exact shape and the
	            condition under which it is returned are not defined in this class and should be confirmed against
	            the model that fills it in.
	        encoder_aux_loss (`torch.FloatTensor`, optional):
	            Auxiliary loss term for the encoder. Presumably derived from `encoder_router_logits`; the exact shape
	            and the condition under which it is returned are not defined in this class and should be confirmed
	            against the model that fills it in.
	        decoder_aux_loss (`torch.FloatTensor`, optional):
	            Auxiliary loss term for the decoder. Presumably derived from `decoder_router_logits`; the exact shape
	            and the condition under which it is returned are not defined in this class and should be confirmed
	            against the model that fills it in.
	        past_key_values (`EncoderDecoderCache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.EncoderDecoderCache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
	            cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
	        decoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer,
	            + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
	        decoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in
	            the self-attention heads.
	        decoder_router_logits (`tuple(torch.FloatTensor)`, optional, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

	            Router logits of the decoder model, useful to compute the auxiliary loss for Mixture of Experts models.
	        cross_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute
	            the weighted average in the cross-attention heads.
	        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, optional):
	            Sequence of hidden-states at the output of the last layer of the encoder of the model.
	        encoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer,
	            + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
	        encoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in
	            the self-attention heads.
	        encoder_router_logits (`tuple(torch.FloatTensor)`, optional, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

	            Router logits of the encoder model, useful to compute the auxiliary loss and z_loss for Mixture of
	            Experts models.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    encoder_z_loss: Optional[torch.FloatTensor] = None
	    decoder_z_loss: Optional[torch.FloatTensor] = None
	    encoder_aux_loss: Optional[torch.FloatTensor] = None
	    decoder_aux_loss: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[EncoderDecoderCache] = None
	    decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    # One tensor per layer, hence a variable-length tuple (`, ...`) like the other per-layer fields.
	    decoder_router_logits: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
	    encoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    # One tensor per layer, hence a variable-length tuple (`, ...`) like the other per-layer fields.
	    encoder_router_logits: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class NextSentencePredictorOutput(ModelOutput):
	    """
	    Base container for the outputs of models that predict whether two sentences are consecutive.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `next_sentence_label` is provided):
	            Classification loss of the next sequence prediction.
	        logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
	            Scores produced by the next sequence prediction (classification) head: the True/False continuation
	            scores, before SoftMax.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor`, each of shape `(batch_size, sequence_length, hidden_size)`: one entry for
	            the embedding output (if the model has an embedding layer) plus one entry for the output of each layer.

	            These are the model's hidden-states at the output of every layer, together with the optional initial
	            embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights; they are what the self-attention heads use to compute the weighted
	            average.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class SequenceClassifierOutput(ModelOutput):
	    """
	    Base container for the outputs of sentence classification models.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Classification loss (a regression loss when config.num_labels==1).
	        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
	            Classification scores before SoftMax (regression scores when config.num_labels==1).
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor`, each of shape `(batch_size, sequence_length, hidden_size)`: one entry for
	            the embedding output (if the model has an embedding layer) plus one entry for the output of each layer.

	            These are the model's hidden-states at the output of every layer, together with the optional initial
	            embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights; they are what the self-attention heads use to compute the weighted
	            average.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class Seq2SeqSequenceClassifierOutput(ModelOutput):
	    """
	    Base container for the outputs of sequence-to-sequence sentence classification models.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Classification loss (a regression loss when config.num_labels==1).
	        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
	            Classification scores before SoftMax (regression scores when config.num_labels==1).
	        past_key_values (`EncoderDecoderCache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            A [`~cache_utils.EncoderDecoderCache`] instance; the [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache)
	            has more details.

	            Holds pre-computed hidden-states (keys and values of both the self-attention blocks and the
	            cross-attention blocks) that can be passed back through the `past_key_values` input to speed up
	            sequential decoding.
	        decoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor`, each of shape `(batch_size, sequence_length, hidden_size)`: one entry for
	            the embedding output (if the model has an embedding layer) plus one entry for the output of each layer.

	            These are the decoder's hidden-states at the output of every layer, together with the initial embedding
	            outputs.
	        decoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights of the decoder; they are what its self-attention heads use to compute
	            the weighted average.
	        cross_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights of the decoder's cross-attention layer; they are what the
	            cross-attention heads use to compute the weighted average.
	        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, optional):
	            Hidden-states sequence at the output of the last encoder layer of the model.
	        encoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor`, each of shape `(batch_size, sequence_length, hidden_size)`: one entry for
	            the embedding output (if the model has an embedding layer) plus one entry for the output of each layer.

	            These are the encoder's hidden-states at the output of every layer, together with the initial embedding
	            outputs.
	        encoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights of the encoder; they are what its self-attention heads use to compute
	            the weighted average.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[EncoderDecoderCache] = None
	    decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
	    encoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class MultipleChoiceModelOutput(ModelOutput):
	    """
	    Base container for the outputs of multiple choice models.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Classification loss.
	        logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
	            Classification scores (before SoftMax).

	            `num_choices` is the second dimension of the input tensors (see `input_ids`).
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor`, each of shape `(batch_size, sequence_length, hidden_size)`: one entry for
	            the embedding output (if the model has an embedding layer) plus one entry for the output of each layer.

	            These are the model's hidden-states at the output of every layer, together with the optional initial
	            embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights; they are what the self-attention heads use to compute the weighted
	            average.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class TokenClassifierOutput(ModelOutput):
	    """
	    Base container for the outputs of token classification models.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Classification loss.
	        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
	            Classification scores (before SoftMax).
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor`, each of shape `(batch_size, sequence_length, hidden_size)`: one entry for
	            the embedding output (if the model has an embedding layer) plus one entry for the output of each layer.

	            These are the model's hidden-states at the output of every layer, together with the optional initial
	            embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor`, one per layer, each of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Post-softmax attention weights; they are what the self-attention heads use to compute the weighted
	            average.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class QuestionAnsweringModelOutput(ModelOutput):
	    """
	    Base class for outputs of question answering models.

	    Every field defaults to `None`.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Total span extraction loss: the sum of a Cross-Entropy for the start and end positions.
	        start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
	            Span-start scores (before SoftMax).
	        end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
	            Span-end scores (before SoftMax).
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    start_logits: Optional[torch.FloatTensor] = None
	    end_logits: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
	    """
	    Base class for outputs of sequence-to-sequence question answering models.

	    Every field defaults to `None`.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Total span extraction loss: the sum of a Cross-Entropy for the start and end positions.
	        start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
	            Span-start scores (before SoftMax).
	        end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
	            Span-end scores (before SoftMax).
	        past_key_values (`EncoderDecoderCache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.EncoderDecoderCache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
	            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
	        decoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
	        decoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
	            self-attention heads.
	        cross_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
	            weighted average in the cross-attention heads.
	        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, optional):
	            Sequence of hidden-states at the output of the last layer of the encoder of the model.
	        encoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
	        encoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
	            self-attention heads.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    start_logits: Optional[torch.FloatTensor] = None
	    end_logits: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[EncoderDecoderCache] = None
	    decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
	    encoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class SemanticSegmenterOutput(ModelOutput):
	    """
	    Base class for outputs of semantic segmentation models.

	    Every field defaults to `None`.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Classification (or regression if config.num_labels==1) loss.
	        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
	            Classification scores for each pixel.

	            <Tip warning={true}>

	            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
	            to avoid doing two interpolations and losing some quality when a user needs to resize the logits to the
	            original image size as post-processing. You should always check your logits shape and resize as needed.

	            </Tip>

	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class ImageClassifierOutput(ModelOutput):
	    """
	    Base class for outputs of image classification models.

	    Every field defaults to `None`.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Classification (or regression if config.num_labels==1) loss.
	        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
	            Classification (or regression if config.num_labels==1) scores (before SoftMax).
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
	            (also called feature maps) of the model at the output of each stage.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class ImageClassifierOutputWithNoAttention(ModelOutput):
	    """
	    Base class for outputs of image classification models that carry no `attentions` field.

	    Every field defaults to `None`.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Classification (or regression if config.num_labels==1) loss.
	        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
	            Classification (or regression if config.num_labels==1) scores (before SoftMax).
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also
	            called feature maps) of the model at the output of each stage.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class DepthEstimatorOutput(ModelOutput):
	    """
	    Base class for outputs of depth estimation models.

	    Every field defaults to `None`.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Classification (or regression if config.num_labels==1) loss.
	        predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`):
	            Predicted depth for each pixel.

	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    predicted_depth: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class ImageSuperResolutionOutput(ModelOutput):
	    """
	    Base class for outputs of image super resolution models.

	    Every field defaults to `None`.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Reconstruction loss.
	        reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
	            Reconstructed images, possibly upscaled.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
	            (also called feature maps) of the model at the output of each stage.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    reconstruction: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class Wav2Vec2BaseModelOutput(ModelOutput):
	    """
	    Base class for models that have been trained with the Wav2Vec2 loss objective.

	    Every field defaults to `None`.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Sequence of hidden-states at the output of the last layer of the model.
	        extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
	            Sequence of extracted feature vectors of the last convolutional layer of the model.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
	            shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    extract_features: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class XVectorOutput(ModelOutput):
	    """
	    Output type of [`Wav2Vec2ForXVector`].

	    Every field defaults to `None`.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Classification loss.
	        logits (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
	            Classification hidden states before AMSoftmax.
	        embeddings (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
	            Utterance embeddings used for vector similarity-based retrieval.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
	            shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    embeddings: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class BackboneOutput(ModelOutput):
	    """
	    Base class for outputs of backbones.

	    Every field defaults to `None`.

	    Args:
	        feature_maps (`tuple(torch.FloatTensor)` of shape `(batch_size, num_channels, height, width)`):
	            Feature maps of the stages.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
	            shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, num_channels, height, width)`,
	            depending on the backbone.

	            Hidden-states of the model at the output of each stage plus the initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`. Only applicable if the backbone uses attention.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	    """

	    # Variadic tuple: the docstring describes feature maps for several stages, so the
	    # single-element form `tuple[torch.FloatTensor]` was too narrow for type checkers.
	    feature_maps: Optional[tuple[torch.FloatTensor, ...]] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class BaseModelOutputWithPoolingAndProjection(ModelOutput):
	    """
	    Base class for model's outputs that also contains a pooling of the last hidden states.

	    Every field defaults to `None`.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Sequence of hidden-states at the output of the last layer of the model.
	        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
	            Last layer hidden-state of the first token of the sequence (classification token) after further processing
	            through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
	            the classification token after processing through a linear layer and a tanh activation function. The linear
	            layer weights are trained from the next sentence prediction (classification) objective during pretraining.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	        projection_state (`tuple(torch.FloatTensor)`, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` of shape `(batch_size, config.project_dim)`.

	            Text embeddings before the projection layer, used to mimic the last hidden state of the teacher encoder.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    pooler_output: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    # NOTE(review): the documented "returned when" condition for `projection_state` repeats the
	    # one for `attentions`, and the annotation is a single-element tuple unlike the variadic
	    # tuples above -- confirm both against the models that populate this field.
	    projection_state: Optional[tuple[torch.FloatTensor]] = None


	@dataclass
	class Seq2SeqSpectrogramOutput(ModelOutput):
	    """
	    Base class for sequence-to-sequence spectrogram outputs.

	    Every field defaults to `None`.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Spectrogram generation loss.
	        spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
	            The predicted spectrogram.
	        past_key_values (`EncoderDecoderCache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.EncoderDecoderCache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
	            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
	        decoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
	        decoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
	            self-attention heads.
	        cross_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
	            weighted average in the cross-attention heads.
	        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, optional):
	            Sequence of hidden-states at the output of the last layer of the encoder of the model.
	        encoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
	        encoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
	            self-attention heads.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    spectrogram: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[EncoderDecoderCache] = None
	    decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
	    encoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None


	@dataclass
	class Seq2SeqTSModelOutput(ModelOutput):
	    """
	    Base class for time series model's encoder outputs that also contains pre-computed hidden states that can speed up
	    sequential decoding.

	    Every field defaults to `None`.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Sequence of hidden-states at the output of the last layer of the decoder of the model.

	            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
	            hidden_size)` is output.
	        past_key_values (`EncoderDecoderCache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.EncoderDecoderCache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
	            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
	        decoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
	        decoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
	            self-attention heads.
	        cross_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
	            weighted average in the cross-attention heads.
	        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, optional):
	            Sequence of hidden-states at the output of the last layer of the encoder of the model.
	        encoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
	        encoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
	            self-attention heads.
	        loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, optional):
	            Shift values of each time series' context window which is used to give the model inputs of the same
	            magnitude and then used to shift back to the original magnitude.
	        scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, optional):
	            Scaling values of each time series' context window which is used to give the model inputs of the same
	            magnitude and then used to rescale back to the original magnitude.
	        static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, optional):
	            Static features of each time series in a batch which are copied to the covariates at inference time.
	    """

	    last_hidden_state: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[EncoderDecoderCache] = None
	    decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
	    encoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    loc: Optional[torch.FloatTensor] = None
	    scale: Optional[torch.FloatTensor] = None
	    static_features: Optional[torch.FloatTensor] = None


	@dataclass
	class Seq2SeqTSPredictionOutput(ModelOutput):
	    """
	    Base class for time series model's decoder outputs that also contain the loss as well as the parameters of the
	    chosen distribution.

	    Every field defaults to `None`.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `future_values` is provided):
	            Distributional loss.
	        params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`):
	            Parameters of the chosen distribution.
	        past_key_values (`EncoderDecoderCache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.EncoderDecoderCache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
	            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
	        decoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
	        decoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
	            self-attention heads.
	        cross_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
	            weighted average in the cross-attention heads.
	        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, optional):
	            Sequence of hidden-states at the output of the last layer of the encoder of the model.
	        encoder_hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
	        encoder_attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
	            self-attention heads.
	        loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, optional):
	            Shift values of each time series' context window which is used to give the model inputs of the same
	            magnitude and then used to shift back to the original magnitude.
	        scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, optional):
	            Scaling values of each time series' context window which is used to give the model inputs of the same
	            magnitude and then used to rescale back to the original magnitude.
	        static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, optional):
	            Static features of each time series in a batch which are copied to the covariates at inference time.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    # NOTE(review): the docstring documents `params` as a single tensor while the annotation
	    # here is a tuple of tensors -- confirm which one is intended and align the other.
	    params: Optional[tuple[torch.FloatTensor, ...]] = None
	    past_key_values: Optional[EncoderDecoderCache] = None
	    decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
	    encoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    encoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
	    loc: Optional[torch.FloatTensor] = None
	    scale: Optional[torch.FloatTensor] = None
	    static_features: Optional[torch.FloatTensor] = None


	@dataclass
	class SampleTSPredictionOutput(ModelOutput):
	    """
	    Output container for time series prediction models, holding the values sampled from the chosen distribution.

	    Args:
	        sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, num_samples, prediction_length, input_size)`):
	            Values sampled from the chosen distribution. Defaults to `None` when not provided.
	    """

	    sequences: Optional[torch.FloatTensor] = None


	@dataclass
	class MaskedImageModelingOutput(ModelOutput):
	    """
	    Base class for outputs of masked image completion / in-painting models.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `bool_masked_pos` is provided):
	            Reconstruction loss.
	        reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
	            Reconstructed / completed images.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or
	            when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
	            (also called feature maps) of the model at the output of each stage.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when
	            `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
	            sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
	            the self-attention heads.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    reconstruction: Optional[torch.FloatTensor] = None
	    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
	    attentions: Optional[tuple[torch.FloatTensor, ...]] = None

	    @property
	    def logits(self):
	        """
	        Deprecated read-only alias for `reconstruction`.

	        Calls `warnings.warn` with a `FutureWarning` on access and then returns `self.reconstruction`.
	        """
	        warnings.warn(
	            "logits attribute is deprecated and will be removed in version 5 of Transformers."
	            " Please use the reconstruction attribute to retrieve the final output instead.",
	            FutureWarning,
	            # Attribute the warning to the code that accessed `.logits` rather than to this property,
	            # so users can locate the call site they need to migrate.
	            stacklevel=2,
	        )
	        return self.reconstruction