diff --git a/.gitattributes b/.gitattributes
index c16337ad34b115a91e34c1f71498c9d32bd70bb9..d77be6c455d46c96e4f44c3df49ba9f1a06640f5 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -433,3 +433,7 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/
.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/transformers/__pycache__/testing_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/__pycache__/__init__.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/models/wav2vec2/__pycache__/modeling_tf_wav2vec2.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/models/wav2vec2_conformer/__pycache__/modeling_wav2vec2_conformer.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/transformers/models/whisper/__pycache__/modeling_whisper.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3188ff24392e96cc499a53419f5db852d4357a37
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/__pycache__/__init__.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db4550dc6bc1e2a167f35d80dff7e798a3afa52c7d1dc8bade4f96cfb2f778ae
+size 272603
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cpm/__init__.py b/.venv/lib/python3.11/site-packages/transformers/models/cpm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aaf4524671fdfe7aa54adab84bda0a4703b3b892
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/cpm/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .tokenization_cpm import *
+ from .tokenization_cpm_fast import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cpm/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/cpm/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b51560441e432901ce9a35ab27a1a9b6e7284aac
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/cpm/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e133e391fa0a1a43cede80811f4e2cb438a9952b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm_fast.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm_fast.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..38404c912bd5f46aaa707a75b2475797f8afe73e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm_fast.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cpm/tokenization_cpm.py b/.venv/lib/python3.11/site-packages/transformers/models/cpm/tokenization_cpm.py
new file mode 100644
index 0000000000000000000000000000000000000000..884068f1a1571037a1502c94c6dbb6ea5bdc22ae
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/cpm/tokenization_cpm.py
@@ -0,0 +1,348 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+import os
+import unicodedata
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import SPIECE_UNDERLINE, logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
+
+
+class CpmTokenizer(PreTrainedTokenizer):
+ """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""
+
+ vocab_files_names = VOCAB_FILES_NAMES
+
+ def __init__(
+ self,
+ vocab_file,
+ do_lower_case=False,
+ remove_space=True,
+ keep_accents=False,
+ bos_token="",
+ eos_token="",
+ unk_token="",
+ sep_token="",
+ pad_token="",
+ cls_token="",
+ mask_token="",
+ additional_special_tokens=["", ""],
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ **kwargs,
+ ) -> None:
+ """
+ Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and
+ [SentencePiece](https://github.com/google/sentencepiece).
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should
+ refer to this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
+ contains the vocabulary necessary to instantiate a tokenizer.
+ do_lower_case (`bool`, *optional*, defaults to `True`):
+ Whether to lowercase the input when tokenizing.
+ remove_space (`bool`, *optional*, defaults to `True`):
+ Whether to strip the text when tokenizing (removing excess spaces before and after the string).
+ keep_accents (`bool`, *optional*, defaults to `False`):
+ Whether to keep accents when tokenizing.
+ bos_token (`str`, *optional*, defaults to `""`):
+ The beginning of sequence token that was used during pretraining. Can be used a sequence classifier
+ token.
+
+
+
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+
+
+ eos_token (`str`, *optional*, defaults to `""`):
+ The end of sequence token.
+
+
+
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+
+
+ unk_token (`str`, *optional*, defaults to `""`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
+ this token instead.
+ sep_token (`str`, *optional*, defaults to `""`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+ for sequence classification or for a text and a question for question answering. It is also used as the
+ last token of a sequence built with special tokens.
+ pad_token (`str`, *optional*, defaults to `""`):
+ The token used for padding, for example when batching sequences of different lengths.
+ cls_token (`str`, *optional*, defaults to `""`):
+ The classifier token which is used when doing sequence classification (classification of the whole
+ sequence instead of per-token classification). It is the first token of the sequence when built with
+ special tokens.
+ mask_token (`str`, *optional*, defaults to `""`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["", ""]`):
+ Additional special tokens used by the tokenizer.
+
+ Attributes:
+ sp_model (`SentencePieceProcessor`):
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+ """
+ # Mask token behave like a normal word, i.e. include the space before it
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+ self.do_lower_case = do_lower_case
+ self.remove_space = remove_space
+ self.keep_accents = keep_accents
+ self.vocab_file = vocab_file
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+
+ try:
+ import jieba
+ except ModuleNotFoundError as error:
+ raise error.__class__(
+ "You need to install jieba to use CpmTokenizer or CpmTokenizerFast. "
+ "See https://pypi.org/project/jieba/ for installation."
+ )
+ self.jieba = jieba
+ self.translator = str.maketrans(" \n", "\u2582\u2583")
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ remove_space=remove_space,
+ keep_accents=keep_accents,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ additional_special_tokens=additional_special_tokens,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+
+ self._pad_token_type_id = 3
+
+ @property
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.vocab_size
+ def vocab_size(self):
+ return len(self.sp_model)
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.get_vocab
+ def get_vocab(self):
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.__getstate__
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ state["sp_model"] = None
+ return state
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.__setstate__
+ def __setstate__(self, d):
+ self.__dict__ = d
+
+ # for backward compatibility
+ if not hasattr(self, "sp_model_kwargs"):
+ self.sp_model_kwargs = {}
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(self.vocab_file)
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.preprocess_text
+ def preprocess_text(self, inputs):
+ if self.remove_space:
+ outputs = " ".join(inputs.strip().split())
+ else:
+ outputs = inputs
+ outputs = outputs.replace("``", '"').replace("''", '"')
+
+ if not self.keep_accents:
+ outputs = unicodedata.normalize("NFKD", outputs)
+ outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
+ if self.do_lower_case:
+ outputs = outputs.lower()
+
+ return outputs
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer._tokenize
+ def _tokenize(self, text: str) -> List[str]:
+ """Tokenize a string."""
+ text = self.preprocess_text(text)
+ pieces = self.sp_model.encode(text, out_type=str)
+ new_pieces = []
+ for piece in pieces:
+ if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
+ cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
+ if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
+ if len(cur_pieces[0]) == 1:
+ cur_pieces = cur_pieces[1:]
+ else:
+ cur_pieces[0] = cur_pieces[0][1:]
+ cur_pieces.append(piece[-1])
+ new_pieces.extend(cur_pieces)
+ else:
+ new_pieces.append(piece)
+
+ return new_pieces
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer._convert_token_to_id
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.sp_model.PieceToId(token)
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer._convert_id_to_token
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.sp_model.IdToPiece(index)
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.convert_tokens_to_string
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (strings for sub-words) in a single string."""
+ out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+ return out_string
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.build_inputs_with_special_tokens
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+ adding special tokens. An XLNet sequence has the following format:
+
+ - single sequence: `X `
+ - pair of sequences: `A B `
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+ sep = [self.sep_token_id]
+ cls = [self.cls_token_id]
+ if token_ids_1 is None:
+ return token_ids_0 + sep + cls
+ return token_ids_0 + sep + token_ids_1 + sep + cls
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.get_special_tokens_mask
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ """
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` method.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ if token_ids_1 is not None:
+ return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
+ return ([0] * len(token_ids_0)) + [1, 1]
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.create_token_type_ids_from_sequences
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
+ sequence pair mask has the following format:
+
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
+
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+ """
+ sep = [self.sep_token_id]
+ cls_segment_id = [2]
+
+ if token_ids_1 is None:
+ return len(token_ids_0 + sep) * [0] + cls_segment_id
+ return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.save_vocabulary
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
+
+ def _decode(self, *args, **kwargs):
+ text = super()._decode(*args, **kwargs)
+ text = text.replace(" ", "").replace("\u2582", " ").replace("\u2583", "\n")
+ return text
+
+
+__all__ = ["CpmTokenizer"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cpm/tokenization_cpm_fast.py b/.venv/lib/python3.11/site-packages/transformers/models/cpm/tokenization_cpm_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef933e084ddb2b375df12cc27aa03a27de55db43
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/cpm/tokenization_cpm_fast.py
@@ -0,0 +1,241 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+import os
+from shutil import copyfile
+from typing import List, Optional, Tuple
+
+from ...tokenization_utils_fast import AddedToken, PreTrainedTokenizerFast
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
+
+
+class CpmTokenizerFast(PreTrainedTokenizerFast):
+ """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""
+
+ def __init__(
+ self,
+ vocab_file=None,
+ tokenizer_file=None,
+ do_lower_case=False,
+ remove_space=True,
+ keep_accents=False,
+ bos_token="",
+ eos_token="",
+ unk_token="",
+ sep_token="",
+ pad_token="",
+ cls_token="",
+ mask_token="",
+ additional_special_tokens=["", ""],
+ **kwargs,
+ ):
+ """
+ Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and
+ [SentencePiece](https://github.com/google/sentencepiece).
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should
+ refer to this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
+ contains the vocabulary necessary to instantiate a tokenizer.
+ do_lower_case (`bool`, *optional*, defaults to `True`):
+ Whether to lowercase the input when tokenizing.
+ remove_space (`bool`, *optional*, defaults to `True`):
+ Whether to strip the text when tokenizing (removing excess spaces before and after the string).
+ keep_accents (`bool`, *optional*, defaults to `False`):
+ Whether to keep accents when tokenizing.
+ bos_token (`str`, *optional*, defaults to `""`):
+ The beginning of sequence token that was used during pretraining. Can be used a sequence classifier
+ token.
+
+
+
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+
+
+ eos_token (`str`, *optional*, defaults to `""`):
+ The end of sequence token.
+
+
+
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+
+
+ unk_token (`str`, *optional*, defaults to `""`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
+ this token instead.
+ sep_token (`str`, *optional*, defaults to `""`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+ for sequence classification or for a text and a question for question answering. It is also used as the
+ last token of a sequence built with special tokens.
+ pad_token (`str`, *optional*, defaults to `""`):
+ The token used for padding, for example when batching sequences of different lengths.
+ cls_token (`str`, *optional*, defaults to `""`):
+ The classifier token which is used when doing sequence classification (classification of the whole
+ sequence instead of per-token classification). It is the first token of the sequence when built with
+ special tokens.
+ mask_token (`str`, *optional*, defaults to `""`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["", ""]`):
+ Additional special tokens used by the tokenizer.
+
+ Attributes:
+ sp_model (`SentencePieceProcessor`):
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+ """
+ # Mask token behave like a normal word, i.e. include the space before it
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+ super().__init__(
+ vocab_file=vocab_file,
+ tokenizer_file=tokenizer_file,
+ do_lower_case=do_lower_case,
+ remove_space=remove_space,
+ keep_accents=keep_accents,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ additional_special_tokens=additional_special_tokens,
+ **kwargs,
+ )
+
+ self._pad_token_type_id = 3
+ self.do_lower_case = do_lower_case
+ self.remove_space = remove_space
+ self.keep_accents = keep_accents
+ self.vocab_file = vocab_file
+
+ try:
+ import jieba
+ except ModuleNotFoundError as error:
+ raise error.__class__(
+ "You need to install jieba to use CpmTokenizer or CpmTokenizerFast. "
+ "See https://pypi.org/project/jieba/ for installation."
+ )
+ self.jieba = jieba
+ self.translator = str.maketrans(" \n", "\u2582\u2583")
+
+ @property
+ def can_save_slow_tokenizer(self) -> bool:
+ return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast.build_inputs_with_special_tokens
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+ adding special tokens. An XLNet sequence has the following format:
+
+ - single sequence: `X `
+ - pair of sequences: `A B `
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+ sep = [self.sep_token_id]
+ cls = [self.cls_token_id]
+ if token_ids_1 is None:
+ return token_ids_0 + sep + cls
+ return token_ids_0 + sep + token_ids_1 + sep + cls
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast.create_token_type_ids_from_sequences
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
+ sequence pair mask has the following format:
+
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
+
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+ """
+ sep = [self.sep_token_id]
+ cls_segment_id = [2]
+
+ if token_ids_1 is None:
+ return len(token_ids_0 + sep) * [0] + cls_segment_id
+ return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast.save_vocabulary
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not self.can_save_slow_tokenizer:
+ raise ValueError(
+ "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+ "tokenizer."
+ )
+
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+
+ return (out_vocab_file,)
+
+ def _batch_encode_plus(self, batch_text_or_text_pairs, *args, **kwargs):
+ batch_text_or_text_pairs = [
+ " ".join([x.translate(self.translator) for x in self.jieba.cut(text, cut_all=False)])
+ for text in batch_text_or_text_pairs
+ ]
+ return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
+
+ def _decode(self, *args, **kwargs):
+ text = super()._decode(*args, **kwargs)
+ text = text.replace(" ", "").replace("\u2582", " ").replace("\u2583", "\n")
+ return text
+
+
+__all__ = ["CpmTokenizerFast"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cvt/__init__.py b/.venv/lib/python3.11/site-packages/transformers/models/cvt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..756aded9e6ad2964ffa5add89af2c4af56071e88
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/cvt/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_cvt import *
+ from .modeling_cvt import *
+ from .modeling_tf_cvt import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cvt/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/cvt/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8640c405a3185c43d45a7dc34e7a8baf65ee175e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/cvt/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cvt/__pycache__/configuration_cvt.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/cvt/__pycache__/configuration_cvt.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff930bac19a85405483927c10730697edb8b8b1e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/cvt/__pycache__/configuration_cvt.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cvt/__pycache__/modeling_cvt.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/cvt/__pycache__/modeling_cvt.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f57948c30c06838112715c42757fac3ca2c97c5e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/cvt/__pycache__/modeling_cvt.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cvt/__pycache__/modeling_tf_cvt.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/cvt/__pycache__/modeling_tf_cvt.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9bc4417a9f3444ece7865e021c95230382005512
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/cvt/__pycache__/modeling_tf_cvt.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cvt/configuration_cvt.py b/.venv/lib/python3.11/site-packages/transformers/models/cvt/configuration_cvt.py
new file mode 100644
index 0000000000000000000000000000000000000000..38cba6874f6860e1270aa4aea8166df309b4f014
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/cvt/configuration_cvt.py
@@ -0,0 +1,146 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""CvT model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class CvtConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CvtModel`]. It is used to instantiate a CvT model
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the CvT
+ [microsoft/cvt-13](https://huggingface.co/microsoft/cvt-13) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ patch_sizes (`List[int]`, *optional*, defaults to `[7, 3, 3]`):
+ The kernel size of each encoder's patch embedding.
+ patch_stride (`List[int]`, *optional*, defaults to `[4, 2, 2]`):
+ The stride size of each encoder's patch embedding.
+ patch_padding (`List[int]`, *optional*, defaults to `[2, 1, 1]`):
+ The padding size of each encoder's patch embedding.
+ embed_dim (`List[int]`, *optional*, defaults to `[64, 192, 384]`):
+ Dimension of each of the encoder blocks.
+ num_heads (`List[int]`, *optional*, defaults to `[1, 3, 6]`):
+ Number of attention heads for each attention layer in each block of the Transformer encoder.
+ depth (`List[int]`, *optional*, defaults to `[1, 2, 10]`):
+ The number of layers in each encoder block.
+ mlp_ratios (`List[float]`, *optional*, defaults to `[4.0, 4.0, 4.0, 4.0]`):
+ Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the
+ encoder blocks.
+ attention_drop_rate (`List[float]`, *optional*, defaults to `[0.0, 0.0, 0.0]`):
+ The dropout ratio for the attention probabilities.
+ drop_rate (`List[float]`, *optional*, defaults to `[0.0, 0.0, 0.0]`):
+ The dropout ratio for the patch embeddings probabilities.
+ drop_path_rate (`List[float]`, *optional*, defaults to `[0.0, 0.0, 0.1]`):
+ The dropout probability for stochastic depth, used in the blocks of the Transformer encoder.
+ qkv_bias (`List[bool]`, *optional*, defaults to `[True, True, True]`):
+ The bias bool for query, key and value in attentions
+ cls_token (`List[bool]`, *optional*, defaults to `[False, False, True]`):
+ Whether or not to add a classification token to the output of each of the last 3 stages.
+ qkv_projection_method (`List[string]`, *optional*, defaults to ["dw_bn", "dw_bn", "dw_bn"]`):
+ The projection method for query, key and value Default is depth-wise convolutions with batch norm. For
+ Linear projection use "avg".
+ kernel_qkv (`List[int]`, *optional*, defaults to `[3, 3, 3]`):
+ The kernel size for query, key and value in attention layer
+ padding_kv (`List[int]`, *optional*, defaults to `[1, 1, 1]`):
+ The padding size for key and value in attention layer
+ stride_kv (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
+ The stride size for key and value in attention layer
+ padding_q (`List[int]`, *optional*, defaults to `[1, 1, 1]`):
+ The padding size for query in attention layer
+ stride_q (`List[int]`, *optional*, defaults to `[1, 1, 1]`):
+ The stride size for query in attention layer
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-6):
+ The epsilon used by the layer normalization layers.
+
+ Example:
+
+ ```python
+ >>> from transformers import CvtConfig, CvtModel
+
+ >>> # Initializing a Cvt msft/cvt style configuration
+ >>> configuration = CvtConfig()
+
+ >>> # Initializing a model (with random weights) from the msft/cvt style configuration
+ >>> model = CvtModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "cvt"
+
+ def __init__(
+ self,
+ num_channels=3,
+ patch_sizes=[7, 3, 3],
+ patch_stride=[4, 2, 2],
+ patch_padding=[2, 1, 1],
+ embed_dim=[64, 192, 384],
+ num_heads=[1, 3, 6],
+ depth=[1, 2, 10],
+ mlp_ratio=[4.0, 4.0, 4.0],
+ attention_drop_rate=[0.0, 0.0, 0.0],
+ drop_rate=[0.0, 0.0, 0.0],
+ drop_path_rate=[0.0, 0.0, 0.1],
+ qkv_bias=[True, True, True],
+ cls_token=[False, False, True],
+ qkv_projection_method=["dw_bn", "dw_bn", "dw_bn"],
+ kernel_qkv=[3, 3, 3],
+ padding_kv=[1, 1, 1],
+ stride_kv=[2, 2, 2],
+ padding_q=[1, 1, 1],
+ stride_q=[1, 1, 1],
+ initializer_range=0.02,
+ layer_norm_eps=1e-12,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.num_channels = num_channels
+ self.patch_sizes = patch_sizes
+ self.patch_stride = patch_stride
+ self.patch_padding = patch_padding
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.depth = depth
+ self.mlp_ratio = mlp_ratio
+ self.attention_drop_rate = attention_drop_rate
+ self.drop_rate = drop_rate
+ self.drop_path_rate = drop_path_rate
+ self.qkv_bias = qkv_bias
+ self.cls_token = cls_token
+ self.qkv_projection_method = qkv_projection_method
+ self.kernel_qkv = kernel_qkv
+ self.padding_kv = padding_kv
+ self.stride_kv = stride_kv
+ self.padding_q = padding_q
+ self.stride_q = stride_q
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+
+
+__all__ = ["CvtConfig"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cvt/modeling_cvt.py b/.venv/lib/python3.11/site-packages/transformers/models/cvt/modeling_cvt.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd68b391ba1ff414b96fa57f91e2450284adf6e9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/cvt/modeling_cvt.py
@@ -0,0 +1,725 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch CvT model."""
+
+import collections.abc
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
+from ...modeling_outputs import ImageClassifierOutputWithNoAttention, ModelOutput
+from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import logging
+from .configuration_cvt import CvtConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "CvtConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "microsoft/cvt-13"
+_EXPECTED_OUTPUT_SHAPE = [1, 384, 14, 14]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "microsoft/cvt-13"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
+
+
+@dataclass
+class BaseModelOutputWithCLSToken(ModelOutput):
+ """
+ Base class for model's outputs, with potential hidden states and attentions.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
+ Classification token at the output of the last layer of the model.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ """
+
+ last_hidden_state: torch.FloatTensor = None
+ cls_token_value: torch.FloatTensor = None
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+ """
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+ argument.
+ """
+ if drop_prob == 0.0 or not training:
+ return input
+ keep_prob = 1 - drop_prob
+ shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+ random_tensor.floor_() # binarize
+ output = input.div(keep_prob) * random_tensor
+ return output
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath
+class CvtDropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
+ super().__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ return drop_path(hidden_states, self.drop_prob, self.training)
+
+ def extra_repr(self) -> str:
+ return "p={}".format(self.drop_prob)
+
+
+class CvtEmbeddings(nn.Module):
+ """
+ Construct the CvT embeddings.
+ """
+
+ def __init__(self, patch_size, num_channels, embed_dim, stride, padding, dropout_rate):
+ super().__init__()
+ self.convolution_embeddings = CvtConvEmbeddings(
+ patch_size=patch_size, num_channels=num_channels, embed_dim=embed_dim, stride=stride, padding=padding
+ )
+ self.dropout = nn.Dropout(dropout_rate)
+
+ def forward(self, pixel_values):
+ hidden_state = self.convolution_embeddings(pixel_values)
+ hidden_state = self.dropout(hidden_state)
+ return hidden_state
+
+
+class CvtConvEmbeddings(nn.Module):
+ """
+ Image to Conv Embedding.
+ """
+
+ def __init__(self, patch_size, num_channels, embed_dim, stride, padding):
+ super().__init__()
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ self.patch_size = patch_size
+ self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=stride, padding=padding)
+ self.normalization = nn.LayerNorm(embed_dim)
+
+ def forward(self, pixel_values):
+ pixel_values = self.projection(pixel_values)
+ batch_size, num_channels, height, width = pixel_values.shape
+ hidden_size = height * width
+ # rearrange "b c h w -> b (h w) c"
+ pixel_values = pixel_values.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
+ if self.normalization:
+ pixel_values = self.normalization(pixel_values)
+ # rearrange "b (h w) c" -> b c h w"
+ pixel_values = pixel_values.permute(0, 2, 1).view(batch_size, num_channels, height, width)
+ return pixel_values
+
+
+class CvtSelfAttentionConvProjection(nn.Module):
+ def __init__(self, embed_dim, kernel_size, padding, stride):
+ super().__init__()
+ self.convolution = nn.Conv2d(
+ embed_dim,
+ embed_dim,
+ kernel_size=kernel_size,
+ padding=padding,
+ stride=stride,
+ bias=False,
+ groups=embed_dim,
+ )
+ self.normalization = nn.BatchNorm2d(embed_dim)
+
+ def forward(self, hidden_state):
+ hidden_state = self.convolution(hidden_state)
+ hidden_state = self.normalization(hidden_state)
+ return hidden_state
+
+
+class CvtSelfAttentionLinearProjection(nn.Module):
+ def forward(self, hidden_state):
+ batch_size, num_channels, height, width = hidden_state.shape
+ hidden_size = height * width
+ # rearrange " b c h w -> b (h w) c"
+ hidden_state = hidden_state.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
+ return hidden_state
+
+
+class CvtSelfAttentionProjection(nn.Module):
+ def __init__(self, embed_dim, kernel_size, padding, stride, projection_method="dw_bn"):
+ super().__init__()
+ if projection_method == "dw_bn":
+ self.convolution_projection = CvtSelfAttentionConvProjection(embed_dim, kernel_size, padding, stride)
+ self.linear_projection = CvtSelfAttentionLinearProjection()
+
+ def forward(self, hidden_state):
+ hidden_state = self.convolution_projection(hidden_state)
+ hidden_state = self.linear_projection(hidden_state)
+ return hidden_state
+
+
+class CvtSelfAttention(nn.Module):
+ def __init__(
+ self,
+ num_heads,
+ embed_dim,
+ kernel_size,
+ padding_q,
+ padding_kv,
+ stride_q,
+ stride_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ with_cls_token=True,
+ **kwargs,
+ ):
+ super().__init__()
+ self.scale = embed_dim**-0.5
+ self.with_cls_token = with_cls_token
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+
+ self.convolution_projection_query = CvtSelfAttentionProjection(
+ embed_dim,
+ kernel_size,
+ padding_q,
+ stride_q,
+ projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method,
+ )
+ self.convolution_projection_key = CvtSelfAttentionProjection(
+ embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
+ )
+ self.convolution_projection_value = CvtSelfAttentionProjection(
+ embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
+ )
+
+ self.projection_query = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+ self.projection_key = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+ self.projection_value = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+
+ self.dropout = nn.Dropout(attention_drop_rate)
+
+ def rearrange_for_multi_head_attention(self, hidden_state):
+ batch_size, hidden_size, _ = hidden_state.shape
+ head_dim = self.embed_dim // self.num_heads
+ # rearrange 'b t (h d) -> b h t d'
+ return hidden_state.view(batch_size, hidden_size, self.num_heads, head_dim).permute(0, 2, 1, 3)
+
+ def forward(self, hidden_state, height, width):
+ if self.with_cls_token:
+ cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
+ batch_size, hidden_size, num_channels = hidden_state.shape
+ # rearrange "b (h w) c -> b c h w"
+ hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)
+
+ key = self.convolution_projection_key(hidden_state)
+ query = self.convolution_projection_query(hidden_state)
+ value = self.convolution_projection_value(hidden_state)
+
+ if self.with_cls_token:
+ query = torch.cat((cls_token, query), dim=1)
+ key = torch.cat((cls_token, key), dim=1)
+ value = torch.cat((cls_token, value), dim=1)
+
+ head_dim = self.embed_dim // self.num_heads
+
+ query = self.rearrange_for_multi_head_attention(self.projection_query(query))
+ key = self.rearrange_for_multi_head_attention(self.projection_key(key))
+ value = self.rearrange_for_multi_head_attention(self.projection_value(value))
+
+ attention_score = torch.einsum("bhlk,bhtk->bhlt", [query, key]) * self.scale
+ attention_probs = torch.nn.functional.softmax(attention_score, dim=-1)
+ attention_probs = self.dropout(attention_probs)
+
+ context = torch.einsum("bhlt,bhtv->bhlv", [attention_probs, value])
+ # rearrange"b h t d -> b t (h d)"
+ _, _, hidden_size, _ = context.shape
+ context = context.permute(0, 2, 1, 3).contiguous().view(batch_size, hidden_size, self.num_heads * head_dim)
+ return context
+
+
+class CvtSelfOutput(nn.Module):
+ """
+ The residual connection is defined in CvtLayer instead of here (as is the case with other models), due to the
+ layernorm applied before each block.
+ """
+
+ def __init__(self, embed_dim, drop_rate):
+ super().__init__()
+ self.dense = nn.Linear(embed_dim, embed_dim)
+ self.dropout = nn.Dropout(drop_rate)
+
+ def forward(self, hidden_state, input_tensor):
+ hidden_state = self.dense(hidden_state)
+ hidden_state = self.dropout(hidden_state)
+ return hidden_state
+
+
+class CvtAttention(nn.Module):
+ def __init__(
+ self,
+ num_heads,
+ embed_dim,
+ kernel_size,
+ padding_q,
+ padding_kv,
+ stride_q,
+ stride_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ drop_rate,
+ with_cls_token=True,
+ ):
+ super().__init__()
+ self.attention = CvtSelfAttention(
+ num_heads,
+ embed_dim,
+ kernel_size,
+ padding_q,
+ padding_kv,
+ stride_q,
+ stride_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ with_cls_token,
+ )
+ self.output = CvtSelfOutput(embed_dim, drop_rate)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.attention.query = prune_linear_layer(self.attention.query, index)
+ self.attention.key = prune_linear_layer(self.attention.key, index)
+ self.attention.value = prune_linear_layer(self.attention.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+ self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(self, hidden_state, height, width):
+ self_output = self.attention(hidden_state, height, width)
+ attention_output = self.output(self_output, hidden_state)
+ return attention_output
+
+
+class CvtIntermediate(nn.Module):
+ def __init__(self, embed_dim, mlp_ratio):
+ super().__init__()
+ self.dense = nn.Linear(embed_dim, int(embed_dim * mlp_ratio))
+ self.activation = nn.GELU()
+
+ def forward(self, hidden_state):
+ hidden_state = self.dense(hidden_state)
+ hidden_state = self.activation(hidden_state)
+ return hidden_state
+
+
+class CvtOutput(nn.Module):
+ def __init__(self, embed_dim, mlp_ratio, drop_rate):
+ super().__init__()
+ self.dense = nn.Linear(int(embed_dim * mlp_ratio), embed_dim)
+ self.dropout = nn.Dropout(drop_rate)
+
+ def forward(self, hidden_state, input_tensor):
+ hidden_state = self.dense(hidden_state)
+ hidden_state = self.dropout(hidden_state)
+ hidden_state = hidden_state + input_tensor
+ return hidden_state
+
+
+class CvtLayer(nn.Module):
+ """
+ CvtLayer composed by attention layers, normalization and multi-layer perceptrons (mlps).
+ """
+
+ def __init__(
+ self,
+ num_heads,
+ embed_dim,
+ kernel_size,
+ padding_q,
+ padding_kv,
+ stride_q,
+ stride_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ drop_rate,
+ mlp_ratio,
+ drop_path_rate,
+ with_cls_token=True,
+ ):
+ super().__init__()
+ self.attention = CvtAttention(
+ num_heads,
+ embed_dim,
+ kernel_size,
+ padding_q,
+ padding_kv,
+ stride_q,
+ stride_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ drop_rate,
+ with_cls_token,
+ )
+
+ self.intermediate = CvtIntermediate(embed_dim, mlp_ratio)
+ self.output = CvtOutput(embed_dim, mlp_ratio, drop_rate)
+ self.drop_path = CvtDropPath(drop_prob=drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+ self.layernorm_before = nn.LayerNorm(embed_dim)
+ self.layernorm_after = nn.LayerNorm(embed_dim)
+
+ def forward(self, hidden_state, height, width):
+ self_attention_output = self.attention(
+ self.layernorm_before(hidden_state), # in Cvt, layernorm is applied before self-attention
+ height,
+ width,
+ )
+ attention_output = self_attention_output
+ attention_output = self.drop_path(attention_output)
+
+ # first residual connection
+ hidden_state = attention_output + hidden_state
+
+ # in Cvt, layernorm is also applied after self-attention
+ layer_output = self.layernorm_after(hidden_state)
+ layer_output = self.intermediate(layer_output)
+
+ # second residual connection is done here
+ layer_output = self.output(layer_output, hidden_state)
+ layer_output = self.drop_path(layer_output)
+ return layer_output
+
+
+class CvtStage(nn.Module):
+ def __init__(self, config, stage):
+ super().__init__()
+ self.config = config
+ self.stage = stage
+ if self.config.cls_token[self.stage]:
+ self.cls_token = nn.Parameter(torch.randn(1, 1, self.config.embed_dim[-1]))
+
+ self.embedding = CvtEmbeddings(
+ patch_size=config.patch_sizes[self.stage],
+ stride=config.patch_stride[self.stage],
+ num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
+ embed_dim=config.embed_dim[self.stage],
+ padding=config.patch_padding[self.stage],
+ dropout_rate=config.drop_rate[self.stage],
+ )
+
+ drop_path_rates = [x.item() for x in torch.linspace(0, config.drop_path_rate[self.stage], config.depth[stage])]
+
+ self.layers = nn.Sequential(
+ *[
+ CvtLayer(
+ num_heads=config.num_heads[self.stage],
+ embed_dim=config.embed_dim[self.stage],
+ kernel_size=config.kernel_qkv[self.stage],
+ padding_q=config.padding_q[self.stage],
+ padding_kv=config.padding_kv[self.stage],
+ stride_kv=config.stride_kv[self.stage],
+ stride_q=config.stride_q[self.stage],
+ qkv_projection_method=config.qkv_projection_method[self.stage],
+ qkv_bias=config.qkv_bias[self.stage],
+ attention_drop_rate=config.attention_drop_rate[self.stage],
+ drop_rate=config.drop_rate[self.stage],
+ drop_path_rate=drop_path_rates[self.stage],
+ mlp_ratio=config.mlp_ratio[self.stage],
+ with_cls_token=config.cls_token[self.stage],
+ )
+ for _ in range(config.depth[self.stage])
+ ]
+ )
+
+ def forward(self, hidden_state):
+ cls_token = None
+ hidden_state = self.embedding(hidden_state)
+ batch_size, num_channels, height, width = hidden_state.shape
+ # rearrange b c h w -> b (h w) c"
+ hidden_state = hidden_state.view(batch_size, num_channels, height * width).permute(0, 2, 1)
+ if self.config.cls_token[self.stage]:
+ cls_token = self.cls_token.expand(batch_size, -1, -1)
+ hidden_state = torch.cat((cls_token, hidden_state), dim=1)
+
+ for layer in self.layers:
+ layer_outputs = layer(hidden_state, height, width)
+ hidden_state = layer_outputs
+
+ if self.config.cls_token[self.stage]:
+ cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
+ hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)
+ return hidden_state, cls_token
+
+
+class CvtEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.stages = nn.ModuleList([])
+ for stage_idx in range(len(config.depth)):
+ self.stages.append(CvtStage(config, stage_idx))
+
+ def forward(self, pixel_values, output_hidden_states=False, return_dict=True):
+ all_hidden_states = () if output_hidden_states else None
+ hidden_state = pixel_values
+
+ cls_token = None
+ for _, (stage_module) in enumerate(self.stages):
+ hidden_state, cls_token = stage_module(hidden_state)
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_state,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)
+
+ return BaseModelOutputWithCLSToken(
+ last_hidden_state=hidden_state,
+ cls_token_value=cls_token,
+ hidden_states=all_hidden_states,
+ )
+
+
+class CvtPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = CvtConfig
+ base_model_prefix = "cvt"
+ main_input_name = "pixel_values"
+ _no_split_modules = ["CvtLayer"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
+ module.weight.data = nn.init.trunc_normal_(module.weight.data, mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, CvtStage):
+ if self.config.cls_token[module.stage]:
+ module.cls_token.data = nn.init.trunc_normal_(
+ torch.zeros(1, 1, self.config.embed_dim[-1]), mean=0.0, std=self.config.initializer_range
+ )
+
+
+CVT_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CVT_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
+ for details.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.",
+ CVT_START_DOCSTRING,
+)
+class CvtModel(CvtPreTrainedModel):
+ def __init__(self, config, add_pooling_layer=True):
+ super().__init__(config)
+ self.config = config
+ self.encoder = CvtEncoder(config)
+ self.post_init()
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithCLSToken,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithCLSToken]:
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ encoder_outputs = self.encoder(
+ pixel_values,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = encoder_outputs[0]
+
+ if not return_dict:
+ return (sequence_output,) + encoder_outputs[1:]
+
+ return BaseModelOutputWithCLSToken(
+ last_hidden_state=sequence_output,
+ cls_token_value=encoder_outputs.cls_token_value,
+ hidden_states=encoder_outputs.hidden_states,
+ )
+
+
+@add_start_docstrings(
+ """
+ Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
+ the [CLS] token) e.g. for ImageNet.
+ """,
+ CVT_START_DOCSTRING,
+)
+class CvtForImageClassification(CvtPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.cvt = CvtModel(config, add_pooling_layer=False)
+ self.layernorm = nn.LayerNorm(config.embed_dim[-1])
+ # Classifier head
+ self.classifier = (
+ nn.Linear(config.embed_dim[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
+ )
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=ImageClassifierOutputWithNoAttention,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ outputs = self.cvt(
+ pixel_values,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+ cls_token = outputs[1]
+ if self.config.cls_token[-1]:
+ sequence_output = self.layernorm(cls_token)
+ else:
+ batch_size, num_channels, height, width = sequence_output.shape
+ # rearrange "b c h w -> b (h w) c"
+ sequence_output = sequence_output.view(batch_size, num_channels, height * width).permute(0, 2, 1)
+ sequence_output = self.layernorm(sequence_output)
+
+ sequence_output_mean = sequence_output.mean(dim=1)
+ logits = self.classifier(sequence_output_mean)
+
+ loss = None
+ if labels is not None:
+ if self.config.problem_type is None:
+ if self.config.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.config.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
+
+
+__all__ = ["CvtForImageClassification", "CvtModel", "CvtPreTrainedModel"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/cvt/modeling_tf_cvt.py b/.venv/lib/python3.11/site-packages/transformers/models/cvt/modeling_tf_cvt.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa9a4d9a3a4450c51610a47632dbca6c9faf9a79
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/cvt/modeling_tf_cvt.py
@@ -0,0 +1,1096 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 Cvt model."""
+
+from __future__ import annotations
+
+import collections.abc
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import tensorflow as tf
+
+from ...modeling_tf_outputs import TFImageClassifierOutputWithNoAttention
+from ...modeling_tf_utils import (
+ TFModelInputType,
+ TFPreTrainedModel,
+ TFSequenceClassificationLoss,
+ get_initializer,
+ keras,
+ keras_serializable,
+ unpack_inputs,
+)
+from ...tf_utils import shape_list, stable_softmax
+from ...utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_cvt import CvtConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "CvtConfig"
+
+
+@dataclass
+class TFBaseModelOutputWithCLSToken(ModelOutput):
+ """
+ Base class for model's outputs.
+
+ Args:
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`):
+ Classification token at the output of the last layer of the model.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+ `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
+ the initial embedding outputs.
+ """
+
+ last_hidden_state: tf.Tensor = None
+ cls_token_value: tf.Tensor = None
+ hidden_states: Tuple[tf.Tensor, ...] | None = None
+
+
+class TFCvtDropPath(keras.layers.Layer):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ References:
+ (1) github.com:rwightman/pytorch-image-models
+ """
+
+ def __init__(self, drop_prob: float, **kwargs):
+ super().__init__(**kwargs)
+ self.drop_prob = drop_prob
+
+ def call(self, x: tf.Tensor, training=None):
+ if self.drop_prob == 0.0 or not training:
+ return x
+ keep_prob = 1 - self.drop_prob
+ shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
+ random_tensor = keep_prob + tf.random.uniform(shape, 0, 1, dtype=self.compute_dtype)
+ random_tensor = tf.floor(random_tensor)
+ return (x / keep_prob) * random_tensor
+
+
+class TFCvtEmbeddings(keras.layers.Layer):
+ """Construct the Convolutional Token Embeddings."""
+
+ def __init__(
+ self,
+ config: CvtConfig,
+ patch_size: int,
+ num_channels: int,
+ embed_dim: int,
+ stride: int,
+ padding: int,
+ dropout_rate: float,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.convolution_embeddings = TFCvtConvEmbeddings(
+ config,
+ patch_size=patch_size,
+ num_channels=num_channels,
+ embed_dim=embed_dim,
+ stride=stride,
+ padding=padding,
+ name="convolution_embeddings",
+ )
+ self.dropout = keras.layers.Dropout(dropout_rate)
+
+ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_state = self.convolution_embeddings(pixel_values)
+ hidden_state = self.dropout(hidden_state, training=training)
+ return hidden_state
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "convolution_embeddings", None) is not None:
+ with tf.name_scope(self.convolution_embeddings.name):
+ self.convolution_embeddings.build(None)
+
+
+class TFCvtConvEmbeddings(keras.layers.Layer):
+ """Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts."""
+
+ def __init__(
+ self,
+ config: CvtConfig,
+ patch_size: int,
+ num_channels: int,
+ embed_dim: int,
+ stride: int,
+ padding: int,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.padding = keras.layers.ZeroPadding2D(padding=padding)
+ self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ self.projection = keras.layers.Conv2D(
+ filters=embed_dim,
+ kernel_size=patch_size,
+ strides=stride,
+ padding="valid",
+ data_format="channels_last",
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="projection",
+ )
+ # Using the same default epsilon as PyTorch
+ self.normalization = keras.layers.LayerNormalization(epsilon=1e-5, name="normalization")
+ self.num_channels = num_channels
+ self.embed_dim = embed_dim
+
+ def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
+ if isinstance(pixel_values, dict):
+ pixel_values = pixel_values["pixel_values"]
+
+ pixel_values = self.projection(self.padding(pixel_values))
+
+ # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels"
+ batch_size, height, width, num_channels = shape_list(pixel_values)
+ hidden_size = height * width
+ pixel_values = tf.reshape(pixel_values, shape=(batch_size, hidden_size, num_channels))
+ pixel_values = self.normalization(pixel_values)
+
+ # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
+ pixel_values = tf.reshape(pixel_values, shape=(batch_size, height, width, num_channels))
+ return pixel_values
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "projection", None) is not None:
+ with tf.name_scope(self.projection.name):
+ self.projection.build([None, None, None, self.num_channels])
+ if getattr(self, "normalization", None) is not None:
+ with tf.name_scope(self.normalization.name):
+ self.normalization.build([None, None, self.embed_dim])
+
+
+class TFCvtSelfAttentionConvProjection(keras.layers.Layer):
+ """Convolutional projection layer."""
+
+ def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs):
+ super().__init__(**kwargs)
+ self.padding = keras.layers.ZeroPadding2D(padding=padding)
+ self.convolution = keras.layers.Conv2D(
+ filters=embed_dim,
+ kernel_size=kernel_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ padding="valid",
+ strides=stride,
+ use_bias=False,
+ name="convolution",
+ groups=embed_dim,
+ )
+ # Using the same default epsilon as PyTorch, TF uses (1 - pytorch momentum)
+ self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
+ self.embed_dim = embed_dim
+
+ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_state = self.convolution(self.padding(hidden_state))
+ hidden_state = self.normalization(hidden_state, training=training)
+ return hidden_state
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "convolution", None) is not None:
+ with tf.name_scope(self.convolution.name):
+ self.convolution.build([None, None, None, self.embed_dim])
+ if getattr(self, "normalization", None) is not None:
+ with tf.name_scope(self.normalization.name):
+ self.normalization.build([None, None, None, self.embed_dim])
+
+
+class TFCvtSelfAttentionLinearProjection(keras.layers.Layer):
+ """Linear projection layer used to flatten tokens into 1D."""
+
+ def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
+ # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels"
+ batch_size, height, width, num_channels = shape_list(hidden_state)
+ hidden_size = height * width
+ hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))
+ return hidden_state
+
+
+class TFCvtSelfAttentionProjection(keras.layers.Layer):
+ """Convolutional Projection for Attention."""
+
+ def __init__(
+ self,
+ config: CvtConfig,
+ embed_dim: int,
+ kernel_size: int,
+ stride: int,
+ padding: int,
+ projection_method: str = "dw_bn",
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ if projection_method == "dw_bn":
+ self.convolution_projection = TFCvtSelfAttentionConvProjection(
+ config, embed_dim, kernel_size, stride, padding, name="convolution_projection"
+ )
+ self.linear_projection = TFCvtSelfAttentionLinearProjection()
+
+ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_state = self.convolution_projection(hidden_state, training=training)
+ hidden_state = self.linear_projection(hidden_state)
+ return hidden_state
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "convolution_projection", None) is not None:
+ with tf.name_scope(self.convolution_projection.name):
+ self.convolution_projection.build(None)
+
+
+class TFCvtSelfAttention(keras.layers.Layer):
+ """
+ Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection), is applied for
+ query, key, and value embeddings.
+ """
+
+ def __init__(
+ self,
+ config: CvtConfig,
+ num_heads: int,
+ embed_dim: int,
+ kernel_size: int,
+ stride_q: int,
+ stride_kv: int,
+ padding_q: int,
+ padding_kv: int,
+ qkv_projection_method: str,
+ qkv_bias: bool,
+ attention_drop_rate: float,
+ with_cls_token: bool = True,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.scale = embed_dim**-0.5
+ self.with_cls_token = with_cls_token
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+
+ self.convolution_projection_query = TFCvtSelfAttentionProjection(
+ config,
+ embed_dim,
+ kernel_size,
+ stride_q,
+ padding_q,
+ projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method,
+ name="convolution_projection_query",
+ )
+ self.convolution_projection_key = TFCvtSelfAttentionProjection(
+ config,
+ embed_dim,
+ kernel_size,
+ stride_kv,
+ padding_kv,
+ projection_method=qkv_projection_method,
+ name="convolution_projection_key",
+ )
+ self.convolution_projection_value = TFCvtSelfAttentionProjection(
+ config,
+ embed_dim,
+ kernel_size,
+ stride_kv,
+ padding_kv,
+ projection_method=qkv_projection_method,
+ name="convolution_projection_value",
+ )
+
+ self.projection_query = keras.layers.Dense(
+ units=embed_dim,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=qkv_bias,
+ bias_initializer="zeros",
+ name="projection_query",
+ )
+ self.projection_key = keras.layers.Dense(
+ units=embed_dim,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=qkv_bias,
+ bias_initializer="zeros",
+ name="projection_key",
+ )
+ self.projection_value = keras.layers.Dense(
+ units=embed_dim,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=qkv_bias,
+ bias_initializer="zeros",
+ name="projection_value",
+ )
+ self.dropout = keras.layers.Dropout(attention_drop_rate)
+
+ def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor:
+ batch_size, hidden_size, _ = shape_list(hidden_state)
+ head_dim = self.embed_dim // self.num_heads
+ hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, self.num_heads, head_dim))
+ hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3))
+ return hidden_state
+
+ def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
+ if self.with_cls_token:
+ cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)
+
+ # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
+ batch_size, hidden_size, num_channels = shape_list(hidden_state)
+ hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
+
+ key = self.convolution_projection_key(hidden_state, training=training)
+ query = self.convolution_projection_query(hidden_state, training=training)
+ value = self.convolution_projection_value(hidden_state, training=training)
+
+ if self.with_cls_token:
+ query = tf.concat((cls_token, query), axis=1)
+ key = tf.concat((cls_token, key), axis=1)
+ value = tf.concat((cls_token, value), axis=1)
+
+ head_dim = self.embed_dim // self.num_heads
+
+ query = self.rearrange_for_multi_head_attention(self.projection_query(query))
+ key = self.rearrange_for_multi_head_attention(self.projection_key(key))
+ value = self.rearrange_for_multi_head_attention(self.projection_value(value))
+
+ attention_score = tf.matmul(query, key, transpose_b=True) * self.scale
+ attention_probs = stable_softmax(logits=attention_score, axis=-1)
+ attention_probs = self.dropout(attention_probs, training=training)
+
+ context = tf.matmul(attention_probs, value)
+ # "batch_size, num_heads, hidden_size, head_dim -> batch_size, hidden_size, (num_heads*head_dim)"
+ _, _, hidden_size, _ = shape_list(context)
+ context = tf.transpose(context, perm=(0, 2, 1, 3))
+ context = tf.reshape(context, (batch_size, hidden_size, self.num_heads * head_dim))
+ return context
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "convolution_projection_query", None) is not None:
+ with tf.name_scope(self.convolution_projection_query.name):
+ self.convolution_projection_query.build(None)
+ if getattr(self, "convolution_projection_key", None) is not None:
+ with tf.name_scope(self.convolution_projection_key.name):
+ self.convolution_projection_key.build(None)
+ if getattr(self, "convolution_projection_value", None) is not None:
+ with tf.name_scope(self.convolution_projection_value.name):
+ self.convolution_projection_value.build(None)
+ if getattr(self, "projection_query", None) is not None:
+ with tf.name_scope(self.projection_query.name):
+ self.projection_query.build([None, None, self.embed_dim])
+ if getattr(self, "projection_key", None) is not None:
+ with tf.name_scope(self.projection_key.name):
+ self.projection_key.build([None, None, self.embed_dim])
+ if getattr(self, "projection_value", None) is not None:
+ with tf.name_scope(self.projection_value.name):
+ self.projection_value.build([None, None, self.embed_dim])
+
+
+class TFCvtSelfOutput(keras.layers.Layer):
+ """Output of the Attention layer ."""
+
+ def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs):
+ super().__init__(**kwargs)
+ self.dense = keras.layers.Dense(
+ units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+ self.dropout = keras.layers.Dropout(drop_rate)
+ self.embed_dim = embed_dim
+
+ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_state = self.dense(inputs=hidden_state)
+ hidden_state = self.dropout(inputs=hidden_state, training=training)
+ return hidden_state
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.embed_dim])
+
+
+class TFCvtAttention(keras.layers.Layer):
+ """Attention layer. First chunk of the convolutional transformer block."""
+
+ def __init__(
+ self,
+ config: CvtConfig,
+ num_heads: int,
+ embed_dim: int,
+ kernel_size: int,
+ stride_q: int,
+ stride_kv: int,
+ padding_q: int,
+ padding_kv: int,
+ qkv_projection_method: str,
+ qkv_bias: bool,
+ attention_drop_rate: float,
+ drop_rate: float,
+ with_cls_token: bool = True,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.attention = TFCvtSelfAttention(
+ config,
+ num_heads,
+ embed_dim,
+ kernel_size,
+ stride_q,
+ stride_kv,
+ padding_q,
+ padding_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ with_cls_token,
+ name="attention",
+ )
+ self.dense_output = TFCvtSelfOutput(config, embed_dim, drop_rate, name="output")
+
+ def prune_heads(self, heads):
+ raise NotImplementedError
+
+ def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False):
+ self_output = self.attention(hidden_state, height, width, training=training)
+ attention_output = self.dense_output(self_output, training=training)
+ return attention_output
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "attention", None) is not None:
+ with tf.name_scope(self.attention.name):
+ self.attention.build(None)
+ if getattr(self, "dense_output", None) is not None:
+ with tf.name_scope(self.dense_output.name):
+ self.dense_output.build(None)
+
+
+class TFCvtIntermediate(keras.layers.Layer):
+ """Intermediate dense layer. Second chunk of the convolutional transformer block."""
+
+ def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs):
+ super().__init__(**kwargs)
+ self.dense = keras.layers.Dense(
+ units=int(embed_dim * mlp_ratio),
+ kernel_initializer=get_initializer(config.initializer_range),
+ activation="gelu",
+ name="dense",
+ )
+ self.embed_dim = embed_dim
+
+ def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
+ hidden_state = self.dense(hidden_state)
+ return hidden_state
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.embed_dim])
+
+
+class TFCvtOutput(keras.layers.Layer):
+ """
+ Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection.
+ """
+
+ def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, drop_rate: int, **kwargs):
+ super().__init__(**kwargs)
+ self.dense = keras.layers.Dense(
+ units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+ self.dropout = keras.layers.Dropout(drop_rate)
+ self.embed_dim = embed_dim
+ self.mlp_ratio = mlp_ratio
+
+ def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_state = self.dense(inputs=hidden_state)
+ hidden_state = self.dropout(inputs=hidden_state, training=training)
+ hidden_state = hidden_state + input_tensor
+ return hidden_state
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)])
+
+
+class TFCvtLayer(keras.layers.Layer):
+ """
+ Convolutional Transformer Block composed by attention layers, normalization and multi-layer perceptrons (mlps). It
+ consists of 3 chunks : an attention layer, an intermediate dense layer and an output layer. This corresponds to the
+ `Block` class in the original implementation.
+ """
+
+ def __init__(
+ self,
+ config: CvtConfig,
+ num_heads: int,
+ embed_dim: int,
+ kernel_size: int,
+ stride_q: int,
+ stride_kv: int,
+ padding_q: int,
+ padding_kv: int,
+ qkv_projection_method: str,
+ qkv_bias: bool,
+ attention_drop_rate: float,
+ drop_rate: float,
+ mlp_ratio: float,
+ drop_path_rate: float,
+ with_cls_token: bool = True,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.attention = TFCvtAttention(
+ config,
+ num_heads,
+ embed_dim,
+ kernel_size,
+ stride_q,
+ stride_kv,
+ padding_q,
+ padding_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ drop_rate,
+ with_cls_token,
+ name="attention",
+ )
+ self.intermediate = TFCvtIntermediate(config, embed_dim, mlp_ratio, name="intermediate")
+ self.dense_output = TFCvtOutput(config, embed_dim, mlp_ratio, drop_rate, name="output")
+ # Using `layers.Activation` instead of `tf.identity` to better control `training` behaviour.
+ self.drop_path = (
+ TFCvtDropPath(drop_path_rate, name="drop_path")
+ if drop_path_rate > 0.0
+ else keras.layers.Activation("linear", name="drop_path")
+ )
+ # Using the same default epsilon as PyTorch
+ self.layernorm_before = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before")
+ self.layernorm_after = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after")
+ self.embed_dim = embed_dim
+
+ def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
+ # in Cvt, layernorm is applied before self-attention
+ attention_output = self.attention(self.layernorm_before(hidden_state), height, width, training=training)
+ attention_output = self.drop_path(attention_output, training=training)
+
+ # first residual connection
+ hidden_state = attention_output + hidden_state
+
+ # in Cvt, layernorm is also applied after self-attention
+ layer_output = self.layernorm_after(hidden_state)
+ layer_output = self.intermediate(layer_output)
+
+ # second residual connection is done here
+ layer_output = self.dense_output(layer_output, hidden_state)
+ layer_output = self.drop_path(layer_output, training=training)
+ return layer_output
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "attention", None) is not None:
+ with tf.name_scope(self.attention.name):
+ self.attention.build(None)
+ if getattr(self, "intermediate", None) is not None:
+ with tf.name_scope(self.intermediate.name):
+ self.intermediate.build(None)
+ if getattr(self, "dense_output", None) is not None:
+ with tf.name_scope(self.dense_output.name):
+ self.dense_output.build(None)
+ if getattr(self, "drop_path", None) is not None:
+ with tf.name_scope(self.drop_path.name):
+ self.drop_path.build(None)
+ if getattr(self, "layernorm_before", None) is not None:
+ with tf.name_scope(self.layernorm_before.name):
+ self.layernorm_before.build([None, None, self.embed_dim])
+ if getattr(self, "layernorm_after", None) is not None:
+ with tf.name_scope(self.layernorm_after.name):
+ self.layernorm_after.build([None, None, self.embed_dim])
+
+
+class TFCvtStage(keras.layers.Layer):
+ """
+ Cvt stage (encoder block). Each stage has 2 parts :
+ - (1) A Convolutional Token Embedding layer
+ - (2) A Convolutional Transformer Block (layer).
+ The classification token is added only in the last stage.
+
+ Args:
+ config ([`CvtConfig`]): Model configuration class.
+ stage (`int`): Stage number.
+ """
+
+ def __init__(self, config: CvtConfig, stage: int, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.stage = stage
+ if self.config.cls_token[self.stage]:
+ self.cls_token = self.add_weight(
+ shape=(1, 1, self.config.embed_dim[-1]),
+ initializer=get_initializer(self.config.initializer_range),
+ trainable=True,
+ name="cvt.encoder.stages.2.cls_token",
+ )
+
+ self.embedding = TFCvtEmbeddings(
+ self.config,
+ patch_size=config.patch_sizes[self.stage],
+ num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
+ stride=config.patch_stride[self.stage],
+ embed_dim=config.embed_dim[self.stage],
+ padding=config.patch_padding[self.stage],
+ dropout_rate=config.drop_rate[self.stage],
+ name="embedding",
+ )
+
+ drop_path_rates = tf.linspace(0.0, config.drop_path_rate[self.stage], config.depth[stage])
+ drop_path_rates = [x.numpy().item() for x in drop_path_rates]
+ self.layers = [
+ TFCvtLayer(
+ config,
+ num_heads=config.num_heads[self.stage],
+ embed_dim=config.embed_dim[self.stage],
+ kernel_size=config.kernel_qkv[self.stage],
+ stride_q=config.stride_q[self.stage],
+ stride_kv=config.stride_kv[self.stage],
+ padding_q=config.padding_q[self.stage],
+ padding_kv=config.padding_kv[self.stage],
+ qkv_projection_method=config.qkv_projection_method[self.stage],
+ qkv_bias=config.qkv_bias[self.stage],
+ attention_drop_rate=config.attention_drop_rate[self.stage],
+ drop_rate=config.drop_rate[self.stage],
+ mlp_ratio=config.mlp_ratio[self.stage],
+ drop_path_rate=drop_path_rates[self.stage],
+ with_cls_token=config.cls_token[self.stage],
+ name=f"layers.{j}",
+ )
+ for j in range(config.depth[self.stage])
+ ]
+
+ def call(self, hidden_state: tf.Tensor, training: bool = False):
+ cls_token = None
+ hidden_state = self.embedding(hidden_state, training)
+
+ # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels"
+ batch_size, height, width, num_channels = shape_list(hidden_state)
+ hidden_size = height * width
+ hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))
+
+ if self.config.cls_token[self.stage]:
+ cls_token = tf.repeat(self.cls_token, repeats=batch_size, axis=0)
+ hidden_state = tf.concat((cls_token, hidden_state), axis=1)
+
+ for layer in self.layers:
+ layer_outputs = layer(hidden_state, height, width, training=training)
+ hidden_state = layer_outputs
+
+ if self.config.cls_token[self.stage]:
+ cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)
+
+ # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
+ hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
+ return hidden_state, cls_token
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "embedding", None) is not None:
+ with tf.name_scope(self.embedding.name):
+ self.embedding.build(None)
+ if getattr(self, "layers", None) is not None:
+ for layer in self.layers:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+
+class TFCvtEncoder(keras.layers.Layer):
+ """
+ Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers
+ (depth) being 1, 2 and 10.
+
+ Args:
+ config ([`CvtConfig`]): Model configuration class.
+ """
+
+ config_class = CvtConfig
+
+ def __init__(self, config: CvtConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.stages = [
+ TFCvtStage(config, stage_idx, name=f"stages.{stage_idx}") for stage_idx in range(len(config.depth))
+ ]
+
+ def call(
+ self,
+ pixel_values: TFModelInputType,
+ output_hidden_states: Optional[bool] = False,
+ return_dict: Optional[bool] = True,
+ training: Optional[bool] = False,
+ ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
+ all_hidden_states = () if output_hidden_states else None
+ hidden_state = pixel_values
+ # When running on CPU, `keras.layers.Conv2D` doesn't support (batch_size, num_channels, height, width)
+ # as input format. So change the input format to (batch_size, height, width, num_channels).
+ hidden_state = tf.transpose(hidden_state, perm=(0, 2, 3, 1))
+
+ cls_token = None
+ for _, (stage_module) in enumerate(self.stages):
+ hidden_state, cls_token = stage_module(hidden_state, training=training)
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_state,)
+
+ # Change back to (batch_size, num_channels, height, width) format to have uniformity in the modules
+ hidden_state = tf.transpose(hidden_state, perm=(0, 3, 1, 2))
+ if output_hidden_states:
+ all_hidden_states = tuple([tf.transpose(hs, perm=(0, 3, 1, 2)) for hs in all_hidden_states])
+
+ if not return_dict:
+ return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)
+
+ return TFBaseModelOutputWithCLSToken(
+ last_hidden_state=hidden_state,
+ cls_token_value=cls_token,
+ hidden_states=all_hidden_states,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "stages", None) is not None:
+ for layer in self.stages:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+
+@keras_serializable
+class TFCvtMainLayer(keras.layers.Layer):
+ """Construct the Cvt model."""
+
+ config_class = CvtConfig
+
+ def __init__(self, config: CvtConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.encoder = TFCvtEncoder(config, name="encoder")
+
+ @unpack_inputs
+ def call(
+ self,
+ pixel_values: TFModelInputType | None = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ encoder_outputs = self.encoder(
+ pixel_values,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ sequence_output = encoder_outputs[0]
+
+ if not return_dict:
+ return (sequence_output,) + encoder_outputs[1:]
+
+ return TFBaseModelOutputWithCLSToken(
+ last_hidden_state=sequence_output,
+ cls_token_value=encoder_outputs.cls_token_value,
+ hidden_states=encoder_outputs.hidden_states,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "encoder", None) is not None:
+ with tf.name_scope(self.encoder.name):
+ self.encoder.build(None)
+
+
+class TFCvtPreTrainedModel(TFPreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = CvtConfig
+ base_model_prefix = "cvt"
+ main_input_name = "pixel_values"
+
+
+TFCVT_START_DOCSTRING = r"""
+
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+ as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+ behavior.
+
+
+
+ TF 2.0 models accepts two formats as inputs:
+
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
+
+ This second option is useful when using [`keras.Model.fit`] method which currently requires having all the
+ tensors in the first argument of the model call function: `model(inputs)`.
+
+
+
+ Args:
+ config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TFCVT_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
+ for details.
+
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+ used instead.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+ eager mode, in graph mode the value will always be set to True.
+ training (`bool`, *optional*, defaults to `False``):
+ Whether or not to use the model in training mode (some modules like dropout modules have different
+ behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+ "The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.",
+ TFCVT_START_DOCSTRING,
+)
+class TFCvtModel(TFCvtPreTrainedModel):
+ def __init__(self, config: CvtConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.cvt = TFCvtMainLayer(config, name="cvt")
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TFBaseModelOutputWithCLSToken, config_class=_CONFIG_FOR_DOC)
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, TFCvtModel
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
+ >>> model = TFCvtModel.from_pretrained("microsoft/cvt-13")
+
+ >>> inputs = image_processor(images=image, return_tensors="tf")
+ >>> outputs = model(**inputs)
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ outputs = self.cvt(
+ pixel_values=pixel_values,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ if not return_dict:
+ return (outputs[0],) + outputs[1:]
+
+ return TFBaseModelOutputWithCLSToken(
+ last_hidden_state=outputs.last_hidden_state,
+ cls_token_value=outputs.cls_token_value,
+ hidden_states=outputs.hidden_states,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "cvt", None) is not None:
+ with tf.name_scope(self.cvt.name):
+ self.cvt.build(None)
+
+
+@add_start_docstrings(
+ """
+ Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
+ the [CLS] token) e.g. for ImageNet.
+ """,
+ TFCVT_START_DOCSTRING,
+)
+class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassificationLoss):
+ def __init__(self, config: CvtConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.num_labels = config.num_labels
+ self.cvt = TFCvtMainLayer(config, name="cvt")
+ # Using same default epsilon as in the original implementation.
+ self.layernorm = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm")
+
+ # Classifier head
+ self.classifier = keras.layers.Dense(
+ units=config.num_labels,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=True,
+ bias_initializer="zeros",
+ name="classifier",
+ )
+ self.config = config
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TFImageClassifierOutputWithNoAttention, config_class=_CONFIG_FOR_DOC)
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ labels: tf.Tensor | None = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFImageClassifierOutputWithNoAttention, Tuple[tf.Tensor]]:
+ r"""
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, TFCvtForImageClassification
+ >>> import tensorflow as tf
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
+ >>> model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13")
+
+ >>> inputs = image_processor(images=image, return_tensors="tf")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ >>> # model predicts one of the 1000 ImageNet classes
+ >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
+ >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
+ ```"""
+
+ outputs = self.cvt(
+ pixel_values,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ sequence_output = outputs[0]
+ cls_token = outputs[1]
+ if self.config.cls_token[-1]:
+ sequence_output = self.layernorm(cls_token)
+ else:
+ # rearrange "batch_size, num_channels, height, width -> batch_size, (height*width), num_channels"
+ batch_size, num_channels, height, width = shape_list(sequence_output)
+ sequence_output = tf.reshape(sequence_output, shape=(batch_size, num_channels, height * width))
+ sequence_output = tf.transpose(sequence_output, perm=(0, 2, 1))
+ sequence_output = self.layernorm(sequence_output)
+
+ sequence_output_mean = tf.reduce_mean(sequence_output, axis=1)
+ logits = self.classifier(sequence_output_mean)
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "cvt", None) is not None:
+ with tf.name_scope(self.cvt.name):
+ self.cvt.build(None)
+ if getattr(self, "layernorm", None) is not None:
+ with tf.name_scope(self.layernorm.name):
+ self.layernorm.build([None, None, self.config.embed_dim[-1]])
+ if getattr(self, "classifier", None) is not None:
+ if hasattr(self.classifier, "name"):
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, self.config.embed_dim[-1]])
+
+
+__all__ = ["TFCvtForImageClassification", "TFCvtModel", "TFCvtPreTrainedModel"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__init__.py b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c786feb9213fdd31640c0fdeaead5164026ad37a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_encoder_decoder import *
+ from .modeling_encoder_decoder import *
+ from .modeling_flax_encoder_decoder import *
+ from .modeling_tf_encoder_decoder import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7edea98fa597200916c1267cf063a762430e4d4b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/configuration_encoder_decoder.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/configuration_encoder_decoder.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..82bac181fbb23cf46d047a435b58dc26071b80eb
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/configuration_encoder_decoder.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_encoder_decoder.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_encoder_decoder.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef857cf1e0361aa3877b6f27b52696a7158eb56a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_encoder_decoder.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_flax_encoder_decoder.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_flax_encoder_decoder.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da8096a26260a2575c3f5cbca3b85109dc6b7c58
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_flax_encoder_decoder.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_tf_encoder_decoder.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_tf_encoder_decoder.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..304a29cb29b6b367ee78b2a5c43f5504b11f1b45
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_tf_encoder_decoder.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/configuration_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b5c62363f6ad9c3ce72c61bd597dac3f9078c0a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/configuration_encoder_decoder.py
@@ -0,0 +1,111 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class EncoderDecoderConfig(PretrainedConfig):
+ r"""
+ [`EncoderDecoderConfig`] is the configuration class to store the configuration of a [`EncoderDecoderModel`]. It is
+ used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder
+ configs.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ kwargs (*optional*):
+ Dictionary of keyword arguments. Notably:
+
+ - **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
+ the encoder config.
+ - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
+ the decoder config.
+
+ Examples:
+
+ ```python
+ >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
+
+ >>> # Initializing a BERT google-bert/bert-base-uncased style configuration
+ >>> config_encoder = BertConfig()
+ >>> config_decoder = BertConfig()
+
+ >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+
+ >>> # Initializing a Bert2Bert model (with random weights) from the google-bert/bert-base-uncased style configurations
+ >>> model = EncoderDecoderModel(config=config)
+
+ >>> # Accessing the model configuration
+ >>> config_encoder = model.config.encoder
+ >>> config_decoder = model.config.decoder
+ >>> # set decoder config to causal lm
+ >>> config_decoder.is_decoder = True
+ >>> config_decoder.add_cross_attention = True
+
+ >>> # Saving the model, including its configuration
+ >>> model.save_pretrained("my-model")
+
+ >>> # loading model and config from pretrained folder
+ >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained("my-model")
+ >>> model = EncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
+ ```"""
+
+ model_type = "encoder-decoder"
+ sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig}
+ is_composition = True
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ if "encoder" not in kwargs or "decoder" not in kwargs:
+ raise ValueError(
+ f"A configuraton of type {self.model_type} cannot be instantiated because "
+ f"both `encoder` and `decoder` sub-configurations were not passed, only {kwargs}"
+ )
+ encoder_config = kwargs.pop("encoder")
+ encoder_model_type = encoder_config.pop("model_type")
+ decoder_config = kwargs.pop("decoder")
+ decoder_model_type = decoder_config.pop("model_type")
+
+ self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config)
+ self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config)
+ self.is_encoder_decoder = True
+
+ @classmethod
+ def from_encoder_decoder_configs(
+ cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
+ ) -> PretrainedConfig:
+ r"""
+ Instantiate a [`EncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model configuration and
+ decoder model configuration.
+
+ Returns:
+ [`EncoderDecoderConfig`]: An instance of a configuration object
+ """
+ logger.info("Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
+ decoder_config.is_decoder = True
+ decoder_config.add_cross_attention = True
+
+ return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
+
+
+__all__ = ["EncoderDecoderConfig"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/modeling_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ab4b7f2ced1674b0d51326ae2b1e8c6991aac93
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/modeling_encoder_decoder.py
@@ -0,0 +1,687 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Classes to support Encoder-Decoder architectures"""
+
+import gc
+import inspect
+import os
+import tempfile
+import warnings
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...configuration_utils import PretrainedConfig
+from ...generation import GenerationMixin
+from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ..auto.configuration_auto import AutoConfig
+from ..auto.modeling_auto import AutoModel, AutoModelForCausalLM
+from .configuration_encoder_decoder import EncoderDecoderConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "EncoderDecoderConfig"
+
+DEPRECATION_WARNING = (
+ "Version v4.12.0 introduces a better way to train encoder-decoder models by computing the loss inside the"
+ " encoder-decoder framework rather than in the decoder itself. You may observe training discrepancies if"
+ " fine-tuning a model trained with versions anterior to 4.12.0. The decoder_input_ids are now created based on the"
+ " labels, no need to pass them yourself anymore."
+)
+
+ENCODER_DECODER_START_DOCSTRING = r"""
+ This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
+ encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
+ [`~AutoModel.from_pretrained`] function and the decoder is loaded via [`~AutoModelForCausalLM.from_pretrained`]
+ function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream
+ generative task, like summarization.
+
+ The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
+ tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
+ Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
+ Zhou, Wei Li, Peter J. Liu.
+
+ After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models
+ (see the examples for more information).
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`EncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+ENCODER_DECODER_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Indices of decoder input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+
+ For training, `decoder_input_ids` are automatically created by the model by shifting the `labels` to the
+ right, replacing -100 by the `pad_token_id` and prepending them with the `decoder_start_token_id`.
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+ be used by default.
+ encoder_outputs (`tuple(torch.FloatTensor)`, *optional*):
+ This tuple must consist of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+ `last_hidden_state` (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) is a tensor
+ of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the
+ decoder.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `decoder_input_ids` indices
+ into associated vectors than the model's internal embedding lookup matrix.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss for the decoder. Indices should be in `[-100, 0,
+ ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ If set to `True`, the model will return a [`~utils.Seq2SeqLMOutput`] instead of a plain tuple.
+ kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:
+
+ - Without a prefix which will be input as `**encoder_kwargs` for the encoder forward function.
+ - With a *decoder_* prefix which will be input as `**decoder_kwargs` for the decoder forward function.
+"""
+
+
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+ """
+ Shift input ids one token to the right.
+ """
+ shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+ shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+ if decoder_start_token_id is None:
+ raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
+ shifted_input_ids[:, 0] = decoder_start_token_id
+
+ if pad_token_id is None:
+ raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
+ # replace possible -100 values in labels by `pad_token_id`
+ shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+ return shifted_input_ids
+
+
+@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)
+class EncoderDecoderModel(PreTrainedModel, GenerationMixin):
+ r"""
+ [`EncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with one
+ of the base model classes of the library as encoder and another one as decoder when created with the
+ :meth*~transformers.AutoModel.from_pretrained* class method for the encoder and
+ :meth*~transformers.AutoModelForCausalLM.from_pretrained* class method for the decoder.
+ """
+
+ config_class = EncoderDecoderConfig
+ base_model_prefix = "encoder_decoder"
+ main_input_name = "input_ids"
+ supports_gradient_checkpointing = True
+ _supports_param_buffer_assignment = False
+ _supports_flash_attn_2 = True
+ _supports_sdpa = True
+
+ def __init__(
+ self,
+ config: Optional[PretrainedConfig] = None,
+ encoder: Optional[PreTrainedModel] = None,
+ decoder: Optional[PreTrainedModel] = None,
+ ):
+ if config is None and (encoder is None or decoder is None):
+ raise ValueError("Either a configuration or an encoder and a decoder has to be provided.")
+ if config is None:
+ config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
+ else:
+ if not isinstance(config, self.config_class):
+ raise ValueError(f"Config: {config} has to be of type {self.config_class}")
+
+ if config.decoder.cross_attention_hidden_size is not None:
+ if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
+ raise ValueError(
+ "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
+ f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
+ f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
+ " `config.encoder.hidden_size`."
+ )
+
+ # initialize with config
+ super().__init__(config)
+
+ if encoder is None:
+ from ..auto.modeling_auto import AutoModel
+
+ encoder = AutoModel.from_config(config.encoder)
+
+ if decoder is None:
+ from ..auto.modeling_auto import AutoModelForCausalLM
+
+ decoder = AutoModelForCausalLM.from_config(config.decoder)
+
+ self.encoder = encoder
+ self.decoder = decoder
+
+ if self.encoder.config.to_dict() != self.config.encoder.to_dict():
+ logger.warning(
+ f"Config of the encoder: {self.encoder.__class__} is overwritten by shared encoder config:"
+ f" {self.config.encoder}"
+ )
+ if self.decoder.config.to_dict() != self.config.decoder.to_dict():
+ logger.warning(
+ f"Config of the decoder: {self.decoder.__class__} is overwritten by shared decoder config:"
+ f" {self.config.decoder}"
+ )
+
+ # make sure that the individual model's config refers to the shared config
+ # so that the updates to the config will be synced
+ # update `_attn_implementation` because the attn is set in a deepcopied config within PreTrainedModel
+ self.config.encoder._attn_implementation = self.encoder.config._attn_implementation
+ self.config.decoder._attn_implementation = self.decoder.config._attn_implementation
+ self.encoder.config = self.config.encoder
+ self.decoder.config = self.config.decoder
+
+ # encoder outputs might need to be projected to different dimension for decoder
+ if (
+ self.encoder.config.hidden_size != self.decoder.config.hidden_size
+ and self.decoder.config.cross_attention_hidden_size is None
+ ):
+ self.enc_to_dec_proj = nn.Linear(self.encoder.config.hidden_size, self.decoder.config.hidden_size)
+
+ if self.encoder.get_output_embeddings() is not None:
+ raise ValueError(
+ f"The encoder {self.encoder} should not have a LM Head. Please use a model without LM Head"
+ )
+
+ decoder_signature = set(inspect.signature(self.decoder.forward).parameters.keys())
+ if "encoder_hidden_states" not in decoder_signature:
+ raise ValueError(
+ "The selected decoder is not prepared for the encoder hidden states to be passed. Please see the "
+ "following discussion on GitHub: https://github.com/huggingface/transformers/issues/23350"
+ )
+
+ # tie encoder, decoder weights if config set accordingly
+ self.tie_weights()
+
+ def tie_weights(self):
+ # tie encoder & decoder if needed
+ if self.config.tie_encoder_decoder:
+ # tie encoder and decoder base model
+ decoder_base_model_prefix = self.decoder.base_model_prefix
+ tied_weights = self._tie_encoder_decoder_weights(
+ self.encoder,
+ self.decoder._modules[decoder_base_model_prefix],
+ self.decoder.base_model_prefix,
+ "encoder",
+ )
+ # Setting a dynamic variable instead of `_tied_weights_keys` because it's a class
+ # attributed not an instance member, therefore modifying it will modify the entire class
+ # Leading to issues on subsequent calls by different tests or subsequent calls.
+ self._dynamic_tied_weights_keys = tied_weights
+
+ def get_encoder(self):
+ return self.encoder
+
+ def get_decoder(self):
+ return self.decoder
+
+ def get_input_embeddings(self):
+ return self.encoder.get_input_embeddings()
+
+ def get_output_embeddings(self):
+ return self.decoder.get_output_embeddings()
+
+ def set_output_embeddings(self, new_embeddings):
+ return self.decoder.set_output_embeddings(new_embeddings)
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+ r"""
+ Example:
+
+ ```python
+ >>> from transformers import EncoderDecoderModel
+
+ >>> model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
+ ```"""
+
+ from_tf = kwargs.pop("from_tf", False)
+ if from_tf:
+ from transformers import TFEncoderDecoderModel
+
+ # a workaround to load from tensorflow checkpoint
+ # Using `_tf_model` won't work, because the weight names in the encoder/decoder of `_tf_model` get
+ # extended before saving those components. For example, The name of `_tf_model.encoder.vit` is
+ # `[top model name]/encoder/vit`, but the name of `tf_model.encoder.vit` is `[top model name]/vit`. The
+ # [top model name] is handled (stripped) by the conversion method, and the former case gets extra `encoder`,
+ # which should not occur when we want to save the components alone.
+ # There was a (very) ugly potential fix, which wasn't integrated to `transformers`: see
+ # https://github.com/huggingface/transformers/pull/13222/commits/dbb3c9de76eee235791d2064094654637c99f36d#r697304245
+ # (the change in `src/transformers/modeling_tf_utils.py`)
+ _tf_model = TFEncoderDecoderModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+ config = _tf_model.config
+
+ # Using `tf_model` instead
+ encoder = _tf_model.encoder.__class__(_tf_model.config.encoder)
+ decoder = _tf_model.decoder.__class__(_tf_model.config.decoder)
+ # Make sure models are built
+ encoder(encoder.dummy_inputs)
+ decoder(decoder.dummy_inputs)
+
+ # Get the variable correspondence between `_tf_model` and `encoder` and `decoder`
+ encoder_variables = {}
+ for v in encoder.trainable_variables + encoder.non_trainable_variables:
+ encoder_variables["/".join(v.name.split("/")[1:])] = v
+ decoder_variables = {}
+ for v in decoder.trainable_variables + decoder.non_trainable_variables:
+ decoder_variables["/".join(v.name.split("/")[1:])] = v
+
+ _encoder_variables = {}
+ for v in _tf_model.encoder.trainable_variables + _tf_model.encoder.non_trainable_variables:
+ _encoder_variables["/".join(v.name.split("/")[2:])] = v
+ _decoder_variables = {}
+ for v in _tf_model.decoder.trainable_variables + _tf_model.decoder.non_trainable_variables:
+ _decoder_variables["/".join(v.name.split("/")[2:])] = v
+
+ # assign weight values to `encoder` and `decoder` from `_tf_model`
+ for name, v in encoder_variables.items():
+ v.assign(_encoder_variables[name])
+ for name, v in decoder_variables.items():
+ v.assign(_decoder_variables[name])
+
+ tf_model = TFEncoderDecoderModel(encoder=encoder, decoder=decoder)
+
+ # Deal with `enc_to_dec_proj`
+ if hasattr(_tf_model, "enc_to_dec_proj"):
+ tf_model(tf_model.dummy_inputs)
+ tf_model.enc_to_dec_proj.kernel.assign(_tf_model.enc_to_dec_proj.kernel)
+ tf_model.enc_to_dec_proj.bias.assign(_tf_model.enc_to_dec_proj.bias)
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ encoder_dir = os.path.join(tmpdirname, "encoder")
+ decoder_dir = os.path.join(tmpdirname, "decoder")
+ tf_model.encoder.save_pretrained(encoder_dir)
+ tf_model.decoder.save_pretrained(decoder_dir)
+
+ if hasattr(tf_model, "enc_to_dec_proj"):
+ enc_to_dec_proj_weight = torch.transpose(
+ torch.from_numpy(tf_model.enc_to_dec_proj.kernel.numpy()), 1, 0
+ )
+ enc_to_dec_proj_bias = torch.from_numpy(tf_model.enc_to_dec_proj.bias.numpy())
+
+ del _tf_model
+ del tf_model
+ gc.collect()
+
+ model = EncoderDecoderModel.from_encoder_decoder_pretrained(
+ encoder_dir, decoder_dir, encoder_from_tf=True, decoder_from_tf=True
+ )
+ # This is only for copying some specific attributes of this particular model.
+ model.config = config
+
+ if hasattr(model, "enc_to_dec_proj"):
+ model.enc_to_dec_proj.weight.data = enc_to_dec_proj_weight.contiguous()
+ model.enc_to_dec_proj.bias.data = enc_to_dec_proj_bias.contiguous()
+
+ return model
+
+ # At the moment fast initialization is not supported for composite models
+ if kwargs.get("_fast_init", False):
+ logger.warning(
+ "Fast initialization is currently not supported for EncoderDecoderModel. "
+ "Falling back to slow initialization..."
+ )
+ kwargs["_fast_init"] = False
+
+ return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+ @classmethod
+ def from_encoder_decoder_pretrained(
+ cls,
+ encoder_pretrained_model_name_or_path: str = None,
+ decoder_pretrained_model_name_or_path: str = None,
+ *model_args,
+ **kwargs,
+ ) -> PreTrainedModel:
+ r"""
+ Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model
+ checkpoints.
+
+
+ The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
+ the model, you need to first set it back in training mode with `model.train()`.
+
+ Params:
+ encoder_pretrained_model_name_or_path (`str`, *optional*):
+ Information necessary to initiate the encoder. Can be either:
+
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ - A path to a *directory* containing model weights saved using
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+ this case, `from_tf` should be set to `True` and a configuration object should be provided as
+ `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
+ PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+ decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
+ Information necessary to initiate the decoder. Can be either:
+
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ - A path to a *directory* containing model weights saved using
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+ this case, `from_tf` should be set to `True` and a configuration object should be provided as
+ `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
+ PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+ model_args (remaining positional arguments, *optional*):
+ All remaining positional arguments will be passed to the underlying model's `__init__` method.
+
+ kwargs (remaining dictionary of keyword arguments, *optional*):
+ Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
+ `output_attentions=True`).
+
+ - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter.
+ - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter.
+ - To update the parent model configuration, do not use a prefix for each configuration parameter.
+
+ Behaves differently depending on whether a `config` is provided or automatically loaded.
+
+ Example:
+
+ ```python
+ >>> from transformers import EncoderDecoderModel
+
+ >>> # initialize a bert2bert from two pretrained BERT models. Note that the cross-attention layers will be randomly initialized
+ >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased")
+ >>> # saving model after fine-tuning
+ >>> model.save_pretrained("./bert2bert")
+ >>> # load fine-tuned model
+ >>> model = EncoderDecoderModel.from_pretrained("./bert2bert")
+ ```"""
+
+ kwargs_encoder = {
+ argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
+ }
+
+ kwargs_decoder = {
+ argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+ }
+
+ # remove encoder, decoder kwargs from kwargs
+ for key in kwargs_encoder.keys():
+ del kwargs["encoder_" + key]
+ for key in kwargs_decoder.keys():
+ del kwargs["decoder_" + key]
+
+ # Load and initialize the encoder and decoder
+ # The distinction between encoder and decoder at the model level is made
+ # by the value of the flag `is_decoder` that we need to set correctly.
+ encoder = kwargs_encoder.pop("model", None)
+ if encoder is None:
+ if encoder_pretrained_model_name_or_path is None:
+ raise ValueError(
+ "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has "
+ "to be defined."
+ )
+
+ if "config" not in kwargs_encoder:
+ encoder_config, kwargs_encoder = AutoConfig.from_pretrained(
+ encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True
+ )
+
+ if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
+ logger.info(
+ f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
+ "from a decoder model. Cross-attention and casual mask are disabled."
+ )
+ encoder_config.is_decoder = False
+ encoder_config.add_cross_attention = False
+
+ kwargs_encoder["config"] = encoder_config
+
+ encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
+
+ decoder = kwargs_decoder.pop("model", None)
+ if decoder is None:
+ if decoder_pretrained_model_name_or_path is None:
+ raise ValueError(
+ "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
+ "to be defined."
+ )
+
+ if "config" not in kwargs_decoder:
+ decoder_config, kwargs_decoder = AutoConfig.from_pretrained(
+ decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
+ )
+
+ if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
+ logger.info(
+ f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
+ f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
+ f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
+ )
+ decoder_config.is_decoder = True
+ decoder_config.add_cross_attention = True
+
+ kwargs_decoder["config"] = decoder_config
+
+ if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
+ logger.warning(
+ f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
+ f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
+ "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
+ "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a "
+ "`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
+ )
+
+ decoder = AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
+
+ # instantiate config with corresponding kwargs
+ config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
+ return cls(encoder=encoder, decoder=decoder, config=config)
+
+ @add_start_docstrings_to_model_forward(ENCODER_DECODER_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ decoder_input_ids: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.BoolTensor] = None,
+ encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
+ past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Tuple, Seq2SeqLMOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import EncoderDecoderModel, BertTokenizer
+ >>> import torch
+
+ >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+ >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
+ ... "google-bert/bert-base-uncased", "google-bert/bert-base-uncased"
+ ... ) # initialize Bert2Bert from pre-trained checkpoints
+
+ >>> # training
+ >>> model.config.decoder_start_token_id = tokenizer.cls_token_id
+ >>> model.config.pad_token_id = tokenizer.pad_token_id
+ >>> model.config.vocab_size = model.config.decoder.vocab_size
+
+ >>> input_ids = tokenizer("This is a really long text", return_tensors="pt").input_ids
+ >>> labels = tokenizer("This is the corresponding summary", return_tensors="pt").input_ids
+ >>> outputs = model(input_ids=input_ids, labels=labels)
+ >>> loss, logits = outputs.loss, outputs.logits
+
+ >>> # save and load from pretrained
+ >>> model.save_pretrained("bert2bert")
+ >>> model = EncoderDecoderModel.from_pretrained("bert2bert")
+
+ >>> # generation
+ >>> generated = model.generate(input_ids)
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
+
+ kwargs_decoder = {
+ argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+ }
+
+ if encoder_outputs is None:
+ encoder_outputs = self.encoder(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ **kwargs_encoder,
+ )
+ elif isinstance(encoder_outputs, tuple):
+ encoder_outputs = BaseModelOutput(*encoder_outputs)
+
+ encoder_hidden_states = encoder_outputs[0]
+
+ # optionally project encoder_hidden_states
+ if (
+ self.encoder.config.hidden_size != self.decoder.config.hidden_size
+ and self.decoder.config.cross_attention_hidden_size is None
+ ):
+ encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
+
+ if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
+ decoder_input_ids = shift_tokens_right(
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
+ )
+ if decoder_attention_mask is None:
+ decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
+
+ # Decode
+ decoder_outputs = self.decoder(
+ input_ids=decoder_input_ids,
+ attention_mask=decoder_attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=attention_mask,
+ inputs_embeds=decoder_inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ use_cache=use_cache,
+ past_key_values=past_key_values,
+ return_dict=return_dict,
+ **kwargs_decoder,
+ )
+
+ # Compute loss independent from decoder (as some shift the logits inside them)
+ loss = None
+ if labels is not None:
+ warnings.warn(DEPRECATION_WARNING, FutureWarning)
+ logits = decoder_outputs.logits if return_dict else decoder_outputs[0]
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.reshape(-1, self.decoder.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ if loss is not None:
+ return (loss,) + decoder_outputs + encoder_outputs
+ else:
+ return decoder_outputs + encoder_outputs
+
+ return Seq2SeqLMOutput(
+ loss=loss,
+ logits=decoder_outputs.logits,
+ past_key_values=decoder_outputs.past_key_values,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_attentions=decoder_outputs.attentions,
+ cross_attentions=decoder_outputs.cross_attentions,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ )
+
+ def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+ return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
+
+ def resize_token_embeddings(self, *args, **kwargs):
+ raise NotImplementedError(
+ "Resizing the embedding layers via the EncoderDecoderModel directly is not supported. Please use the"
+ " respective methods of the wrapped objects (model.encoder.resize_token_embeddings(...) or"
+ " model.decoder.resize_token_embeddings(...))"
+ )
+
+ def _reorder_cache(self, past_key_values, beam_idx):
+ # apply decoder cache reordering here
+ return self.decoder._reorder_cache(past_key_values, beam_idx)
+
+
+__all__ = ["EncoderDecoderModel"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdc589484cda1083f42ce96e67b12771f84b6af9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
@@ -0,0 +1,901 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Classes to support Flax Encoder-Decoder architectures"""
+
+import os
+from typing import Optional, Tuple, Union
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+from jax.random import PRNGKey
+
+from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput
+from ...modeling_flax_utils import FlaxPreTrainedModel
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ..auto.configuration_auto import AutoConfig
+from ..auto.modeling_flax_auto import FlaxAutoModel, FlaxAutoModelForCausalLM
+from .configuration_encoder_decoder import EncoderDecoderConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "EncoderDecoderConfig"
+
+ENCODER_DECODER_START_DOCSTRING = r"""
+ This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
+ encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
+ [`~AutoModel.from_pretrained`] function and the decoder is loaded via [`~AutoModelForCausalLM.from_pretrained`]
+ function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream
+ generative task, like summarization.
+
+ The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
+ tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
+ Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
+ Zhou, Wei Li, Peter J. Liu.
+
+ After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models
+ (see the examples for more information).
+
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a Flax Linen
+ [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+ regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+
+ Parameters:
+ config ([`EncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+ `jax.numpy.bfloat16` (on TPUs).
+
+ This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+ specified all the computation will be performed with the given `dtype`.
+
+ **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+ parameters.**
+
+ If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+ [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+ENCODER_DECODER_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Indices of decoder input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+ For sequence to sequence training, `decoder_input_ids` should be provided. `decoder_input_ids` should be
+ created outside of the model by shifting the `labels` to the right, replacing -100 by the `pad_token_id`
+ and prepending them with the `decoder_start_token_id`.
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+ be used by default.
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.encoder.max_position_embeddings - 1]`.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+ range `[0, config.decoder.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ If set to `True`, the model will return a [`~utils.FlaxSeq2SeqLMOutput`] instead of a plain tuple.
+"""
+
+ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.encoder.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ If set to `True`, the model will return a [`~utils.FlaxBaseModelOutput`] instead of a plain tuple.
+"""
+
+ENCODER_DECODER_DECODE_INPUTS_DOCSTRING = r"""
+ Args:
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Indices of decoder input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+
+ For sequence to sequence training, `decoder_input_ids` should be provided. `decoder_input_ids` should be
+ created outside of the model by shifting the `labels` to the right, replacing -100 by the `pad_token_id`
+ and prepending them with the `decoder_start_token_id`.
+ encoder_outputs (`tuple(tuple(jnp.ndarray)`):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+ encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+ be used by default.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+ range `[0, config.decoder.max_position_embeddings - 1]`.
+ past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+ Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ If set to `True`, the model will return a [`~utils.FlaxCausalLMOutputWithCrossAttentions`] instead of a
+ plain tuple.
+"""
+
+
+class FlaxEncoderDecoderModule(nn.Module):
+ config: EncoderDecoderConfig
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ encoder_config = self.config.encoder
+ decoder_config = self.config.decoder
+
+ # Copied from `modeling_hybrid_clip.py` with modifications.
+ from ...models.auto.modeling_flax_auto import FLAX_MODEL_FOR_CAUSAL_LM_MAPPING, FLAX_MODEL_MAPPING
+
+ encoder_module = FLAX_MODEL_MAPPING[encoder_config.__class__].module_class
+ decoder_module = FLAX_MODEL_FOR_CAUSAL_LM_MAPPING[decoder_config.__class__].module_class
+
+ self.encoder = encoder_module(encoder_config, dtype=self.dtype)
+ self.decoder = decoder_module(decoder_config, dtype=self.dtype)
+
+ # encoder outputs might need to be projected to different dimension for decoder
+ if (
+ self.encoder.config.hidden_size != self.decoder.config.hidden_size
+ and self.decoder.config.cross_attention_hidden_size is None
+ ):
+ self.enc_to_dec_proj = nn.Dense(
+ self.decoder.config.hidden_size,
+ kernel_init=jax.nn.initializers.normal(self.decoder.config.initializer_range),
+ dtype=self.dtype,
+ )
+ else:
+ self.enc_to_dec_proj = None
+
+ def _get_encoder_module(self):
+ return self.encoder
+
+ def _get_projection_module(self):
+ return self.enc_to_dec_proj
+
+ def _get_decoder_module(self):
+ return self.decoder
+
+ def __call__(
+ self,
+ input_ids,
+ attention_mask,
+ decoder_input_ids,
+ decoder_attention_mask,
+ position_ids,
+ decoder_position_ids,
+ output_attentions: bool = False,
+ output_hidden_states: bool = False,
+ return_dict: bool = True,
+ deterministic: bool = True,
+ ):
+ encoder_outputs = self.encoder(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ deterministic=deterministic,
+ )
+
+ encoder_hidden_states = encoder_outputs[0]
+
+ # optionally project encoder_hidden_states
+ if self.enc_to_dec_proj is not None:
+ encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
+
+ decoder_outputs = self.decoder(
+ input_ids=decoder_input_ids,
+ attention_mask=decoder_attention_mask,
+ position_ids=decoder_position_ids,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ deterministic=deterministic,
+ )
+
+ if not return_dict:
+ return decoder_outputs + encoder_outputs
+
+ return FlaxSeq2SeqLMOutput(
+ logits=decoder_outputs.logits,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_attentions=decoder_outputs.attentions,
+ cross_attentions=decoder_outputs.cross_attentions,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)
+class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
+ r"""
+ [`FlaxEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with
+ the module (flax.nn.Module) of one of the base model classes of the library as encoder module and another one as
+ decoder module when created with the :meth*~transformers.FlaxAutoModel.from_pretrained* class method for the
+ encoder and :meth*~transformers.FlaxAutoModelForCausalLM.from_pretrained* class method for the decoder.
+ """
+
+ config_class = EncoderDecoderConfig
+ base_model_prefix = "encoder_decoder"
+ module_class = FlaxEncoderDecoderModule
+
+ def __init__(
+ self,
+ config: EncoderDecoderConfig,
+ input_shape: Optional[Tuple] = None,
+ seed: int = 0,
+ dtype: jnp.dtype = jnp.float32,
+ _do_init: bool = True,
+ **kwargs,
+ ):
+ if input_shape is None:
+ input_shape = ((1, 1), (1, 1))
+
+ if not _do_init:
+ raise ValueError(
+ "`FlaxEncoderDecoderModel` cannot be created without initializing, `_do_init` must be `True`."
+ )
+
+ if config.decoder.cross_attention_hidden_size is not None:
+ if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
+ raise ValueError(
+ "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
+ f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
+ f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
+ " `config.encoder.hidden_size`."
+ )
+
+ module = self.module_class(config=config, dtype=dtype, **kwargs)
+ super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+ def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
+ encoder_input_shape, decoder_input_shape = input_shape
+
+ # init input tensors
+ input_ids = jnp.zeros(encoder_input_shape, dtype="i4")
+ attention_mask = jnp.ones_like(input_ids)
+ decoder_input_ids = jnp.zeros(decoder_input_shape, dtype="i4")
+ decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+
+ batch_size, sequence_length = input_ids.shape
+ position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+ decoder_batch_size, decoder_sequence_length = decoder_input_ids.shape
+ if not decoder_batch_size == batch_size:
+ raise ValueError(
+ f"The inputs of encoder and decoder should have the same batch size, but got {batch_size} for encoder"
+ f" and {decoder_batch_size} for decoder."
+ )
+ decoder_position_ids = jnp.broadcast_to(
+ jnp.arange(decoder_sequence_length)[None, :], (decoder_batch_size, decoder_sequence_length)
+ )
+
+ params_rng, dropout_rng = jax.random.split(rng)
+ rngs = {"params": params_rng, "dropout": dropout_rng}
+
+ random_params = self.module.init(
+ rngs,
+ input_ids,
+ attention_mask,
+ decoder_input_ids,
+ decoder_attention_mask,
+ position_ids,
+ decoder_position_ids,
+ )["params"]
+
+ if params is not None:
+ random_params = flatten_dict(unfreeze(random_params))
+ params = flatten_dict(unfreeze(params))
+ for missing_key in self._missing_keys:
+ params[missing_key] = random_params[missing_key]
+ self._missing_keys = set()
+ return freeze(unflatten_dict(params))
+ else:
+ return random_params
+
+ def init_cache(self, batch_size, max_length, encoder_outputs):
+ r"""
+ Args:
+ batch_size (`int`):
+ batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+ max_length (`int`):
+ maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+ cache.
+ encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
+ `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*)
+ is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ cross-attention of the decoder.
+ """
+ # init input variables to retrieve cache
+ decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
+ decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+ decoder_position_ids = jnp.broadcast_to(
+ jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape
+ )
+
+ def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
+ decoder_module = module._get_decoder_module()
+ return decoder_module(
+ input_ids=decoder_input_ids,
+ attention_mask=decoder_attention_mask,
+ position_ids=decoder_position_ids,
+ **kwargs,
+ )
+
+ init_variables = self.module.init(
+ jax.random.PRNGKey(0),
+ decoder_input_ids=decoder_input_ids,
+ decoder_attention_mask=decoder_attention_mask,
+ decoder_position_ids=decoder_position_ids,
+ encoder_hidden_states=encoder_outputs[0],
+ init_cache=True,
+ method=_decoder_forward, # we only need to call the decoder to init the cache
+ )
+ return unfreeze(init_variables["cache"])
+
+ @add_start_docstrings(ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=_CONFIG_FOR_DOC)
+ def encode(
+ self,
+ input_ids: jnp.ndarray,
+ attention_mask: Optional[jnp.ndarray] = None,
+ position_ids: Optional[jnp.ndarray] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ train: bool = False,
+ params: dict = None,
+ dropout_rng: PRNGKey = None,
+ ):
+ r"""
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
+
+ >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+ >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")
+
+ >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> input_ids = tokenizer.encode(text, return_tensors="np")
+ >>> encoder_outputs = model.encode(input_ids)
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+ if attention_mask is None:
+ attention_mask = jnp.ones_like(input_ids)
+ if position_ids is None:
+ batch_size, sequence_length = input_ids.shape
+ position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+ # Handle any PRNG if needed
+ rngs = {}
+ if dropout_rng is not None:
+ rngs["dropout"] = dropout_rng
+
+ def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs):
+ encode_module = module._get_encoder_module()
+ return encode_module(input_ids, attention_mask, position_ids, **kwargs)
+
+ outputs = self.module.apply(
+ {"params": params or self.params},
+ input_ids=jnp.array(input_ids, dtype="i4"),
+ attention_mask=jnp.array(attention_mask, dtype="i4"),
+ position_ids=jnp.array(position_ids, dtype="i4"),
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ deterministic=not train,
+ rngs=rngs,
+ method=_encoder_forward,
+ )
+
+ if return_dict:
+ outputs = FlaxBaseModelOutput(
+ last_hidden_state=outputs.last_hidden_state,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ return outputs
+
+ @add_start_docstrings(ENCODER_DECODER_DECODE_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+ def decode(
+ self,
+ decoder_input_ids,
+ encoder_outputs,
+ encoder_attention_mask: Optional[jnp.ndarray] = None,
+ decoder_attention_mask: Optional[jnp.ndarray] = None,
+ decoder_position_ids: Optional[jnp.ndarray] = None,
+ past_key_values: dict = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ train: bool = False,
+ params: dict = None,
+ dropout_rng: PRNGKey = None,
+ ):
+ r"""
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
+ >>> import jax.numpy as jnp
+
+ >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+ >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")
+
+ >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> input_ids = tokenizer.encode(text, max_length=1024, return_tensors="np")
+ >>> encoder_outputs = model.encode(input_ids)
+
+ >>> decoder_start_token_id = model.config.decoder.bos_token_id
+ >>> decoder_input_ids = jnp.ones((input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+ >>> logits = outputs.logits
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+ encoder_hidden_states = encoder_outputs[0]
+ if encoder_attention_mask is None:
+ batch_size, sequence_length = encoder_hidden_states.shape[:2]
+ encoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+ batch_size, sequence_length = decoder_input_ids.shape
+ if decoder_attention_mask is None:
+ decoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+ if decoder_position_ids is None:
+ if past_key_values is not None:
+ raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
+
+ decoder_position_ids = jnp.broadcast_to(
+ jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+ )
+
+ # Handle any PRNG if needed
+ rngs = {}
+ if dropout_rng is not None:
+ rngs["dropout"] = dropout_rng
+
+ inputs = {"params": params or self.params}
+
+ # if past_key_values are passed then cache is already initialized a private flag init_cache has to be
+ # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that
+ # it can be changed by FlaxBartAttention module
+ if past_key_values:
+ inputs["cache"] = past_key_values
+ mutable = ["cache"]
+ else:
+ mutable = False
+
+ def _decoder_forward(
+ module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs
+ ):
+ projection_module = module._get_projection_module()
+ decoder_module = module._get_decoder_module()
+
+ # optionally project encoder_hidden_states
+ if projection_module is not None:
+ encoder_hidden_states = projection_module(encoder_hidden_states)
+
+ return decoder_module(
+ decoder_input_ids,
+ decoder_attention_mask,
+ decoder_position_ids,
+ encoder_hidden_states=encoder_hidden_states,
+ **kwargs,
+ )
+
+ outputs = self.module.apply(
+ inputs,
+ decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+ decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+ decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ deterministic=not train,
+ rngs=rngs,
+ mutable=mutable,
+ method=_decoder_forward,
+ )
+
+ # add updated cache to model output
+ if past_key_values is not None and return_dict:
+ outputs, past = outputs
+ outputs["past_key_values"] = unfreeze(past["cache"])
+ return outputs
+ elif past_key_values is not None and not return_dict:
+ outputs, past = outputs
+ outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
+
+ return outputs
+
+ @add_start_docstrings_to_model_forward(ENCODER_DECODER_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+ def __call__(
+ self,
+ input_ids: jnp.ndarray,
+ attention_mask: Optional[jnp.ndarray] = None,
+ decoder_input_ids: Optional[jnp.ndarray] = None,
+ decoder_attention_mask: Optional[jnp.ndarray] = None,
+ position_ids: Optional[jnp.ndarray] = None,
+ decoder_position_ids: Optional[jnp.ndarray] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ train: bool = False,
+ params: dict = None,
+ dropout_rng: PRNGKey = None,
+ ):
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer, GPT2Tokenizer
+
+ >>> # load a fine-tuned bert2gpt2 model
+ >>> model = FlaxEncoderDecoderModel.from_pretrained("patrickvonplaten/bert2gpt2-cnn_dailymail-fp16")
+ >>> # load input & output tokenizer
+ >>> tokenizer_input = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+ >>> tokenizer_output = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
+
+ >>> article = '''Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members
+ >>> singing a racist chant. SAE's national chapter suspended the students,
+ >>> but University of Oklahoma President David Boren took it a step further,
+ >>> saying the university's affiliation with the fraternity is permanently done.'''
+
+ >>> input_ids = tokenizer_input(article, add_special_tokens=True, return_tensors="np").input_ids
+
+ >>> # use GPT2's eos_token as the pad as well as eos token
+ >>> model.config.eos_token_id = model.config.decoder.eos_token_id
+ >>> model.config.pad_token_id = model.config.eos_token_id
+
+ >>> sequences = model.generate(input_ids, num_beams=4, max_length=12).sequences
+
+ >>> summary = tokenizer_output.batch_decode(sequences, skip_special_tokens=True)[0]
+ >>> assert summary == "SAS Alpha Epsilon suspended Sigma Alpha Epsilon members"
+ ```
+ """
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+ # prepare encoder inputs
+ if attention_mask is None:
+ attention_mask = jnp.ones_like(input_ids)
+ if position_ids is None:
+ batch_size, sequence_length = input_ids.shape
+ position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+ # prepare decoder inputs
+ if decoder_input_ids is None:
+ raise ValueError(
+ "`decoder_input_ids` cannot be `None`. For sequence to sequence training, `decoder_position_ids` must"
+ " be specified as an input argument."
+ )
+ if decoder_attention_mask is None:
+ decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+ if decoder_position_ids is None:
+ batch_size, sequence_length = decoder_input_ids.shape
+ decoder_position_ids = jnp.broadcast_to(
+ jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+ )
+
+ # Handle any PRNG if needed
+ rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
+
+ return self.module.apply(
+ {"params": params or self.params},
+ input_ids=jnp.array(input_ids, dtype="i4"),
+ attention_mask=jnp.array(attention_mask, dtype="i4"),
+ decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+ decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+ position_ids=jnp.array(position_ids, dtype="i4"),
+ decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ deterministic=not train,
+ rngs=rngs,
+ )
+
+ def prepare_inputs_for_generation(
+ self,
+ decoder_input_ids,
+ max_length,
+ attention_mask: Optional[jax.Array] = None,
+ decoder_attention_mask: Optional[jax.Array] = None,
+ encoder_outputs=None,
+ **kwargs,
+ ):
+ # initializing the cache
+ batch_size, seq_length = decoder_input_ids.shape
+
+ past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
+ # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+ # But since the decoder uses a causal mask, those positions are masked anyways.
+ # Thus we can create a single static attention_mask here, which is more efficient for compilation
+ extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+ if decoder_attention_mask is not None:
+ decoder_position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
+ extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
+ else:
+ decoder_position_ids = jnp.broadcast_to(
+ jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)
+ )
+
+ return {
+ "past_key_values": past_key_values,
+ "encoder_outputs": encoder_outputs,
+ "encoder_attention_mask": attention_mask,
+ "decoder_attention_mask": extended_attention_mask,
+ "decoder_position_ids": decoder_position_ids,
+ }
+
+ def update_inputs_for_generation(self, model_outputs, model_kwargs):
+ model_kwargs["past_key_values"] = model_outputs.past_key_values
+ model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1
+ return model_kwargs
+
+ @classmethod
+ def from_encoder_decoder_pretrained(
+ cls,
+ encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
+ decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
+ *model_args,
+ **kwargs,
+ ) -> FlaxPreTrainedModel:
+ r"""
+ Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model
+ checkpoints.
+
+ Params:
+ encoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*):
+ Information necessary to initiate the encoder. Can be either:
+
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ - A path to a *directory* containing model weights saved using
+ [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+
+ decoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*, defaults to `None`):
+ Information necessary to initiate the decoder. Can be either:
+
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ - A path to a *directory* containing model weights saved using
+ [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+
+ model_args (remaining positional arguments, *optional*):
+ All remaning positional arguments will be passed to the underlying model's `__init__` method.
+
+ kwargs (remaining dictionary of keyword arguments, *optional*):
+ Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
+ `output_attentions=True`).
+
+ - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter.
+ - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter.
+ - To update the parent model configuration, do not use a prefix for each configuration parameter.
+
+ Behaves differently depending on whether a `config` is provided or automatically loaded.
+
+ Example:
+
+ ```python
+ >>> from transformers import FlaxEncoderDecoderModel
+
+ >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+ >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")
+ >>> # saving model after fine-tuning
+ >>> model.save_pretrained("./bert2gpt2")
+ >>> # load fine-tuned model
+ >>> model = FlaxEncoderDecoderModel.from_pretrained("./bert2gpt2")
+ ```"""
+
+ kwargs_encoder = {
+ argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
+ }
+
+ kwargs_decoder = {
+ argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+ }
+
+ # remove encoder, decoder kwargs from kwargs
+ for key in kwargs_encoder.keys():
+ del kwargs["encoder_" + key]
+ for key in kwargs_decoder.keys():
+ del kwargs["decoder_" + key]
+
+ # Load and initialize the encoder and decoder
+ # The distinction between encoder and decoder at the model level is made
+ # by the value of the flag `is_decoder` that we need to set correctly.
+ encoder = kwargs_encoder.pop("model", None)
+ if encoder is None:
+ if encoder_pretrained_model_name_or_path is None:
+ raise ValueError(
+ "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has "
+ "to be defined."
+ )
+
+ if "config" not in kwargs_encoder:
+ encoder_config, kwargs_encoder = AutoConfig.from_pretrained(
+ encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True
+ )
+ if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
+ logger.info(
+ f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
+ "from a decoder model. Cross-attention and casual mask are disabled."
+ )
+ encoder_config.is_decoder = False
+ encoder_config.add_cross_attention = False
+
+ kwargs_encoder["config"] = encoder_config
+
+ encoder = FlaxAutoModel.from_pretrained(
+ encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder
+ )
+
+ decoder = kwargs_decoder.pop("model", None)
+ if decoder is None:
+ if decoder_pretrained_model_name_or_path is None:
+ raise ValueError(
+ "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
+ "to be defined."
+ )
+
+ if "config" not in kwargs_decoder:
+ decoder_config, kwargs_decoder = AutoConfig.from_pretrained(
+ decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
+ )
+ if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
+ logger.info(
+ f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
+ f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
+ f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
+ )
+ decoder_config.is_decoder = True
+ decoder_config.add_cross_attention = True
+
+ kwargs_decoder["config"] = decoder_config
+
+ if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
+ logger.warning(
+ f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
+ f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
+ "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
+ "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a "
+ "`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
+ )
+
+ decoder = FlaxAutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
+
+ # instantiate config with corresponding kwargs
+ dtype = kwargs.pop("dtype", jnp.float32)
+ config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
+
+ # init model
+ model = cls(config, dtype=dtype)
+ model.params["encoder"] = encoder.params
+ model.params["decoder"] = decoder.params
+
+ return model
+
+
+__all__ = ["FlaxEncoderDecoderModel"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/mpt/__init__.py b/.venv/lib/python3.11/site-packages/transformers/models/mpt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..20bf8c5ebaf9871204c64eedc5990498ecff1ddc
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/mpt/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_mpt import *
+ from .modeling_mpt import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/mpt/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/mpt/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ecbef324b00f9a7144760ecba61cf2a2ad72d1c5
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/mpt/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/mpt/__pycache__/configuration_mpt.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/mpt/__pycache__/configuration_mpt.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50340015d302e17417f452d095c18f73e50811f7
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/mpt/__pycache__/configuration_mpt.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/mpt/__pycache__/modeling_mpt.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/mpt/__pycache__/modeling_mpt.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7206ca5f5fb1c9d437a77f56834128b617e3f234
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/mpt/__pycache__/modeling_mpt.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/mpt/configuration_mpt.py b/.venv/lib/python3.11/site-packages/transformers/models/mpt/configuration_mpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5078bd9bfec8f1afd08ffaae1b629cf34fa71fa
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/mpt/configuration_mpt.py
@@ -0,0 +1,233 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc. team and MosaicML NLP team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mpt configuration"""
+
+from typing import TYPE_CHECKING, Optional, Union
+
+
+if TYPE_CHECKING:
+ pass
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MptAttentionConfig(PretrainedConfig):
+ """
+ This is the configuration class to store the configuration of a [`MptAttention`] class. It is used to instantiate
+ attention layers according to the specified arguments, defining the layers architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the MPT
+ [mosaicml/mpt-7b](https://huggingface.co/mosaicml/mpt-7b) architecture. Most of the arguments are kept for backward
+ compatibility with previous MPT models that are hosted on the Hub (previously with `trust_remote_code=True`).
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ attn_type (`str`, *optional*, defaults to `"multihead_attention"`):
+ type of attention to use. Options: `"multihead_attention"`, `"multiquery_attention"`.
+ attn_pdrop (`float`, *optional*, defaults to `0.0`):
+ The dropout probability for the attention layers.
+ attn_impl (`str`, *optional*, defaults to `"torch"`):
+ The attention implementation to use. One of `"torch"`, `"flash"`, or `"triton"`.
+ clip_qkv (`float`, *optional*):
+ If not `None`, clip the queries, keys, and values in the attention layer to this value.
+ softmax_scale (`float`, *optional*):
+ If not `None`, scale the softmax in the attention layer by this value. If `None`, will default to
+ `1/sqrt(hidden_size)`.
+ prefix_lm (`bool`, *optional*, defaults to `False`):
+ Whether the model should operate as a Prefix LM. This requires passing an extra `prefix_mask` argument
+ which indicates which tokens belong to the prefix. Tokens in the prefix can attend to one another
+ bi-directionally. Tokens outside the prefix use causal attention.
+ qk_ln (`bool`, *optional*, defaults to `False`):
+ Whether to apply layer normalization to the queries and keys in the attention layer.
+ attn_uses_sequence_id (`bool`, *optional*, defaults to `False`):
+ Whether to restrict attention to tokens that have the same token_type_ids. When the model is in `train`
+ mode, this requires passing an extra *token_type_ids* argument which indicates which sub-sequence each
+ token belongs to. Defaults to `False` meaning any provided *token_type_ids* will be ignored.
+ alibi (`bool`, *optional*, defaults to `True`):
+ Whether or not to use the alibi bias instead of positional embedding.
+ alibi_bias_max (`int`, *optional*, defaults to 8):
+ The maximum value of the alibi bias.
+ """
+
+ base_config_key = "attn_config"
+
+ def __init__(
+ self,
+ attn_type="multihead_attention",
+ attn_pdrop=0,
+ attn_impl="torch",
+ clip_qkv=None,
+ softmax_scale=None,
+ prefix_lm=False,
+ qk_ln=False,
+ attn_uses_sequence_id=False,
+ alibi=True,
+ alibi_bias_max=8,
+ **kwargs,
+ ):
+ super().__init__()
+ self.attn_type = attn_type
+ self.attn_pdrop = attn_pdrop
+ self.attn_impl = attn_impl
+ self.clip_qkv = clip_qkv
+ self.softmax_scale = softmax_scale
+ self.prefix_lm = prefix_lm
+ self.attn_uses_sequence_id = attn_uses_sequence_id
+ self.alibi = alibi
+ self.qk_ln = qk_ln
+ self.alibi_bias_max = alibi_bias_max
+
+ if attn_type not in ["multihead_attention", "multiquery_attention"]:
+ raise ValueError(
+ f"`attn_type` has to be either `multihead_attention` or `multiquery_attention`. Received: {attn_type}"
+ )
+
+
+class MptConfig(PretrainedConfig):
+ """
+ This is the configuration class to store the configuration of a [`MptModel`]. It is used to instantiate a Mpt model
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to the Mpt-7b architecture
+ [mosaicml/mpt-7b](https://huggingface.co/mosaicml/mpt-7b).
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ d_model (`int`, *optional*, defaults to 2048):
+ Dimensionality of the embeddings and hidden states.
+ n_heads (`int`, *optional*, defaults to 16):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ n_layers (`int`, *optional*, defaults to 24):
+ Number of hidden layers in the Transformer encoder.
+ expansion_ratio (`int`, *optional*, defaults to 4):
+ The ratio of the up/down scale in the MLP.
+ max_seq_len (`int`, *optional*, defaults to 2048):
+ The maximum sequence length of the model.
+ vocab_size (`int`, *optional*, defaults to 50368):
+ Vocabulary size of the Mpt model. Defines the maximum number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`MptModel`]. Check [this
+ discussion](https://huggingface.co/bigscience/mpt/discussions/120#633d28389addb8530b406c2a) on how the
+ `vocab_size` has been defined.
+ resid_pdrop (`float`, *optional*, defaults to 0.0):
+ The dropout probability applied to the attention output before combining with residual.
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
+ The epsilon to use in the layer normalization layers.
+ emb_pdrop (`float`, *optional*, defaults to 0.0):
+ The dropout probability for the embedding layer.
+ learned_pos_emb (`bool`, *optional*, defaults to `True`):
+ Whether to use learned positional embeddings.
+ attn_config (`dict`, *optional*):
+ A dictionary used to configure the model's attention module.
+ init_device (`str`, *optional*, defaults to `"cpu"`):
+ The device to use for parameter initialization. Defined for backward compatibility
+ logit_scale (`float`, *optional*):
+ If not None, scale the logits by this value.
+ no_bias (`bool`, *optional*, defaults to `True`):
+ Whether to use bias in all linear layers.
+ verbose (`int`, *optional*, defaults to 0):
+ The verbosity level to use for logging. Used in the previous versions of MPT models for logging. This
+ argument is deprecated.
+ embedding_fraction (`float`, *optional*, defaults to 1.0):
+ The fraction to scale the gradients of the embedding layer by.
+ norm_type (`str`, *optional*, defaults to `"low_precision_layernorm"`):
+ Type of layer norm to use. All MPT models uses the same layer norm implementation. Defined for backward
+ compatibility.
+ use_cache (`bool`, *optional*, defaults to `False`):
+ Whether or not the model should return the last key/values attentions (not used by all models).
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+ Example:
+
+ ```python
+ >>> from transformers import MptConfig, MptModel
+
+ >>> # Initializing a Mpt configuration
+ >>> configuration = MptConfig()
+
+ >>> # Initializing a model (with random weights) from the configuration
+ >>> model = MptModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```
+ """
+
+ model_type = "mpt"
+ sub_configs = {"attn_config": MptAttentionConfig}
+ attribute_map = {
+ "num_attention_heads": "n_heads",
+ "hidden_size": "d_model",
+ "num_hidden_layers": "n_layers",
+ }
+
+ def __init__(
+ self,
+ d_model: int = 2048,
+ n_heads: int = 16,
+ n_layers: int = 24,
+ expansion_ratio: int = 4,
+ max_seq_len: int = 2048,
+ vocab_size: int = 50368,
+ resid_pdrop: float = 0.0,
+ layer_norm_epsilon: float = 1e-5,
+ emb_pdrop: float = 0.0,
+ learned_pos_emb: bool = True,
+ attn_config: MptAttentionConfig = None,
+ init_device: str = "cpu",
+ logit_scale: Optional[Union[float, str]] = None,
+ no_bias: bool = True,
+ verbose: int = 0,
+ embedding_fraction: float = 1.0,
+ norm_type: str = "low_precision_layernorm",
+ use_cache: bool = False,
+ initializer_range=0.02,
+ **kwargs,
+ ):
+ if attn_config is None:
+ self.attn_config = MptAttentionConfig()
+ elif isinstance(attn_config, dict):
+ self.attn_config = MptAttentionConfig(**attn_config)
+ else:
+ self.attn_config = attn_config
+ self.d_model = d_model
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.expansion_ratio = expansion_ratio
+ self.max_seq_len = max_seq_len
+ self.vocab_size = vocab_size
+ self.resid_pdrop = resid_pdrop
+ self.emb_pdrop = emb_pdrop
+ self.learned_pos_emb = learned_pos_emb
+ self.init_device = init_device
+ self.logit_scale = logit_scale
+ self.no_bias = no_bias
+ self.verbose = verbose
+ self.embedding_fraction = embedding_fraction
+ self.norm_type = norm_type
+ self.layer_norm_epsilon = layer_norm_epsilon
+ self.use_cache = use_cache
+ self.initializer_range = initializer_range
+ super().__init__(**kwargs)
+
+
+__all__ = ["MptConfig"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/mpt/modeling_mpt.py b/.venv/lib/python3.11/site-packages/transformers/models/mpt/modeling_mpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..29c916e41c7667bdade365febd54b7c31fd7fae4
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/mpt/modeling_mpt.py
@@ -0,0 +1,917 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc. team and MosaicML NLP team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch MPT model."""
+
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
+from torch.nn import functional as F
+
+from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from ...modeling_outputs import (
+ BaseModelOutputWithPastAndCrossAttentions,
+ CausalLMOutputWithCrossAttentions,
+ QuestionAnsweringModelOutput,
+ SequenceClassifierOutputWithPast,
+ TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import logging
+from .configuration_mpt import MptConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "mosaicml/mpt-7b"
+_CONFIG_FOR_DOC = "MptConfig"
+
+
+def build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max=8, device=None):
+ r"""
+ Link to paper: https://arxiv.org/abs/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
+ relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
+ the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
+ https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
+ """
+ alibi = torch.arange(1 - sequence_length, 1, dtype=torch.int32, device=device).view(1, 1, 1, sequence_length)
+ num_heads_power_of_2 = 2 ** math.ceil(math.log2(num_heads))
+
+ base = torch.arange(1, num_heads_power_of_2 + 1, dtype=torch.int64, device=device).float()
+ base = base * (alibi_bias_max / num_heads_power_of_2)
+
+ slopes = 1.0 / torch.pow(2, base)
+ slopes = slopes.view(1, num_heads_power_of_2, 1, 1)
+
+ if num_heads_power_of_2 != num_heads:
+ slopes = torch.concat([slopes[:, 1::2, ...], slopes[:, ::2, ...]], dim=1)[:, :num_heads, ...]
+
+ alibi = alibi * slopes
+ return alibi.squeeze(0)
+
+
+class MptAttention(nn.Module):
+ """Multi-head self attention.
+ Using torch or triton attention implemetation enables user to also use additive bias.
+ """
+
+ def __init__(self, config: MptConfig):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.n_heads = config.n_heads
+ self.max_seq_length = config.max_seq_len
+ self.head_dim = self.hidden_size // self.n_heads
+ self.softmax_scale = config.attn_config.softmax_scale
+ if self.softmax_scale is None:
+ self.softmax_scale = 1 / math.sqrt(self.hidden_size / self.n_heads)
+
+ self.attn_dropout_p = config.attn_config.attn_pdrop
+ self.clip_qkv = config.attn_config.clip_qkv
+ self.Wqkv = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
+ self.out_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_bias: torch.Tensor,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ ):
+ batch_size, seq_length = hidden_states.shape[:2]
+
+ mixed_qkv = self.Wqkv(hidden_states)
+ if self.clip_qkv:
+ mixed_qkv = mixed_qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
+
+ query_states, key_states, value_states = mixed_qkv.chunk(3, dim=2)
+ query_states = query_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
+
+ if past_key_value is not None:
+ if len(past_key_value) != 0:
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+ past_key_value = (key_states, value_states)
+ else:
+ past_key_value = (key_states, value_states)
+
+ attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.softmax_scale
+
+ query_length = seq_length if past_key_value is None else seq_length + past_key_value[0].shape[2]
+
+ if position_bias is not None:
+ if len(position_bias.shape) != 3:
+ raise ValueError(f"Expecting position_bias shape to be 3 dimensions, got {len(position_bias.shape)}")
+ key_length = key_states.shape[-2]
+
+ position_bias_query_index = max(0, position_bias.size(1) - query_length)
+ position_bias_key_index = max(0, position_bias.size(2) - key_length)
+
+ position_bias = position_bias[:, position_bias_query_index:, position_bias_key_index:]
+
+ attention_scores = attention_scores + position_bias
+
+ if attention_mask is not None:
+ attention_scores = attention_scores.masked_fill(attention_mask, torch.finfo(query_states.dtype).min)
+
+ # (batch_size, n_heads, seq_length, key_length)
+ attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).to(value_states.dtype)
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attn_dropout_p, training=self.training)
+
+ context_states = torch.matmul(attn_weights, value_states)
+ context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1)
+ attn_output = self.out_proj(context_states)
+
+ return attn_output, attn_weights, past_key_value
+
+
+class MptMLP(nn.Module):
+ def __init__(self, config: MptConfig):
+ super().__init__()
+ hidden_size = config.hidden_size
+
+ self.up_proj = nn.Linear(hidden_size, 4 * hidden_size, bias=False)
+ self.act = nn.GELU(approximate="none")
+ self.down_proj = nn.Linear(4 * hidden_size, hidden_size, bias=False)
+ self.hidden_dropout = config.attn_config.attn_pdrop
+
+ def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.act(self.up_proj(hidden_states))
+
+ intermediate_output = self.down_proj(hidden_states)
+
+ output = F.dropout(intermediate_output, p=self.hidden_dropout, training=self.training)
+ output = output + residual
+
+ return output
+
+
+class MptBlock(nn.Module):
+ def __init__(self, config: MptConfig):
+ super().__init__()
+ hidden_size = config.hidden_size
+
+ self.norm_1 = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+ # backward compatibility with weights on the Hub
+ self.norm_1.bias = None
+
+ self.num_heads = config.n_heads
+ self.attn = MptAttention(config)
+
+ self.norm_2 = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+ # backward compatibility with weights on the Hub
+ self.norm_2.bias = None
+
+ self.ffn = MptMLP(config)
+
+ self.dropout_rate = config.attn_config.attn_pdrop
+ self.resid_attn_dropout = nn.Dropout(self.dropout_rate)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_bias: torch.Tensor,
+ attention_mask: torch.Tensor,
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ use_cache: bool = False,
+ output_attentions: bool = False,
+ ):
+ # hidden_states: [batch_size, seq_length, hidden_size]
+ # Layer norm at the beginning of the transformer layer.
+ layernorm_output = self.norm_1(hidden_states)
+
+ residual = hidden_states
+
+ # Self attention.
+ attn_outputs, attn_weights, past_key_value = self.attn(
+ layernorm_output,
+ position_bias=position_bias,
+ attention_mask=attention_mask,
+ past_key_value=layer_past,
+ )
+
+ hidden_states = self.resid_attn_dropout(attn_outputs) + residual
+
+ layernorm_output = self.norm_2(hidden_states)
+
+ # Get residual
+ residual = hidden_states
+
+ # MLP.
+ output = self.ffn(layernorm_output, residual)
+ outputs = (output,)
+
+ if use_cache:
+ outputs += (past_key_value,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs # hidden_states, present, attentions
+
+
+class MptPreTrainedModel(PreTrainedModel):
+ config_class = MptConfig
+ base_model_prefix = "transformer"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["MptBlock"]
+ _keys_to_ignore_on_load_missing = [r"lm_head.*."]
+
+ def __init__(self, *inputs, **kwargs):
+ super().__init__(*inputs, **kwargs)
+
+ def _init_weights(self, module: nn.Module):
+ """Initialize the weights."""
+ if isinstance(module, nn.Linear):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, LayerNorm):
+ if module.bias is not None:
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+ @staticmethod
+ def _convert_to_mpt_cache(
+ past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]],
+ ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
+ """
+ Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
+ """
+ batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape
+ batch_size_times_num_heads = batch_size * num_heads
+ # key: [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length]
+ # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim]
+ return tuple(
+ (
+ layer_past[0].reshape(batch_size_times_num_heads, head_dim, seq_length),
+ layer_past[1].reshape(batch_size_times_num_heads, seq_length, head_dim),
+ )
+ for layer_past in past_key_value
+ )
+
+
+MPT_START_DOCSTRING = r"""
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`MptConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+MPT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+ `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
+ (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
+
+ If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+ `input_ids`.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`):
+ Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
+ `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
+ their past given to this model should not be passed as `input_ids` as they have already been computed.
+
+ Each element of `past_key_values` is a tuple (past_key, past_value):
+ - past_key: [batch_size * num_heads, head_dim, kv_length]
+ - past_value: [batch_size * num_heads, kv_length, head_dim]
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+
+ If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
+ `past_key_values`).
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Mpt Model transformer outputting raw hidden-states without any specific head on top.",
+ MPT_START_DOCSTRING,
+)
+class MptModel(MptPreTrainedModel):
+ def __init__(self, config: MptConfig):
+ super().__init__(config)
+
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.n_heads
+
+ # Embedding + LN Embedding
+ self.wte = nn.Embedding(config.vocab_size, self.hidden_size)
+
+ # Transformer blocks
+ self.blocks = nn.ModuleList([MptBlock(config) for _ in range(config.n_layers)])
+
+ # Final Layer Norm
+ self.norm_f = LayerNorm(self.hidden_size, eps=config.layer_norm_epsilon)
+ # backward compatibility with weights on the Hub
+ self.norm_f.bias = None
+
+ self.gradient_checkpointing = False
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.wte
+
+ def build_mpt_alibi_tensor(self, num_heads, sequence_length, alibi_bias_max=8, device=None):
+ return build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max, device)
+
+ def set_input_embeddings(self, new_embeddings: torch.Tensor):
+ self.wte = new_embeddings
+
+ @add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPastAndCrossAttentions,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs, # NOOP kwargs, for now
+ ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if past_key_values is None:
+ past_key_values = tuple([None] * len(self.blocks))
+
+ if inputs_embeds is None:
+ inputs_embeds = self.wte(input_ids)
+
+ hidden_states = inputs_embeds
+
+ presents = () if use_cache else None
+ all_self_attentions = () if output_attentions else None
+ all_hidden_states = () if output_hidden_states else None
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # Compute alibi tensor: check build_alibi_tensor documentation
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+ if past_key_values[0] is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+ if attention_mask is None:
+ attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
+ else:
+ attention_mask = attention_mask.to(hidden_states.device)
+
+ alibi = self.build_mpt_alibi_tensor(self.num_heads, self.config.max_seq_len, device=hidden_states.device)
+
+ causal_mask = _prepare_4d_causal_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+ causal_mask = causal_mask.bool()
+
+ for block, layer_past in zip(self.blocks, past_key_values):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+ outputs = self._gradient_checkpointing_func(
+ block.__call__,
+ hidden_states,
+ alibi,
+ causal_mask,
+ layer_past,
+ use_cache,
+ output_attentions,
+ )
+ else:
+ outputs = block(
+ hidden_states,
+ layer_past=layer_past,
+ attention_mask=causal_mask,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ position_bias=alibi,
+ )
+
+ hidden_states = outputs[0]
+ if use_cache is True:
+ presents = presents + (outputs[1],)
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
+
+ # Add last hidden state
+ hidden_states = self.norm_f(hidden_states)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
+
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=presents,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
+ embeddings).
+ """,
+ MPT_START_DOCSTRING,
+)
+class MptForCausalLM(MptPreTrainedModel, GenerationMixin):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config: MptConfig):
+ super().__init__(config)
+ self.transformer = MptModel(config)
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings: torch.Tensor):
+ self.lm_head = new_embeddings
+
+ @add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=CausalLMOutputWithCrossAttentions,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+ `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+ are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.transformer(
+ input_ids,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+
+ lm_logits = self.lm_head(hidden_states)
+
+ loss = None
+ if labels is not None:
+ # move labels to correct device to enable model parallelism
+ labels = labels.to(lm_logits.device)
+ # Flatten the tokens
+ loss = self.loss_function(
+ lm_logits,
+ labels,
+ vocab_size=self.config.vocab_size,
+ **kwargs,
+ )
+
+ if not return_dict:
+ output = (lm_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return CausalLMOutputWithCrossAttentions(
+ loss=loss,
+ logits=lm_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+ def _reorder_cache(
+ self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
+ ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
+ """
+ This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
+ [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
+ beam_idx at every generation step.
+
+ Output shares the same memory storage as `past`.
+ """
+ # Get a copy of `beam_idx` on all the devices where we need those indices.
+ device_to_beam_idx = {
+ past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past
+ }
+ reordered_past = tuple(
+ (
+ layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]),
+ layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]),
+ )
+ for layer_past in past
+ )
+ return reordered_past
+
+
+@add_start_docstrings(
+ """
+ The MPT Model transformer with a sequence classification head on top (linear layer).
+
+ [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+ (e.g. GPT-1) do.
+
+ Since it does classification on the last token, it requires to know the position of the last token. If a
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+ each row of the batch).
+ """,
+ MPT_START_DOCSTRING,
+)
+class MptForSequenceClassification(MptPreTrainedModel):
+ def __init__(self, config: MptConfig):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.transformer = MptModel(config)
+ self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=SequenceClassifierOutputWithPast,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutputWithPast]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.transformer(
+ input_ids,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size = input_ids.shape[0]
+ else:
+ batch_size = inputs_embeds.shape[0]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
+ sequence_lengths = sequence_lengths.to(logits.device)
+ else:
+ sequence_lengths = -1
+ logger.warning_once(
+ f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+ "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+ )
+
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+ loss = None
+ if labels is not None:
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ MPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+ Named-Entity-Recognition (NER) tasks.
+ """,
+ MPT_START_DOCSTRING,
+)
+class MptForTokenClassification(MptPreTrainedModel):
+ def __init__(self, config: MptConfig):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.transformer = MptModel(config)
+ if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
+ classifier_dropout = config.classifier_dropout
+ elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
+ classifier_dropout = config.hidden_dropout
+ else:
+ classifier_dropout = 0.1
+ self.dropout = nn.Dropout(classifier_dropout)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **deprecated_arguments,
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.transformer(
+ input_ids,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = transformer_outputs[0]
+ hidden_states = self.dropout(hidden_states)
+ logits = self.classifier(hidden_states)
+
+ loss = None
+ if labels is not None:
+ # move labels to correct device to enable model parallelism
+ labels = labels.to(logits.device)
+ batch_size, seq_length = labels.shape
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(
+ logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
+ )
+
+ if not return_dict:
+ output = (logits,) + transformer_outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ The MPT Model transformer with a span classification head on top for extractive question-answering tasks like SQuAD
+ (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+ """,
+ MPT_START_DOCSTRING,
+)
+class MptForQuestionAnswering(MptPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.transformer = MptModel(config)
+ self.qa_outputs = nn.Linear(config.hidden_size, 2)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ start_positions: Optional[torch.LongTensor] = None,
+ end_positions: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
+ r"""
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ are not taken into account for computing the loss.
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ are not taken into account for computing the loss.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.transformer(
+ input_ids,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ logits = self.qa_outputs(sequence_output)
+ start_logits, end_logits = logits.split(1, dim=-1)
+ start_logits = start_logits.squeeze(-1).contiguous()
+ end_logits = end_logits.squeeze(-1).contiguous()
+
+ total_loss = None
+ if start_positions is not None and end_positions is not None:
+ # If we are on multi-GPU, split add a dimension
+ if len(start_positions.size()) > 1:
+ start_positions = start_positions.squeeze(-1)
+ if len(end_positions.size()) > 1:
+ end_positions = end_positions.squeeze(-1)
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
+ ignored_index = start_logits.size(1)
+ start_positions = start_positions.clamp(0, ignored_index)
+ end_positions = end_positions.clamp(0, ignored_index)
+
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+ start_loss = loss_fct(start_logits, start_positions)
+ end_loss = loss_fct(end_logits, end_positions)
+ total_loss = (start_loss + end_loss) / 2
+
+ if not return_dict:
+ output = (start_logits, end_logits) + outputs[2:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return QuestionAnsweringModelOutput(
+ loss=total_loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = [
+ "MptForCausalLM",
+ "MptModel",
+ "MptPreTrainedModel",
+ "MptForSequenceClassification",
+ "MptForTokenClassification",
+ "MptForQuestionAnswering",
+]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/olmo/__init__.py b/.venv/lib/python3.11/site-packages/transformers/models/olmo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b94350cd33104797e7103fa0ef39a7c35c190f8f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/olmo/__init__.py
@@ -0,0 +1,59 @@
+# Copyright 2024 EleutherAI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ _LazyModule,
+ is_sentencepiece_available,
+ is_tokenizers_available,
+ is_torch_available,
+)
+
+
+_import_structure = {
+ "configuration_olmo": ["OlmoConfig"],
+}
+
+try:
+ if not is_torch_available():
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ pass
+else:
+ _import_structure["modeling_olmo"] = [
+ "OlmoForCausalLM",
+ "OlmoModel",
+ "OlmoPreTrainedModel",
+ ]
+
+if TYPE_CHECKING:
+ from .configuration_olmo import OlmoConfig
+
+ try:
+ if not is_torch_available():
+ raise OptionalDependencyNotAvailable()
+ except OptionalDependencyNotAvailable:
+ pass
+ else:
+ from .modeling_olmo import (
+ OlmoForCausalLM,
+ OlmoModel,
+ OlmoPreTrainedModel,
+ )
+
+else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/olmo/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/olmo/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8c631471002b3872f18d1b151be48299d1ec763
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/olmo/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/olmo/__pycache__/configuration_olmo.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/olmo/__pycache__/configuration_olmo.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0c97fc4e9fdbf628de8f8f672e687f130652f33
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/olmo/__pycache__/configuration_olmo.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/olmo/__pycache__/modeling_olmo.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/olmo/__pycache__/modeling_olmo.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa570b83012ada7b699566356ed97c1c46e809ea
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/olmo/__pycache__/modeling_olmo.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/olmo/__pycache__/modular_olmo.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/olmo/__pycache__/modular_olmo.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bccbfb5436668eabefee80ce92058cc0606cb7f6
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/olmo/__pycache__/modular_olmo.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/olmo/configuration_olmo.py b/.venv/lib/python3.11/site-packages/transformers/models/olmo/configuration_olmo.py
new file mode 100644
index 0000000000000000000000000000000000000000..77a3b18e364ecf3af3f60d02bd7ef77ebb3de829
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/olmo/configuration_olmo.py
@@ -0,0 +1,181 @@
+# coding=utf-8
+# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""OLMo model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class OlmoConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`OlmoModel`]. It is used to instantiate an OLMo
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the [allenai/OLMo-7B-hf](https://huggingface.co/allenai/OLMo-7B-hf).
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 50304):
+ Vocabulary size of the OLMo model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`OlmoModel`]
+ hidden_size (`int`, *optional*, defaults to 4096):
+ Dimension of the hidden representations.
+ intermediate_size (`int`, *optional*, defaults to 11008):
+ Dimension of the MLP representations.
+ num_hidden_layers (`int`, *optional*, defaults to 32):
+ Number of hidden layers in the Transformer decoder.
+ num_attention_heads (`int`, *optional*, defaults to 32):
+ Number of attention heads for each attention layer in the Transformer decoder.
+ num_key_value_heads (`int`, *optional*):
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+ by meanpooling all the original heads within that group. For more details checkout [this
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+ `num_attention_heads`.
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+ The non-linear activation function (function or string) in the decoder.
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
+ The maximum sequence length that this model might ever be used with.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ pad_token_id (`int`, *optional*, defaults to 1):
+ Padding token id.
+ bos_token_id (`int`, *optional*):
+ Beginning of stream token id.
+ eos_token_id (`int`, *optional*, defaults to 50279):
+ End of stream token id.
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+ Whether to tie weight embeddings
+ rope_theta (`float`, *optional*, defaults to 10000.0):
+ The base period of the RoPE embeddings.
+ rope_scaling (`Dict`, *optional*):
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+ `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+ these scaling strategies behave:
+ https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+ experimental feature, subject to breaking API changes in future versions.
+ attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ clip_qkv (`float`, *optional*):
+ If not `None`, elements of query, key and value attention states are clipped so that their
+ absolute value does not exceed this value.
+
+ ```python
+ >>> from transformers import OlmoModel, OlmoConfig
+
+ >>> # Initializing a OLMo 7B style configuration
+ >>> configuration = OlmoConfig()
+
+ >>> # Initializing a model from the OLMo 7B style configuration
+ >>> model = OlmoModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "olmo"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ vocab_size=50304,
+ hidden_size=4096,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ hidden_act="silu",
+ max_position_embeddings=2048,
+ initializer_range=0.02,
+ use_cache=True,
+ pad_token_id=1,
+ bos_token_id=None,
+ eos_token_id=50279,
+ tie_word_embeddings=False,
+ rope_theta=10000.0,
+ rope_scaling=None,
+ attention_bias=False,
+ attention_dropout=0.0,
+ clip_qkv=None,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+
+ # for backward compatibility
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self._rope_scaling_validation()
+ self.attention_bias = attention_bias
+ self.attention_dropout = attention_dropout
+ self.clip_qkv = clip_qkv
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ def _rope_scaling_validation(self):
+ """
+ Validate the `rope_scaling` configuration.
+ """
+ if self.rope_scaling is None:
+ return
+
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+ raise ValueError(
+ "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
+ )
+ rope_scaling_type = self.rope_scaling.get("type", None)
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+ raise ValueError(
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+ )
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+ raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/olmo/modeling_olmo.py b/.venv/lib/python3.11/site-packages/transformers/models/olmo/modeling_olmo.py
new file mode 100644
index 0000000000000000000000000000000000000000..34e9f7259cdfff1299f2da9f330779879fde0032
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/olmo/modeling_olmo.py
@@ -0,0 +1,842 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/olmo/modular_olmo.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_olmo.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+ LossKwargs,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_olmo import OlmoConfig
+
+
+logger = logging.get_logger(__name__)
+_CONFIG_FOR_DOC = "OlmoConfig"
+
+
+class OlmoLayerNorm(nn.Module):
+ """LayerNorm but with no learnable weight or bias."""
+
+ def __init__(self, hidden_size: int) -> None:
+ super().__init__()
+ self.normalized_shape = (hidden_size,)
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ orig_dtype = hidden_states.dtype
+ return F.layer_norm(hidden_states.to(dtype=torch.float32), self.normalized_shape, None, None, eps=1e-5).to(
+ orig_dtype
+ )
+
+
+class OlmoMLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+ return down_proj
+
+
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+ """Applies Rotary Position Embedding to the query and key tensors.
+
+ Args:
+ q (`torch.Tensor`): The query tensor.
+ k (`torch.Tensor`): The key tensor.
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
+ position_ids (`torch.Tensor`, *optional*):
+ Deprecated and unused.
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+ Returns:
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+ """
+ cos = cos.unsqueeze(unsqueeze_dim)
+ sin = sin.unsqueeze(unsqueeze_dim)
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+ module: nn.Module,
+ query: torch.Tensor,
+ key: torch.Tensor,
+ value: torch.Tensor,
+ attention_mask: Optional[torch.Tensor],
+ scaling: float,
+ dropout: float = 0.0,
+ **kwargs,
+):
+ key_states = repeat_kv(key, module.num_key_value_groups)
+ value_states = repeat_kv(value, module.num_key_value_groups)
+
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+ if attention_mask is not None:
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+ attn_weights = attn_weights + causal_mask
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+ attn_output = torch.matmul(attn_weights, value_states)
+ attn_output = attn_output.transpose(1, 2).contiguous()
+
+ return attn_output, attn_weights
+
+
+class OlmoAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: OlmoConfig, layer_idx: int):
+ super().__init__()
+ self.config = config
+ self.layer_idx = layer_idx
+ self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+ self.scaling = self.head_dim**-0.5
+ self.attention_dropout = config.attention_dropout
+ self.is_causal = True
+
+ self.q_proj = nn.Linear(
+ config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+ )
+ self.k_proj = nn.Linear(
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+ )
+ self.v_proj = nn.Linear(
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+ )
+ self.o_proj = nn.Linear(
+ config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+ attention_mask: Optional[torch.Tensor],
+ past_key_value: Optional[Cache] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ input_shape = hidden_states.shape[:-1]
+ hidden_shape = (*input_shape, -1, self.head_dim)
+
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ if self.config.clip_qkv is not None:
+ query_states.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
+ key_states.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
+ value_states.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
+
+ query_states = query_states.view(hidden_shape).transpose(1, 2)
+ key_states = key_states.view(hidden_shape).transpose(1, 2)
+ value_states = value_states.view(hidden_shape).transpose(1, 2)
+
+ cos, sin = position_embeddings
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ attention_interface: Callable = eager_attention_forward
+ if self.config._attn_implementation != "eager":
+ if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
+ logger.warning_once(
+ "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+ 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ else:
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+ attn_output, attn_weights = attention_interface(
+ self,
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ dropout=0.0 if not self.training else self.attention_dropout,
+ scaling=self.scaling,
+ **kwargs,
+ )
+
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+ attn_output = self.o_proj(attn_output)
+ return attn_output, attn_weights
+
+
+class OlmoDecoderLayer(nn.Module):
+ def __init__(self, config: OlmoConfig, layer_idx: int):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.self_attn = OlmoAttention(config=config, layer_idx=layer_idx)
+
+ self.mlp = OlmoMLP(config)
+ self.input_layernorm = OlmoLayerNorm(config.hidden_size)
+ self.post_attention_layernorm = OlmoLayerNorm(config.hidden_size)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
+ **kwargs: Unpack[FlashAttentionKwargs],
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ position_embeddings=position_embeddings,
+ **kwargs,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ return outputs
+
+
+class OlmoRotaryEmbedding(nn.Module):
+ def __init__(self, config: OlmoConfig, device=None):
+ super().__init__()
+ # BC: "rope_type" was originally "type"
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+ else:
+ self.rope_type = "default"
+ self.max_seq_len_cached = config.max_position_embeddings
+ self.original_max_seq_len = config.max_position_embeddings
+
+ self.config = config
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+ self.original_inv_freq = self.inv_freq
+
+ def _dynamic_frequency_update(self, position_ids, device):
+ """
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
+ 1 - growing beyond the cached sequence length (allow scaling)
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+ """
+ seq_len = torch.max(position_ids) + 1
+ if seq_len > self.max_seq_len_cached: # growth
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
+ self.max_seq_len_cached = seq_len
+
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
+ # This .to() is needed if the model has been moved to a device after being initialized (because
+ # the buffer is automatically moved, but not the original copy)
+ self.original_inv_freq = self.original_inv_freq.to(device)
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+ self.max_seq_len_cached = self.original_max_seq_len
+
+ @torch.no_grad()
+ def forward(self, x, position_ids):
+ if "dynamic" in self.rope_type:
+ self._dynamic_frequency_update(position_ids, device=x.device)
+
+ # Core RoPE block
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+ position_ids_expanded = position_ids[:, None, :].float()
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+ device_type = x.device.type
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+ with torch.autocast(device_type=device_type, enabled=False):
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ cos = emb.cos()
+ sin = emb.sin()
+
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+ cos = cos * self.attention_scaling
+ sin = sin * self.attention_scaling
+
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+OLMO_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`OlmoConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+ "The bare Olmo Model outputting raw hidden-states without any specific head on top.",
+ OLMO_START_DOCSTRING,
+)
+class OlmoPreTrainedModel(PreTrainedModel):
+ config_class = OlmoConfig
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["OlmoDecoderLayer"]
+ _skip_keys_device_placement = ["past_key_values"]
+ _supports_flash_attn_2 = True
+ _supports_sdpa = True
+ _supports_flex_attn = True
+ _supports_cache_class = True
+ _supports_quantized_cache = True
+ _supports_static_cache = True
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+
+OLMO_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+ `past_key_values`).
+
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ Two formats are allowed:
+ - a [`~cache_utils.Cache`] instance, see our
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+ cache format.
+
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+ legacy cache format will be returned.
+
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+ of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+ the complete sequence length.
+"""
+
+
+@add_start_docstrings(
+ "The bare Olmo Model outputting raw hidden-states without any specific head on top.",
+ OLMO_START_DOCSTRING,
+)
+class OlmoModel(OlmoPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OlmoDecoderLayer`]
+
+ Args:
+ config: OlmoConfig
+ """
+
+ def __init__(self, config: OlmoConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ self.layers = nn.ModuleList(
+ [OlmoDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+ )
+ self.norm = OlmoLayerNorm(config.hidden_size)
+ self.rotary_emb = OlmoRotaryEmbedding(config=config)
+ self.gradient_checkpointing = False
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ @add_start_docstrings_to_model_forward(OLMO_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if (input_ids is None) ^ (inputs_embeds is not None):
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+ if self.gradient_checkpointing and self.training and use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+ )
+ use_cache = False
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+
+ if use_cache and past_key_values is None:
+ past_key_values = DynamicCache()
+
+ if cache_position is None:
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+ cache_position = torch.arange(
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+ )
+
+ if position_ids is None:
+ position_ids = cache_position.unsqueeze(0)
+
+ causal_mask = self._update_causal_mask(
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+ )
+
+ hidden_states = inputs_embeds
+
+ # create position embeddings to be shared across the decoder layers
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ decoder_layer.__call__,
+ hidden_states,
+ causal_mask,
+ position_ids,
+ past_key_values,
+ output_attentions,
+ use_cache,
+ cache_position,
+ position_embeddings,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=causal_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_values,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ position_embeddings=position_embeddings,
+ **flash_attn_kwargs,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ output = BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=past_key_values if use_cache else None,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+ return output if return_dict else output.to_tuple()
+
+ def _update_causal_mask(
+ self,
+ attention_mask: torch.Tensor,
+ input_tensor: torch.Tensor,
+ cache_position: torch.Tensor,
+ past_key_values: Cache,
+ output_attentions: bool,
+ ):
+ if self.config._attn_implementation == "flash_attention_2":
+ if attention_mask is not None and (attention_mask == 0.0).any():
+ return attention_mask
+ return None
+
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+ # to infer the attention mask.
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+ using_static_cache = isinstance(past_key_values, StaticCache)
+
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+ if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
+ attention_mask,
+ inputs_embeds=input_tensor,
+ past_key_values_length=past_seen_tokens,
+ is_training=self.training,
+ ):
+ return None
+
+ dtype, device = input_tensor.dtype, input_tensor.device
+ sequence_length = input_tensor.shape[1]
+ if using_static_cache:
+ target_length = past_key_values.get_max_cache_shape()
+ else:
+ target_length = (
+ attention_mask.shape[-1]
+ if isinstance(attention_mask, torch.Tensor)
+ else past_seen_tokens + sequence_length + 1
+ )
+
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask,
+ sequence_length=sequence_length,
+ target_length=target_length,
+ dtype=dtype,
+ device=device,
+ cache_position=cache_position,
+ batch_size=input_tensor.shape[0],
+ )
+
+ if (
+ self.config._attn_implementation == "sdpa"
+ and attention_mask is not None
+ and attention_mask.device.type == "cuda"
+ and not output_attentions
+ ):
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+ # Details: https://github.com/pytorch/pytorch/issues/110213
+ min_dtype = torch.finfo(dtype).min
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+ return causal_mask
+
+ @staticmethod
+ def _prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask: torch.Tensor,
+ sequence_length: int,
+ target_length: int,
+ dtype: torch.dtype,
+ device: torch.device,
+ cache_position: torch.Tensor,
+ batch_size: int,
+ **kwargs,
+ ):
+ """
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+ Args:
+ attention_mask (`torch.Tensor`):
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+ `(batch_size, 1, query_length, key_value_length)`.
+ sequence_length (`int`):
+ The sequence length being processed.
+ target_length (`int`):
+ The target length: when generating with static cache, the mask should be as long as the static cache,
+ to account for the 0 padding, the part of the cache that is not filled yet.
+ dtype (`torch.dtype`):
+ The dtype to use for the 4D attention mask.
+ device (`torch.device`):
+ The device to plcae the 4D attention mask on.
+ cache_position (`torch.Tensor`):
+ Indices depicting the position of the input sequence tokens in the sequence.
+ batch_size (`torch.Tensor`):
+ Batch size.
+ """
+ if attention_mask is not None and attention_mask.dim() == 4:
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+ causal_mask = attention_mask
+ else:
+ min_dtype = torch.finfo(dtype).min
+ causal_mask = torch.full(
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+ )
+ if sequence_length != 1:
+ causal_mask = torch.triu(causal_mask, diagonal=1)
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+ if attention_mask is not None:
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
+ mask_length = attention_mask.shape[-1]
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+ padding_mask = padding_mask == 0
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+ padding_mask, min_dtype
+ )
+
+ return causal_mask
+
+
+class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
+
+
+class OlmoForCausalLM(OlmoPreTrainedModel, GenerationMixin):
+ _tied_weights_keys = ["lm_head.weight"]
+ _tp_plan = {"lm_head": "colwise_rep"}
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = OlmoModel(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ @add_start_docstrings_to_model_forward(OLMO_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ num_logits_to_keep: int = 0,
+ **kwargs: Unpack[KwargsForCausalLM],
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ num_logits_to_keep (`int`, *optional*):
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, OlmoForCausalLM
+
+ >>> model = OlmoForCausalLM.from_pretrained("meta-olmo/Olmo-2-7b-hf")
+ >>> tokenizer = AutoTokenizer.from_pretrained("meta-olmo/Olmo-2-7b-hf")
+
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ cache_position=cache_position,
+ **kwargs,
+ )
+
+ hidden_states = outputs[0]
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
+
+ loss = None
+ if labels is not None:
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/olmo/modular_olmo.py b/.venv/lib/python3.11/site-packages/transformers/models/olmo/modular_olmo.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a43e6f9c75d05a9e22fcf6ca59a32ea1bd61411
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/olmo/modular_olmo.py
@@ -0,0 +1,126 @@
+from typing import Callable, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+
+from ...cache_utils import Cache
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...utils import logging
+from ..llama.modeling_llama import (
+ LlamaAttention,
+ LlamaDecoderLayer,
+ LlamaForCausalLM,
+ LlamaMLP,
+ LlamaModel,
+ apply_rotary_pos_emb,
+ eager_attention_forward,
+)
+from .configuration_olmo import OlmoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class OlmoLayerNorm(nn.Module):
+ """LayerNorm but with no learnable weight or bias."""
+
+ def __init__(self, hidden_size: int) -> None:
+ super().__init__()
+ self.normalized_shape = (hidden_size,)
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ orig_dtype = hidden_states.dtype
+ return F.layer_norm(hidden_states.to(dtype=torch.float32), self.normalized_shape, None, None, eps=1e-5).to(
+ orig_dtype
+ )
+
+
+class OlmoMLP(LlamaMLP):
+ def __init__(self, config):
+ super().__init__(config)
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+
+
+class OlmoAttention(LlamaAttention):
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+ attention_mask: Optional[torch.Tensor],
+ past_key_value: Optional[Cache] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ input_shape = hidden_states.shape[:-1]
+ hidden_shape = (*input_shape, -1, self.head_dim)
+
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ if self.config.clip_qkv is not None:
+ query_states.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
+ key_states.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
+ value_states.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
+
+ query_states = query_states.view(hidden_shape).transpose(1, 2)
+ key_states = key_states.view(hidden_shape).transpose(1, 2)
+ value_states = value_states.view(hidden_shape).transpose(1, 2)
+
+ cos, sin = position_embeddings
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ attention_interface: Callable = eager_attention_forward
+ if self.config._attn_implementation != "eager":
+ if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
+ logger.warning_once(
+ "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+ 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ else:
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+ attn_output, attn_weights = attention_interface(
+ self,
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ dropout=0.0 if not self.training else self.attention_dropout,
+ scaling=self.scaling,
+ **kwargs,
+ )
+
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+ attn_output = self.o_proj(attn_output)
+ return attn_output, attn_weights
+
+
+class OlmoDecoderLayer(LlamaDecoderLayer):
+ def __init__(self, config: OlmoConfig, layer_idx: int):
+ super().__init__(config, layer_idx)
+ self.input_layernorm = OlmoLayerNorm(config.hidden_size)
+ self.post_attention_layernorm = OlmoLayerNorm(config.hidden_size)
+ self.self_attn = OlmoAttention(config=config, layer_idx=layer_idx)
+
+
+class OlmoModel(LlamaModel):
+ def __init__(self, config: OlmoConfig):
+ super().__init__(config)
+ self.layers = nn.ModuleList(
+ [OlmoDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+ )
+ self.norm = OlmoLayerNorm(config.hidden_size)
+
+
+class OlmoForCausalLM(LlamaForCausalLM):
+ pass
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__init__.py b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd1fac5b679dc5ca527dcc0a6a45b9a873daa3dc
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__init__.py
@@ -0,0 +1,33 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_rt_detr import *
+ from .configuration_rt_detr_resnet import *
+ from .image_processing_rt_detr import *
+ from .image_processing_rt_detr_fast import *
+ from .modeling_rt_detr import *
+ from .modeling_rt_detr_resnet import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5dc4e81d8c8d65d0499f2c72b39425963510537f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/configuration_rt_detr.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/configuration_rt_detr.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..830627a84bd61a1908a69630855062f6079c9651
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/configuration_rt_detr.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/configuration_rt_detr_resnet.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/configuration_rt_detr_resnet.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff09022995bd0268293c4b6f91ad79791238bea7
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/configuration_rt_detr_resnet.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/image_processing_rt_detr.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/image_processing_rt_detr.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8fc56daa248e7f93ad1cb0130b855079af7fb97c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/image_processing_rt_detr.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/image_processing_rt_detr_fast.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/image_processing_rt_detr_fast.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b54621ac3fcf1965bd2048e85d26a2ebf76364b4
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/image_processing_rt_detr_fast.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/modeling_rt_detr_resnet.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/modeling_rt_detr_resnet.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ebd552dd58c6aa50f6eb9ebf1e785ae69a02f8f5
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/modeling_rt_detr_resnet.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/modular_rt_detr.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/modular_rt_detr.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36a8c0141dd0814438ee43b00a822226d43fd198
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/__pycache__/modular_rt_detr.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/configuration_rt_detr.py b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/configuration_rt_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dc8ea550ad9e40d70da5049071d2f5dbcac2ab0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/configuration_rt_detr.py
@@ -0,0 +1,364 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""RT-DETR model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import verify_backbone_config_arguments
+from ..auto import CONFIG_MAPPING
+from .configuration_rt_detr_resnet import RTDetrResNetConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class RTDetrConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`RTDetrModel`]. It is used to instantiate a
+ RT-DETR model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the RT-DETR
+ [checkpoing/todo](https://huggingface.co/checkpoing/todo) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ initializer_range (`float`, *optional*, defaults to 0.01):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_bias_prior_prob (`float`, *optional*):
+ The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`.
+ If `None`, `prior_prob` computed as `prior_prob = 1 / (num_labels + 1)` while initializing model weights.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+ The epsilon used by the layer normalization layers.
+ batch_norm_eps (`float`, *optional*, defaults to 1e-05):
+ The epsilon used by the batch normalization layers.
+ backbone_config (`Dict`, *optional*, defaults to `RTDetrResNetConfig()`):
+ The configuration of the backbone model.
+ backbone (`str`, *optional*):
+ Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+ will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+ is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+ use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+ Whether to use pretrained weights for the backbone.
+ use_timm_backbone (`bool`, *optional*, defaults to `False`):
+ Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
+ library.
+ freeze_backbone_batch_norms (`bool`, *optional*, defaults to `True`):
+ Whether to freeze the batch normalization layers in the backbone.
+ backbone_kwargs (`dict`, *optional*):
+ Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+ e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+ encoder_hidden_dim (`int`, *optional*, defaults to 256):
+ Dimension of the layers in hybrid encoder.
+ encoder_in_channels (`list`, *optional*, defaults to `[512, 1024, 2048]`):
+ Multi level features input for encoder.
+ feat_strides (`List[int]`, *optional*, defaults to `[8, 16, 32]`):
+ Strides used in each feature map.
+ encoder_layers (`int`, *optional*, defaults to 1):
+ Total of layers to be used by the encoder.
+ encoder_ffn_dim (`int`, *optional*, defaults to 1024):
+ Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+ encoder_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ dropout (`float`, *optional*, defaults to 0.0):
+ The ratio for all dropout layers.
+ activation_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for activations inside the fully connected layer.
+ encode_proj_layers (`List[int]`, *optional*, defaults to `[2]`):
+ Indexes of the projected layers to be used in the encoder.
+ positional_encoding_temperature (`int`, *optional*, defaults to 10000):
+ The temperature parameter used to create the positional encodings.
+ encoder_activation_function (`str`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ activation_function (`str`, *optional*, defaults to `"silu"`):
+ The non-linear activation function (function or string) in the general layer. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ eval_size (`Tuple[int, int]`, *optional*):
+ Height and width used to computes the effective height and width of the position embeddings after taking
+ into account the stride.
+ normalize_before (`bool`, *optional*, defaults to `False`):
+ Determine whether to apply layer normalization in the transformer encoder layer before self-attention and
+ feed-forward modules.
+ hidden_expansion (`float`, *optional*, defaults to 1.0):
+ Expansion ratio to enlarge the dimension size of RepVGGBlock and CSPRepLayer.
+ d_model (`int`, *optional*, defaults to 256):
+ Dimension of the layers exclude hybrid encoder.
+ num_queries (`int`, *optional*, defaults to 300):
+ Number of object queries.
+ decoder_in_channels (`list`, *optional*, defaults to `[256, 256, 256]`):
+ Multi level features dimension for decoder
+ decoder_ffn_dim (`int`, *optional*, defaults to 1024):
+ Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+ num_feature_levels (`int`, *optional*, defaults to 3):
+ The number of input feature levels.
+ decoder_n_points (`int`, *optional*, defaults to 4):
+ The number of sampled keys in each feature level for each attention head in the decoder.
+ decoder_layers (`int`, *optional*, defaults to 6):
+ Number of decoder layers.
+ decoder_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer decoder.
+ decoder_activation_function (`str`, *optional*, defaults to `"relu"`):
+ The non-linear activation function (function or string) in the decoder. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ num_denoising (`int`, *optional*, defaults to 100):
+ The total number of denoising tasks or queries to be used for contrastive denoising.
+ label_noise_ratio (`float`, *optional*, defaults to 0.5):
+ The fraction of denoising labels to which random noise should be added.
+ box_noise_scale (`float`, *optional*, defaults to 1.0):
+ Scale or magnitude of noise to be added to the bounding boxes.
+ learn_initial_query (`bool`, *optional*, defaults to `False`):
+ Indicates whether the initial query embeddings for the decoder should be learned during training
+ anchor_image_size (`Tuple[int, int]`, *optional*):
+ Height and width of the input image used during evaluation to generate the bounding box anchors. If None, automatic generate anchor is applied.
+ disable_custom_kernels (`bool`, *optional*, defaults to `True`):
+ Whether to disable custom kernels.
+ with_box_refine (`bool`, *optional*, defaults to `True`):
+ Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes
+ based on the predictions from the previous layer.
+ is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+ Whether the architecture has an encoder decoder structure.
+ matcher_alpha (`float`, *optional*, defaults to 0.25):
+ Parameter alpha used by the Hungarian Matcher.
+ matcher_gamma (`float`, *optional*, defaults to 2.0):
+ Parameter gamma used by the Hungarian Matcher.
+ matcher_class_cost (`float`, *optional*, defaults to 2.0):
+ The relative weight of the class loss used by the Hungarian Matcher.
+ matcher_bbox_cost (`float`, *optional*, defaults to 5.0):
+ The relative weight of the bounding box loss used by the Hungarian Matcher.
+ matcher_giou_cost (`float`, *optional*, defaults to 2.0):
+ The relative weight of the giou loss of used by the Hungarian Matcher.
+ use_focal_loss (`bool`, *optional*, defaults to `True`):
+ Parameter informing if focal focal should be used.
+ auxiliary_loss (`bool`, *optional*, defaults to `True`):
+ Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
+ focal_loss_alpha (`float`, *optional*, defaults to 0.75):
+ Parameter alpha used to compute the focal loss.
+ focal_loss_gamma (`float`, *optional*, defaults to 2.0):
+ Parameter gamma used to compute the focal loss.
+ weight_loss_vfl (`float`, *optional*, defaults to 1.0):
+ Relative weight of the varifocal loss in the object detection loss.
+ weight_loss_bbox (`float`, *optional*, defaults to 5.0):
+ Relative weight of the L1 bounding box loss in the object detection loss.
+ weight_loss_giou (`float`, *optional*, defaults to 2.0):
+ Relative weight of the generalized IoU loss in the object detection loss.
+ eos_coefficient (`float`, *optional*, defaults to 0.0001):
+ Relative classification weight of the 'no-object' class in the object detection loss.
+
+ Examples:
+
+ ```python
+ >>> from transformers import RTDetrConfig, RTDetrModel
+
+ >>> # Initializing a RT-DETR configuration
+ >>> configuration = RTDetrConfig()
+
+ >>> # Initializing a model (with random weights) from the configuration
+ >>> model = RTDetrModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "rt_detr"
+ layer_types = ["basic", "bottleneck"]
+ attribute_map = {
+ "hidden_size": "d_model",
+ "num_attention_heads": "encoder_attention_heads",
+ }
+
+ def __init__(
+ self,
+ initializer_range=0.01,
+ initializer_bias_prior_prob=None,
+ layer_norm_eps=1e-5,
+ batch_norm_eps=1e-5,
+ # backbone
+ backbone_config=None,
+ backbone=None,
+ use_pretrained_backbone=False,
+ use_timm_backbone=False,
+ freeze_backbone_batch_norms=True,
+ backbone_kwargs=None,
+ # encoder HybridEncoder
+ encoder_hidden_dim=256,
+ encoder_in_channels=[512, 1024, 2048],
+ feat_strides=[8, 16, 32],
+ encoder_layers=1,
+ encoder_ffn_dim=1024,
+ encoder_attention_heads=8,
+ dropout=0.0,
+ activation_dropout=0.0,
+ encode_proj_layers=[2],
+ positional_encoding_temperature=10000,
+ encoder_activation_function="gelu",
+ activation_function="silu",
+ eval_size=None,
+ normalize_before=False,
+ hidden_expansion=1.0,
+ # decoder RTDetrTransformer
+ d_model=256,
+ num_queries=300,
+ decoder_in_channels=[256, 256, 256],
+ decoder_ffn_dim=1024,
+ num_feature_levels=3,
+ decoder_n_points=4,
+ decoder_layers=6,
+ decoder_attention_heads=8,
+ decoder_activation_function="relu",
+ attention_dropout=0.0,
+ num_denoising=100,
+ label_noise_ratio=0.5,
+ box_noise_scale=1.0,
+ learn_initial_query=False,
+ anchor_image_size=None,
+ disable_custom_kernels=True,
+ with_box_refine=True,
+ is_encoder_decoder=True,
+ # Loss
+ matcher_alpha=0.25,
+ matcher_gamma=2.0,
+ matcher_class_cost=2.0,
+ matcher_bbox_cost=5.0,
+ matcher_giou_cost=2.0,
+ use_focal_loss=True,
+ auxiliary_loss=True,
+ focal_loss_alpha=0.75,
+ focal_loss_gamma=2.0,
+ weight_loss_vfl=1.0,
+ weight_loss_bbox=5.0,
+ weight_loss_giou=2.0,
+ eos_coefficient=1e-4,
+ **kwargs,
+ ):
+ self.initializer_range = initializer_range
+ self.initializer_bias_prior_prob = initializer_bias_prior_prob
+ self.layer_norm_eps = layer_norm_eps
+ self.batch_norm_eps = batch_norm_eps
+ # backbone
+ if backbone_config is None and backbone is None:
+ logger.info(
+ "`backbone_config` and `backbone` are `None`. Initializing the config with the default `RTDetr-ResNet` backbone."
+ )
+ backbone_config = RTDetrResNetConfig(
+ num_channels=3,
+ embedding_size=64,
+ hidden_sizes=[256, 512, 1024, 2048],
+ depths=[3, 4, 6, 3],
+ layer_type="bottleneck",
+ hidden_act="relu",
+ downsample_in_first_stage=False,
+ downsample_in_bottleneck=False,
+ out_features=None,
+ out_indices=[2, 3, 4],
+ )
+ elif isinstance(backbone_config, dict):
+ backbone_model_type = backbone_config.pop("model_type")
+ config_class = CONFIG_MAPPING[backbone_model_type]
+ backbone_config = config_class.from_dict(backbone_config)
+
+ verify_backbone_config_arguments(
+ use_timm_backbone=use_timm_backbone,
+ use_pretrained_backbone=use_pretrained_backbone,
+ backbone=backbone,
+ backbone_config=backbone_config,
+ backbone_kwargs=backbone_kwargs,
+ )
+
+ self.backbone_config = backbone_config
+ self.backbone = backbone
+ self.use_pretrained_backbone = use_pretrained_backbone
+ self.use_timm_backbone = use_timm_backbone
+ self.freeze_backbone_batch_norms = freeze_backbone_batch_norms
+ self.backbone_kwargs = backbone_kwargs
+ # encoder
+ self.encoder_hidden_dim = encoder_hidden_dim
+ self.encoder_in_channels = encoder_in_channels
+ self.feat_strides = feat_strides
+ self.encoder_attention_heads = encoder_attention_heads
+ self.encoder_ffn_dim = encoder_ffn_dim
+ self.dropout = dropout
+ self.activation_dropout = activation_dropout
+ self.encode_proj_layers = encode_proj_layers
+ self.encoder_layers = encoder_layers
+ self.positional_encoding_temperature = positional_encoding_temperature
+ self.eval_size = eval_size
+ self.normalize_before = normalize_before
+ self.encoder_activation_function = encoder_activation_function
+ self.activation_function = activation_function
+ self.hidden_expansion = hidden_expansion
+ # decoder
+ self.d_model = d_model
+ self.num_queries = num_queries
+ self.decoder_ffn_dim = decoder_ffn_dim
+ self.decoder_in_channels = decoder_in_channels
+ self.num_feature_levels = num_feature_levels
+ self.decoder_n_points = decoder_n_points
+ self.decoder_layers = decoder_layers
+ self.decoder_attention_heads = decoder_attention_heads
+ self.decoder_activation_function = decoder_activation_function
+ self.attention_dropout = attention_dropout
+ self.num_denoising = num_denoising
+ self.label_noise_ratio = label_noise_ratio
+ self.box_noise_scale = box_noise_scale
+ self.learn_initial_query = learn_initial_query
+ self.anchor_image_size = anchor_image_size
+ self.auxiliary_loss = auxiliary_loss
+ self.disable_custom_kernels = disable_custom_kernels
+ self.with_box_refine = with_box_refine
+ # Loss
+ self.matcher_alpha = matcher_alpha
+ self.matcher_gamma = matcher_gamma
+ self.matcher_class_cost = matcher_class_cost
+ self.matcher_bbox_cost = matcher_bbox_cost
+ self.matcher_giou_cost = matcher_giou_cost
+ self.use_focal_loss = use_focal_loss
+ self.focal_loss_alpha = focal_loss_alpha
+ self.focal_loss_gamma = focal_loss_gamma
+ self.weight_loss_vfl = weight_loss_vfl
+ self.weight_loss_bbox = weight_loss_bbox
+ self.weight_loss_giou = weight_loss_giou
+ self.eos_coefficient = eos_coefficient
+ super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+ @property
+ def num_attention_heads(self) -> int:
+ return self.encoder_attention_heads
+
+ @property
+ def hidden_size(self) -> int:
+ return self.d_model
+
+ @classmethod
+ def from_backbone_configs(cls, backbone_config: PretrainedConfig, **kwargs):
+ """Instantiate a [`RTDetrConfig`] (or a derived class) from a pre-trained backbone model configuration and DETR model
+ configuration.
+
+ Args:
+ backbone_config ([`PretrainedConfig`]):
+ The backbone configuration.
+
+ Returns:
+ [`RTDetrConfig`]: An instance of a configuration object
+ """
+ return cls(
+ backbone_config=backbone_config,
+ **kwargs,
+ )
+
+
+__all__ = ["RTDetrConfig"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/configuration_rt_detr_resnet.py b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/configuration_rt_detr_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..35e74cd2f1819cf470f50b2c151e4e8406b7839c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/configuration_rt_detr_resnet.py
@@ -0,0 +1,114 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""RT-DETR ResNet model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class RTDetrResNetConfig(BackboneConfigMixin, PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`RTDetrResnetBackbone`]. It is used to instantiate an
+ ResNet model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the ResNet
+ [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ embedding_size (`int`, *optional*, defaults to 64):
+ Dimensionality (hidden size) for the embedding layer.
+ hidden_sizes (`List[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`):
+ Dimensionality (hidden size) at each stage.
+ depths (`List[int]`, *optional*, defaults to `[3, 4, 6, 3]`):
+ Depth (number of layers) for each stage.
+ layer_type (`str`, *optional*, defaults to `"bottleneck"`):
+ The layer to use, it can be either `"basic"` (used for smaller models, like resnet-18 or resnet-34) or
+ `"bottleneck"` (used for larger models like resnet-50 and above).
+ hidden_act (`str`, *optional*, defaults to `"relu"`):
+ The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"`
+ are supported.
+ downsample_in_first_stage (`bool`, *optional*, defaults to `False`):
+ If `True`, the first stage will downsample the inputs using a `stride` of 2.
+ downsample_in_bottleneck (`bool`, *optional*, defaults to `False`):
+ If `True`, the first conv 1x1 in ResNetBottleNeckLayer will downsample the inputs using a `stride` of 2.
+ out_features (`List[str]`, *optional*):
+ If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+ (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+ corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+ same order as defined in the `stage_names` attribute.
+ out_indices (`List[int]`, *optional*):
+ If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+ many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+ If unset and `out_features` is unset, will default to the last stage. Must be in the
+ same order as defined in the `stage_names` attribute.
+
+ Example:
+ ```python
+ >>> from transformers import RTDetrResNetConfig, RTDetrResnetBackbone
+
+ >>> # Initializing a ResNet resnet-50 style configuration
+ >>> configuration = RTDetrResNetConfig()
+
+ >>> # Initializing a model (with random weights) from the resnet-50 style configuration
+ >>> model = RTDetrResnetBackbone(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```
+ """
+
+ model_type = "rt_detr_resnet"
+ layer_types = ["basic", "bottleneck"]
+
+ def __init__(
+ self,
+ num_channels=3,
+ embedding_size=64,
+ hidden_sizes=[256, 512, 1024, 2048],
+ depths=[3, 4, 6, 3],
+ layer_type="bottleneck",
+ hidden_act="relu",
+ downsample_in_first_stage=False,
+ downsample_in_bottleneck=False,
+ out_features=None,
+ out_indices=None,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ if layer_type not in self.layer_types:
+ raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}")
+ self.num_channels = num_channels
+ self.embedding_size = embedding_size
+ self.hidden_sizes = hidden_sizes
+ self.depths = depths
+ self.layer_type = layer_type
+ self.hidden_act = hidden_act
+ self.downsample_in_first_stage = downsample_in_first_stage
+ self.downsample_in_bottleneck = downsample_in_bottleneck
+ self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+ self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+ out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+ )
+
+
+__all__ = ["RTDetrResNetConfig"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/image_processing_rt_detr.py b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/image_processing_rt_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3e75972cc655c7387c3d731f7815ae1764a913d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/image_processing_rt_detr.py
@@ -0,0 +1,1102 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for RT-DETR."""
+
+import pathlib
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_processing_utils import BaseImageProcessor, get_size_dict
+from ...image_transforms import (
+ PaddingMode,
+ center_to_corners_format,
+ corners_to_center_format,
+ pad,
+ rescale,
+ resize,
+ to_channel_dimension_format,
+)
+from ...image_utils import (
+ IMAGENET_DEFAULT_MEAN,
+ IMAGENET_DEFAULT_STD,
+ AnnotationFormat,
+ AnnotationType,
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ get_image_size,
+ infer_channel_dimension_format,
+ is_scaled_image,
+ make_list_of_images,
+ to_numpy_array,
+ valid_images,
+ validate_annotations,
+ validate_preprocess_arguments,
+)
+from ...utils import (
+ filter_out_non_signature_kwargs,
+ is_flax_available,
+ is_jax_tensor,
+ is_tf_available,
+ is_tf_tensor,
+ is_torch_available,
+ is_torch_tensor,
+ logging,
+ requires_backends,
+)
+from ...utils.generic import TensorType
+
+
+if is_torch_available():
+ import torch
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,)
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
+def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
+ """
+ Computes the output image size given the input image size and the desired output size.
+
+ Args:
+ image_size (`Tuple[int, int]`):
+ The input image size.
+ size (`int`):
+ The desired output size.
+ max_size (`int`, *optional*):
+ The maximum allowed output size.
+ """
+ height, width = image_size
+ raw_size = None
+ if max_size is not None:
+ min_original_size = float(min((height, width)))
+ max_original_size = float(max((height, width)))
+ if max_original_size / min_original_size * size > max_size:
+ raw_size = max_size * min_original_size / max_original_size
+ size = int(round(raw_size))
+
+ if (height <= width and height == size) or (width <= height and width == size):
+ oh, ow = height, width
+ elif width < height:
+ ow = size
+ if max_size is not None and raw_size is not None:
+ oh = int(raw_size * height / width)
+ else:
+ oh = int(size * height / width)
+ else:
+ oh = size
+ if max_size is not None and raw_size is not None:
+ ow = int(raw_size * width / height)
+ else:
+ ow = int(size * width / height)
+
+ return (oh, ow)
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
+def get_resize_output_image_size(
+ input_image: np.ndarray,
+ size: Union[int, Tuple[int, int], List[int]],
+ max_size: Optional[int] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+ """
+ Computes the output image size given the input image size and the desired output size. If the desired output size
+ is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
+ image size is computed by keeping the aspect ratio of the input image size.
+
+ Args:
+ input_image (`np.ndarray`):
+ The image to resize.
+ size (`int` or `Tuple[int, int]` or `List[int]`):
+ The desired output size.
+ max_size (`int`, *optional*):
+ The maximum allowed output size.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+ """
+ image_size = get_image_size(input_image, input_data_format)
+ if isinstance(size, (list, tuple)):
+ return size
+
+ return get_size_with_aspect_ratio(image_size, size, max_size)
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
+def get_image_size_for_max_height_width(
+ input_image: np.ndarray,
+ max_height: int,
+ max_width: int,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+ """
+ Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio.
+ Important, even if image_height < max_height and image_width < max_width, the image will be resized
+ to at least one of the edges be equal to max_height or max_width.
+ For example:
+ - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+ - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+ Args:
+ input_image (`np.ndarray`):
+ The image to resize.
+ max_height (`int`):
+ The maximum allowed height.
+ max_width (`int`):
+ The maximum allowed width.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+ """
+ image_size = get_image_size(input_image, input_data_format)
+ height, width = image_size
+ height_scale = max_height / height
+ width_scale = max_width / width
+ min_scale = min(height_scale, width_scale)
+ new_height = int(height * min_scale)
+ new_width = int(width * min_scale)
+ return new_height, new_width
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
+def get_numpy_to_framework_fn(arr) -> Callable:
+ """
+ Returns a function that converts a numpy array to the framework of the input array.
+
+ Args:
+ arr (`np.ndarray`): The array to convert.
+ """
+ if isinstance(arr, np.ndarray):
+ return np.array
+ if is_tf_available() and is_tf_tensor(arr):
+ import tensorflow as tf
+
+ return tf.convert_to_tensor
+ if is_torch_available() and is_torch_tensor(arr):
+ import torch
+
+ return torch.tensor
+ if is_flax_available() and is_jax_tensor(arr):
+ import jax.numpy as jnp
+
+ return jnp.array
+ raise ValueError(f"Cannot convert arrays of type {type(arr)}")
+
+
+# Copied from transformers.models.detr.image_processing_detr.safe_squeeze
+def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
+ """
+ Squeezes an array, but only if the axis specified has dim 1.
+ """
+ if axis is None:
+ return arr.squeeze()
+
+ try:
+ return arr.squeeze(axis=axis)
+ except ValueError:
+ return arr
+
+
+# Copied from transformers.models.detr.image_processing_detr.normalize_annotation
+def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
+ image_height, image_width = image_size
+ norm_annotation = {}
+ for key, value in annotation.items():
+ if key == "boxes":
+ boxes = value
+ boxes = corners_to_center_format(boxes)
+ boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
+ norm_annotation[key] = boxes
+ else:
+ norm_annotation[key] = value
+ return norm_annotation
+
+
+# Copied from transformers.models.detr.image_processing_detr.max_across_indices
+def max_across_indices(values: Iterable[Any]) -> List[Any]:
+ """
+ Return the maximum value across all indices of an iterable of values.
+ """
+ return [max(values_i) for values_i in zip(*values)]
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_max_height_width
+def get_max_height_width(
+ images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> List[int]:
+ """
+ Get the maximum height and width across all images in a batch.
+ """
+ if input_data_format is None:
+ input_data_format = infer_channel_dimension_format(images[0])
+
+ if input_data_format == ChannelDimension.FIRST:
+ _, max_height, max_width = max_across_indices([img.shape for img in images])
+ elif input_data_format == ChannelDimension.LAST:
+ max_height, max_width, _ = max_across_indices([img.shape for img in images])
+ else:
+ raise ValueError(f"Invalid channel dimension format: {input_data_format}")
+ return (max_height, max_width)
+
+
+# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
+def make_pixel_mask(
+ image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> np.ndarray:
+ """
+ Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+
+ Args:
+ image (`np.ndarray`):
+ Image to make the pixel mask for.
+ output_size (`Tuple[int, int]`):
+ Output size of the mask.
+ """
+ input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+ mask = np.zeros(output_size, dtype=np.int64)
+ mask[:input_height, :input_width] = 1
+ return mask
+
+
+def prepare_coco_detection_annotation(
+ image,
+ target,
+ return_segmentation_masks: bool = False,
+ input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+ """
+ Convert the target in COCO format into the format expected by RTDETR.
+ """
+ image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+
+ image_id = target["image_id"]
+ image_id = np.asarray([image_id], dtype=np.int64)
+
+ # Get all COCO annotations for the given image.
+ annotations = target["annotations"]
+ annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
+
+ classes = [obj["category_id"] for obj in annotations]
+ classes = np.asarray(classes, dtype=np.int64)
+
+ # for conversion to coco api
+ area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
+ iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64)
+
+ boxes = [obj["bbox"] for obj in annotations]
+ # guard against no boxes via resizing
+ boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
+ boxes[:, 2:] += boxes[:, :2]
+ boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+ boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+ new_target = {}
+ new_target["image_id"] = image_id
+ new_target["class_labels"] = classes[keep]
+ new_target["boxes"] = boxes[keep]
+ new_target["area"] = area[keep]
+ new_target["iscrowd"] = iscrowd[keep]
+ new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
+
+ if annotations and "keypoints" in annotations[0]:
+ keypoints = [obj["keypoints"] for obj in annotations]
+ # Converting the filtered keypoints list to a numpy array
+ keypoints = np.asarray(keypoints, dtype=np.float32)
+ # Apply the keep mask here to filter the relevant annotations
+ keypoints = keypoints[keep]
+ num_keypoints = keypoints.shape[0]
+ keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+ new_target["keypoints"] = keypoints
+
+ return new_target
+
+
+# Copied from transformers.models.detr.image_processing_detr.resize_annotation
+def resize_annotation(
+ annotation: Dict[str, Any],
+ orig_size: Tuple[int, int],
+ target_size: Tuple[int, int],
+ threshold: float = 0.5,
+ resample: PILImageResampling = PILImageResampling.NEAREST,
+):
+ """
+ Resizes an annotation to a target size.
+
+ Args:
+ annotation (`Dict[str, Any]`):
+ The annotation dictionary.
+ orig_size (`Tuple[int, int]`):
+ The original size of the input image.
+ target_size (`Tuple[int, int]`):
+ The target size of the image, as returned by the preprocessing `resize` step.
+ threshold (`float`, *optional*, defaults to 0.5):
+ The threshold used to binarize the segmentation masks.
+ resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
+ The resampling filter to use when resizing the masks.
+ """
+ ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
+ ratio_height, ratio_width = ratios
+
+ new_annotation = {}
+ new_annotation["size"] = target_size
+
+ for key, value in annotation.items():
+ if key == "boxes":
+ boxes = value
+ scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
+ new_annotation["boxes"] = scaled_boxes
+ elif key == "area":
+ area = value
+ scaled_area = area * (ratio_width * ratio_height)
+ new_annotation["area"] = scaled_area
+ elif key == "masks":
+ masks = value[:, None]
+ masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
+ masks = masks.astype(np.float32)
+ masks = masks[:, 0] > threshold
+ new_annotation["masks"] = masks
+ elif key == "size":
+ new_annotation["size"] = target_size
+ else:
+ new_annotation[key] = value
+
+ return new_annotation
+
+
+class RTDetrImageProcessor(BaseImageProcessor):
+ r"""
+ Constructs a RT-DETR image processor.
+
+ Args:
+ format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
+ Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
+ overridden by the `do_resize` parameter in the `preprocess` method.
+ size (`Dict[str, int]` *optional*, defaults to `{"height": 640, "width": 640}`):
+ Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+ in the `preprocess` method. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+ Resampling filter to use if resizing the image.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+ `do_rescale` parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+ `preprocess` method.
+ Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
+ `preprocess` method.
+ do_normalize (`bool`, *optional*, defaults to `False`):
+ Whether to normalize the image.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
+ Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
+ channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
+ Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
+ for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ do_convert_annotations (`bool`, *optional*, defaults to `True`):
+ Controls whether to convert the annotations to the format expected by the DETR model. Converts the
+ bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+ Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+ do_pad (`bool`, *optional*, defaults to `False`):
+ Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+ method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+ If `pad_size` is provided, the image will be padded to the specified dimensions.
+ Otherwise, the image will be padded to the maximum height and width of the batch.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+
+ model_input_names = ["pixel_values", "pixel_mask"]
+
+ def __init__(
+ self,
+ format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
+ do_resize: bool = True,
+ size: Dict[str, int] = None,
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = False,
+ image_mean: Union[float, List[float]] = None,
+ image_std: Union[float, List[float]] = None,
+ do_convert_annotations: bool = True,
+ do_pad: bool = False,
+ pad_size: Optional[Dict[str, int]] = None,
+ **kwargs,
+ ) -> None:
+ size = size if size is not None else {"height": 640, "width": 640}
+ size = get_size_dict(size, default_to_square=False)
+
+ if do_convert_annotations is None:
+ do_convert_annotations = do_normalize
+
+ super().__init__(**kwargs)
+ self.format = format
+ self.do_resize = do_resize
+ self.size = size
+ self.resample = resample
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_normalize = do_normalize
+ self.do_convert_annotations = do_convert_annotations
+ self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+ self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+ self.do_pad = do_pad
+ self.pad_size = pad_size
+
+ def prepare_annotation(
+ self,
+ image: np.ndarray,
+ target: Dict,
+ format: Optional[AnnotationFormat] = None,
+ return_segmentation_masks: bool = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> Dict:
+ """
+ Prepare an annotation for feeding into RTDETR model.
+ """
+ format = format if format is not None else self.format
+
+ if format == AnnotationFormat.COCO_DETECTION:
+ return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
+ target = prepare_coco_detection_annotation(
+ image, target, return_segmentation_masks, input_data_format=input_data_format
+ )
+ else:
+ raise ValueError(f"Format {format} is not supported.")
+ return target
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize
+ def resize(
+ self,
+ image: np.ndarray,
+ size: Dict[str, int],
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
+ data_format: Optional[ChannelDimension] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
+ int, smaller edge of the image will be matched to this number.
+
+ Args:
+ image (`np.ndarray`):
+ Image to resize.
+ size (`Dict[str, int]`):
+ Size of the image's `(height, width)` dimensions after resizing. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+ Resampling filter to use if resizing the image.
+ data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
+ image is used.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred.
+ """
+ if "max_size" in kwargs:
+ logger.warning_once(
+ "The `max_size` parameter is deprecated and will be removed in v4.26. "
+ "Please specify in `size['longest_edge'] instead`.",
+ )
+ max_size = kwargs.pop("max_size")
+ else:
+ max_size = None
+ size = get_size_dict(size, max_size=max_size, default_to_square=False)
+ if "shortest_edge" in size and "longest_edge" in size:
+ new_size = get_resize_output_image_size(
+ image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
+ )
+ elif "max_height" in size and "max_width" in size:
+ new_size = get_image_size_for_max_height_width(
+ image, size["max_height"], size["max_width"], input_data_format=input_data_format
+ )
+ elif "height" in size and "width" in size:
+ new_size = (size["height"], size["width"])
+ else:
+ raise ValueError(
+ "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+ f" {size.keys()}."
+ )
+ image = resize(
+ image,
+ size=new_size,
+ resample=resample,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ **kwargs,
+ )
+ return image
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation
+ def resize_annotation(
+ self,
+ annotation,
+ orig_size,
+ size,
+ resample: PILImageResampling = PILImageResampling.NEAREST,
+ ) -> Dict:
+ """
+ Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched
+ to this number.
+ """
+ return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
+ def rescale(
+ self,
+ image: np.ndarray,
+ rescale_factor: float,
+ data_format: Optional[Union[str, ChannelDimension]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> np.ndarray:
+ """
+ Rescale the image by the given factor. image = image * rescale_factor.
+
+ Args:
+ image (`np.ndarray`):
+ Image to rescale.
+ rescale_factor (`float`):
+ The value to use for rescaling.
+ data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
+ image is used. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ input_data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format for the input image. If unset, is inferred from the input image. Can be
+ one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ """
+ return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation
+ def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
+ """
+ Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
+ `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
+ """
+ return normalize_annotation(annotation, image_size=image_size)
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
+ def _update_annotation_for_padded_image(
+ self,
+ annotation: Dict,
+ input_image_size: Tuple[int, int],
+ output_image_size: Tuple[int, int],
+ padding,
+ update_bboxes,
+ ) -> Dict:
+ """
+ Update the annotation for a padded image.
+ """
+ new_annotation = {}
+ new_annotation["size"] = output_image_size
+
+ for key, value in annotation.items():
+ if key == "masks":
+ masks = value
+ masks = pad(
+ masks,
+ padding,
+ mode=PaddingMode.CONSTANT,
+ constant_values=0,
+ input_data_format=ChannelDimension.FIRST,
+ )
+ masks = safe_squeeze(masks, 1)
+ new_annotation["masks"] = masks
+ elif key == "boxes" and update_bboxes:
+ boxes = value
+ boxes *= np.asarray(
+ [
+ input_image_size[1] / output_image_size[1],
+ input_image_size[0] / output_image_size[0],
+ input_image_size[1] / output_image_size[1],
+ input_image_size[0] / output_image_size[0],
+ ]
+ )
+ new_annotation["boxes"] = boxes
+ elif key == "size":
+ new_annotation["size"] = output_image_size
+ else:
+ new_annotation[key] = value
+ return new_annotation
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
+ def _pad_image(
+ self,
+ image: np.ndarray,
+ output_size: Tuple[int, int],
+ annotation: Optional[Dict[str, Any]] = None,
+ constant_values: Union[float, Iterable[float]] = 0,
+ data_format: Optional[ChannelDimension] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ update_bboxes: bool = True,
+ ) -> np.ndarray:
+ """
+ Pad an image with zeros to the given size.
+ """
+ input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+ output_height, output_width = output_size
+
+ pad_bottom = output_height - input_height
+ pad_right = output_width - input_width
+ padding = ((0, pad_bottom), (0, pad_right))
+ padded_image = pad(
+ image,
+ padding,
+ mode=PaddingMode.CONSTANT,
+ constant_values=constant_values,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ )
+ if annotation is not None:
+ annotation = self._update_annotation_for_padded_image(
+ annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
+ )
+ return padded_image, annotation
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
+ def pad(
+ self,
+ images: List[np.ndarray],
+ annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+ constant_values: Union[float, Iterable[float]] = 0,
+ return_pixel_mask: bool = True,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ data_format: Optional[ChannelDimension] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ update_bboxes: bool = True,
+ pad_size: Optional[Dict[str, int]] = None,
+ ) -> BatchFeature:
+ """
+ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
+ in the batch and optionally returns their corresponding pixel mask.
+
+ Args:
+ images (List[`np.ndarray`]):
+ Images to pad.
+ annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
+ Annotations to transform according to the padding that is applied to the images.
+ constant_values (`float` or `Iterable[float]`, *optional*):
+ The value to use for the padding if `mode` is `"constant"`.
+ return_pixel_mask (`bool`, *optional*, defaults to `True`):
+ Whether to return a pixel mask.
+ return_tensors (`str` or `TensorType`, *optional*):
+ The type of tensors to return. Can be one of:
+ - Unset: Return a list of `np.ndarray`.
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format of the image. If not provided, it will be the same as the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred.
+ update_bboxes (`bool`, *optional*, defaults to `True`):
+ Whether to update the bounding boxes in the annotations to match the padded images. If the
+ bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
+ format, the bounding boxes will not be updated.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+ pad_size = pad_size if pad_size is not None else self.pad_size
+ if pad_size is not None:
+ padded_size = (pad_size["height"], pad_size["width"])
+ else:
+ padded_size = get_max_height_width(images, input_data_format=input_data_format)
+
+ annotation_list = annotations if annotations is not None else [None] * len(images)
+ padded_images = []
+ padded_annotations = []
+ for image, annotation in zip(images, annotation_list):
+ padded_image, padded_annotation = self._pad_image(
+ image,
+ padded_size,
+ annotation,
+ constant_values=constant_values,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ update_bboxes=update_bboxes,
+ )
+ padded_images.append(padded_image)
+ padded_annotations.append(padded_annotation)
+
+ data = {"pixel_values": padded_images}
+
+ if return_pixel_mask:
+ masks = [
+ make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
+ for image in images
+ ]
+ data["pixel_mask"] = masks
+
+ encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+
+ if annotations is not None:
+ encoded_inputs["labels"] = [
+ BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
+ ]
+
+ return encoded_inputs
+
+ @filter_out_non_signature_kwargs()
+ def preprocess(
+ self,
+ images: ImageInput,
+ annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+ return_segmentation_masks: bool = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ do_resize: Optional[bool] = None,
+ size: Optional[Dict[str, int]] = None,
+ resample=None, # PILImageResampling
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[Union[int, float]] = None,
+ do_normalize: Optional[bool] = None,
+ do_convert_annotations: Optional[bool] = None,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ do_pad: Optional[bool] = None,
+ format: Optional[Union[str, AnnotationFormat]] = None,
+ return_tensors: Optional[Union[TensorType, str]] = None,
+ data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ pad_size: Optional[Dict[str, int]] = None,
+ ) -> BatchFeature:
+ """
+ Preprocess an image or a batch of images so that it can be used by the model.
+
+ Args:
+ images (`ImageInput`):
+ Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
+ from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+ annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
+ List of annotations associated with the image or batch of images. If annotation is for object
+ detection, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
+ dictionary. An image can have no annotations, in which case the list should be empty.
+ If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
+ An image can have no segments, in which case the list should be empty.
+ - "file_name" (`str`): The file name of the image.
+ return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks):
+ Whether to return segmentation masks.
+ masks_path (`str` or `pathlib.Path`, *optional*):
+ Path to the directory containing the segmentation masks.
+ do_resize (`bool`, *optional*, defaults to self.do_resize):
+ Whether to resize the image.
+ size (`Dict[str, int]`, *optional*, defaults to self.size):
+ Size of the image's `(height, width)` dimensions after resizing. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling`, *optional*, defaults to self.resample):
+ Resampling filter to use when resizing the image.
+ do_rescale (`bool`, *optional*, defaults to self.do_rescale):
+ Whether to rescale the image.
+ rescale_factor (`float`, *optional*, defaults to self.rescale_factor):
+ Rescale factor to use when rescaling the image.
+ do_normalize (`bool`, *optional*, defaults to self.do_normalize):
+ Whether to normalize the image.
+ do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+ Whether to convert the annotations to the format expected by the model. Converts the bounding
+ boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+ and in relative coordinates.
+ image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
+ Mean to use when normalizing the image.
+ image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
+ Standard deviation to use when normalizing the image.
+ do_pad (`bool`, *optional*, defaults to self.do_pad):
+ Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+ the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+ dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
+ format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
+ Format of the annotations.
+ return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
+ Type of tensors to return. If `None`, will return the list of images.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+ do_resize = self.do_resize if do_resize is None else do_resize
+ size = self.size if size is None else size
+ size = get_size_dict(size=size, default_to_square=True)
+ resample = self.resample if resample is None else resample
+ do_rescale = self.do_rescale if do_rescale is None else do_rescale
+ rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
+ do_normalize = self.do_normalize if do_normalize is None else do_normalize
+ image_mean = self.image_mean if image_mean is None else image_mean
+ image_std = self.image_std if image_std is None else image_std
+ do_convert_annotations = (
+ self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
+ )
+ do_pad = self.do_pad if do_pad is None else do_pad
+ pad_size = self.pad_size if pad_size is None else pad_size
+ format = self.format if format is None else format
+
+ images = make_list_of_images(images)
+
+ if not valid_images(images):
+ raise ValueError(
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+ "torch.Tensor, tf.Tensor or jax.ndarray."
+ )
+
+ # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated.
+
+ validate_preprocess_arguments(
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ )
+
+ if annotations is not None and isinstance(annotations, dict):
+ annotations = [annotations]
+
+ if annotations is not None and len(images) != len(annotations):
+ raise ValueError(
+ f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+ )
+
+ format = AnnotationFormat(format)
+ if annotations is not None:
+ validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+ images = make_list_of_images(images)
+ if not valid_images(images):
+ raise ValueError(
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+ "torch.Tensor, tf.Tensor or jax.ndarray."
+ )
+
+ # All transformations expect numpy arrays
+ images = [to_numpy_array(image) for image in images]
+
+ if do_rescale and is_scaled_image(images[0]):
+ logger.warning_once(
+ "It looks like you are trying to rescale already rescaled images. If the input"
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+ )
+
+ if input_data_format is None:
+ # We assume that all images have the same channel dimension format.
+ input_data_format = infer_channel_dimension_format(images[0])
+
+ # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+ if annotations is not None:
+ prepared_images = []
+ prepared_annotations = []
+ for image, target in zip(images, annotations):
+ target = self.prepare_annotation(
+ image,
+ target,
+ format,
+ return_segmentation_masks=return_segmentation_masks,
+ masks_path=masks_path,
+ input_data_format=input_data_format,
+ )
+ prepared_images.append(image)
+ prepared_annotations.append(target)
+ images = prepared_images
+ annotations = prepared_annotations
+ del prepared_images, prepared_annotations
+
+ # transformations
+ if do_resize:
+ if annotations is not None:
+ resized_images, resized_annotations = [], []
+ for image, target in zip(images, annotations):
+ orig_size = get_image_size(image, input_data_format)
+ resized_image = self.resize(
+ image, size=size, resample=resample, input_data_format=input_data_format
+ )
+ resized_annotation = self.resize_annotation(
+ target, orig_size, get_image_size(resized_image, input_data_format)
+ )
+ resized_images.append(resized_image)
+ resized_annotations.append(resized_annotation)
+ images = resized_images
+ annotations = resized_annotations
+ del resized_images, resized_annotations
+ else:
+ images = [
+ self.resize(image, size=size, resample=resample, input_data_format=input_data_format)
+ for image in images
+ ]
+
+ if do_rescale:
+ images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
+
+ if do_normalize:
+ images = [
+ self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
+ ]
+
+ if do_convert_annotations and annotations is not None:
+ annotations = [
+ self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+ for annotation, image in zip(annotations, images)
+ ]
+
+ if do_pad:
+ # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+ encoded_inputs = self.pad(
+ images,
+ annotations=annotations,
+ return_pixel_mask=True,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ update_bboxes=do_convert_annotations,
+ return_tensors=return_tensors,
+ pad_size=pad_size,
+ )
+ else:
+ images = [
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+ for image in images
+ ]
+ encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+ if annotations is not None:
+ encoded_inputs["labels"] = [
+ BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+ ]
+
+ return encoded_inputs
+
+ def post_process_object_detection(
+ self,
+ outputs,
+ threshold: float = 0.5,
+ target_sizes: Union[TensorType, List[Tuple]] = None,
+ use_focal_loss: bool = True,
+ ):
+ """
+ Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+ bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+ Args:
+ outputs ([`DetrObjectDetectionOutput`]):
+ Raw outputs of the model.
+ threshold (`float`, *optional*, defaults to 0.5):
+ Score threshold to keep object detection predictions.
+ target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+ Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+ `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+ use_focal_loss (`bool` defaults to `True`):
+ Variable informing if the focal loss was used to predict the outputs. If `True`, a sigmoid is applied
+ to compute the scores of each detection, otherwise, a softmax function is used.
+
+ Returns:
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+ in the batch as predicted by the model.
+ """
+ requires_backends(self, ["torch"])
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+ # convert from relative cxcywh to absolute xyxy
+ boxes = center_to_corners_format(out_bbox)
+ if target_sizes is not None:
+ if len(out_logits) != len(target_sizes):
+ raise ValueError(
+ "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+ )
+ if isinstance(target_sizes, List):
+ img_h, img_w = torch.as_tensor(target_sizes).unbind(1)
+ else:
+ img_h, img_w = target_sizes.unbind(1)
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+ boxes = boxes * scale_fct[:, None, :]
+
+ num_top_queries = out_logits.shape[1]
+ num_classes = out_logits.shape[2]
+
+ if use_focal_loss:
+ scores = torch.nn.functional.sigmoid(out_logits)
+ scores, index = torch.topk(scores.flatten(1), num_top_queries, axis=-1)
+ labels = index % num_classes
+ index = index // num_classes
+ boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))
+ else:
+ scores = torch.nn.functional.softmax(out_logits)[:, :, :-1]
+ scores, labels = scores.max(dim=-1)
+ if scores.shape[1] > num_top_queries:
+ scores, index = torch.topk(scores, num_top_queries, dim=-1)
+ labels = torch.gather(labels, dim=1, index=index)
+ boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))
+
+ results = []
+ for score, label, box in zip(scores, labels, boxes):
+ results.append(
+ {
+ "scores": score[score > threshold],
+ "labels": label[score > threshold],
+ "boxes": box[score > threshold],
+ }
+ )
+
+ return results
+
+
+__all__ = ["RTDetrImageProcessor"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/image_processing_rt_detr_fast.py b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/image_processing_rt_detr_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..70ce29a211d224fc80288d1fbf5d7acc1707ef57
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/image_processing_rt_detr_fast.py
@@ -0,0 +1,778 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/rt_detr/modular_rt_detr.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_rt_detr.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import functools
+import pathlib
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from ...image_processing_utils import BatchFeature, get_size_dict
+from ...image_processing_utils_fast import (
+ BaseImageProcessorFast,
+ SizeDict,
+ get_image_size_for_max_height_width,
+ get_max_height_width,
+ safe_squeeze,
+)
+from ...image_transforms import center_to_corners_format, corners_to_center_format
+from ...image_utils import (
+ IMAGENET_DEFAULT_MEAN,
+ IMAGENET_DEFAULT_STD,
+ AnnotationFormat,
+ AnnotationType,
+ ChannelDimension,
+ ImageInput,
+ ImageType,
+ PILImageResampling,
+ get_image_size,
+ get_image_type,
+ infer_channel_dimension_format,
+ make_list_of_images,
+ validate_annotations,
+)
+from ...utils import (
+ TensorType,
+ filter_out_non_signature_kwargs,
+ is_torch_available,
+ is_torchvision_available,
+ is_torchvision_v2_available,
+ is_vision_available,
+ requires_backends,
+)
+from .image_processing_rt_detr import get_size_with_aspect_ratio
+
+
+if is_torch_available():
+ import torch
+
+if is_vision_available():
+ from ...image_utils import pil_torch_interpolation_mapping
+
+
+if is_torchvision_v2_available():
+ from torchvision.transforms.v2 import functional as F
+elif is_torchvision_available():
+ from torchvision.transforms import functional as F
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+def prepare_coco_detection_annotation(
+ image,
+ target,
+ return_segmentation_masks: bool = False,
+ input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+ """
+ Convert the target in COCO format into the format expected by RT-DETR.
+ """
+ image_height, image_width = image.size()[-2:]
+
+ image_id = target["image_id"]
+ image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device)
+
+ # Get all COCO annotations for the given image.
+ annotations = target["annotations"]
+ classes = []
+ area = []
+ boxes = []
+ keypoints = []
+ for obj in annotations:
+ if "iscrowd" not in obj or obj["iscrowd"] == 0:
+ classes.append(obj["category_id"])
+ area.append(obj["area"])
+ boxes.append(obj["bbox"])
+ if "keypoints" in obj:
+ keypoints.append(obj["keypoints"])
+
+ classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device)
+ area = torch.as_tensor(area, dtype=torch.float32, device=image.device)
+ iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device)
+ # guard against no boxes via resizing
+ boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4)
+ boxes[:, 2:] += boxes[:, :2]
+ boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+ boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+ new_target = {
+ "image_id": image_id,
+ "class_labels": classes[keep],
+ "boxes": boxes[keep],
+ "area": area[keep],
+ "iscrowd": iscrowd[keep],
+ "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device),
+ }
+
+ if keypoints:
+ keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device)
+ # Apply the keep mask here to filter the relevant annotations
+ keypoints = keypoints[keep]
+ num_keypoints = keypoints.shape[0]
+ keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+ new_target["keypoints"] = keypoints
+
+ return new_target
+
+
+class RTDetrImageProcessorFast(BaseImageProcessorFast):
+ r"""
+ Constructs a fast RTDetr image processor.
+
+ Args:
+ format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
+ Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Controls whether to resize the image's `(height, width)` dimensions to the specified `size`. Can be
+ overridden by the `do_resize` parameter in the `preprocess` method.
+ size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
+ Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+ in the `preprocess` method. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+ Resampling filter to use if resizing the image.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+ `do_rescale` parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+ `preprocess` method.
+ do_normalize (`bool`, *optional*, defaults to `False`):
+ Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
+ `preprocess` method.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
+ Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
+ channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
+ Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
+ for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ do_convert_annotations (`bool`, *optional*, defaults to `True`):
+ Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the
+ bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+ Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+ do_pad (`bool`, *optional*, defaults to `False`):
+ Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+ method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+ If `pad_size` is provided, the image will be padded to the specified dimensions.
+ Otherwise, the image will be padded to the maximum height and width of the batch.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+
+ model_input_names = ["pixel_values", "pixel_mask"]
+
+ def __init__(
+ self,
+ format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
+ do_resize: bool = True,
+ size: Dict[str, int] = None,
+ resample: Union[PILImageResampling, "F.InterpolationMode"] = PILImageResampling.BILINEAR,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = False,
+ image_mean: Union[float, List[float]] = None,
+ image_std: Union[float, List[float]] = None,
+ do_convert_annotations: bool = True,
+ do_pad: bool = False,
+ pad_size: Optional[Dict[str, int]] = None,
+ **kwargs,
+ ) -> None:
+ size = size if size is not None else {"height": 640, "width": 640}
+ size = get_size_dict(size, default_to_square=False)
+
+ if do_convert_annotations is None:
+ do_convert_annotations = do_normalize
+
+ super().__init__(**kwargs)
+ self.format = format
+ self.do_resize = do_resize
+ self.size = size
+ self.resample = resample
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_normalize = do_normalize
+ self.do_convert_annotations = do_convert_annotations
+ self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+ self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+ self.do_pad = do_pad
+ self.pad_size = pad_size
+
+ def prepare_annotation(
+ self,
+ image: torch.Tensor,
+ target: Dict,
+ format: Optional[AnnotationFormat] = None,
+ return_segmentation_masks: bool = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> Dict:
+ """
+ Prepare an annotation for feeding into RT_DETR model.
+ """
+ format = format if format is not None else self.format
+
+ if format == AnnotationFormat.COCO_DETECTION:
+ return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
+ target = prepare_coco_detection_annotation(
+ image, target, return_segmentation_masks, input_data_format=input_data_format
+ )
+ else:
+ raise ValueError(f"Format {format} is not supported.")
+ return target
+
+ def resize(
+ self,
+ image: torch.Tensor,
+ size: SizeDict,
+ interpolation: "F.InterpolationMode" = None,
+ **kwargs,
+ ) -> torch.Tensor:
+ """
+ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
+ int, smaller edge of the image will be matched to this number.
+
+ Args:
+ image (`torch.Tensor`):
+ Image to resize.
+ size (`SizeDict`):
+ Size of the image's `(height, width)` dimensions after resizing. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
+ Resampling filter to use if resizing the image.
+ """
+ interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
+ if size.shortest_edge and size.longest_edge:
+ # Resize the image so that the shortest edge or the longest edge is of the given size
+ # while maintaining the aspect ratio of the original image.
+ new_size = get_size_with_aspect_ratio(
+ image.size()[-2:],
+ size["shortest_edge"],
+ size["longest_edge"],
+ )
+ elif size.max_height and size.max_width:
+ new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"])
+ elif size.height and size.width:
+ new_size = (size["height"], size["width"])
+ else:
+ raise ValueError(
+ "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+ f" {size.keys()}."
+ )
+
+ image = F.resize(
+ image,
+ size=new_size,
+ interpolation=interpolation,
+ **kwargs,
+ )
+ return image
+
+ def resize_annotation(
+ self,
+ annotation: Dict[str, Any],
+ orig_size: Tuple[int, int],
+ target_size: Tuple[int, int],
+ threshold: float = 0.5,
+ interpolation: "F.InterpolationMode" = None,
+ ):
+ """
+ Resizes an annotation to a target size.
+
+ Args:
+ annotation (`Dict[str, Any]`):
+ The annotation dictionary.
+ orig_size (`Tuple[int, int]`):
+ The original size of the input image.
+ target_size (`Tuple[int, int]`):
+ The target size of the image, as returned by the preprocessing `resize` step.
+ threshold (`float`, *optional*, defaults to 0.5):
+ The threshold used to binarize the segmentation masks.
+ resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST`):
+ The resampling filter to use when resizing the masks.
+ """
+ interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST
+ ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
+
+ new_annotation = {}
+ new_annotation["size"] = target_size
+
+ for key, value in annotation.items():
+ if key == "boxes":
+ boxes = value
+ scaled_boxes = boxes * torch.as_tensor(
+ [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device
+ )
+ new_annotation["boxes"] = scaled_boxes
+ elif key == "area":
+ area = value
+ scaled_area = area * (ratio_width * ratio_height)
+ new_annotation["area"] = scaled_area
+ elif key == "masks":
+ masks = value[:, None]
+ masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks]
+ masks = torch.stack(masks).to(torch.float32)
+ masks = masks[:, 0] > threshold
+ new_annotation["masks"] = masks
+ elif key == "size":
+ new_annotation["size"] = target_size
+ else:
+ new_annotation[key] = value
+
+ return new_annotation
+
+ def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
+ image_height, image_width = image_size
+ norm_annotation = {}
+ for key, value in annotation.items():
+ if key == "boxes":
+ boxes = value
+ boxes = corners_to_center_format(boxes)
+ boxes /= torch.as_tensor(
+ [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device
+ )
+ norm_annotation[key] = boxes
+ else:
+ norm_annotation[key] = value
+ return norm_annotation
+
+ def _update_annotation_for_padded_image(
+ self,
+ annotation: Dict,
+ input_image_size: Tuple[int, int],
+ output_image_size: Tuple[int, int],
+ padding,
+ update_bboxes,
+ ) -> Dict:
+ """
+ Update the annotation for a padded image.
+ """
+ new_annotation = {}
+ new_annotation["size"] = output_image_size
+ ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size))
+
+ for key, value in annotation.items():
+ if key == "masks":
+ masks = value
+ masks = F.pad(
+ masks,
+ padding,
+ fill=0,
+ )
+ masks = safe_squeeze(masks, 1)
+ new_annotation["masks"] = masks
+ elif key == "boxes" and update_bboxes:
+ boxes = value
+ boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device)
+ new_annotation["boxes"] = boxes
+ elif key == "size":
+ new_annotation["size"] = output_image_size
+ else:
+ new_annotation[key] = value
+ return new_annotation
+
+ def pad(
+ self,
+ image: torch.Tensor,
+ padded_size: Tuple[int, int],
+ annotation: Optional[Dict[str, Any]] = None,
+ update_bboxes: bool = True,
+ fill: int = 0,
+ ):
+ original_size = image.size()[-2:]
+ padding_bottom = padded_size[0] - original_size[0]
+ padding_right = padded_size[1] - original_size[1]
+ if padding_bottom < 0 or padding_right < 0:
+ raise ValueError(
+ f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
+ f"original size. Got padded size: {padded_size}, original size: {original_size}."
+ )
+ if original_size != padded_size:
+ padding = [0, 0, padding_right, padding_bottom]
+ image = F.pad(image, padding, fill=fill)
+ if annotation is not None:
+ annotation = self._update_annotation_for_padded_image(
+ annotation, original_size, padded_size, padding, update_bboxes
+ )
+
+ # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+ pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device)
+ pixel_mask[: original_size[0], : original_size[1]] = 1
+
+ return image, pixel_mask, annotation
+
+ @functools.lru_cache(maxsize=1)
+ def _validate_input_arguments(
+ self,
+ do_rescale: bool,
+ rescale_factor: float,
+ do_normalize: bool,
+ image_mean: Union[float, List[float]],
+ image_std: Union[float, List[float]],
+ do_resize: bool,
+ size: Dict[str, int],
+ resample: "PILImageResampling",
+ data_format: Union[str, ChannelDimension],
+ return_tensors: Union[TensorType, str],
+ ):
+ if return_tensors != "pt":
+ raise ValueError("Only returning PyTorch tensors is currently supported.")
+
+ if data_format != ChannelDimension.FIRST:
+ raise ValueError("Only channel first data format is currently supported.")
+
+ if do_resize and None in (size, resample):
+ raise ValueError("Size and resample must be specified if do_resize is True.")
+
+ if do_rescale and rescale_factor is None:
+ raise ValueError("Rescale factor must be specified if do_rescale is True.")
+
+ if do_normalize and None in (image_mean, image_std):
+ raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")
+
+ @filter_out_non_signature_kwargs(extra=["device"])
+ def preprocess(
+ self,
+ images: ImageInput,
+ annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+ return_segmentation_masks: bool = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ do_resize: Optional[bool] = None,
+ size: Optional[Dict[str, int]] = None,
+ resample: Optional[Union[PILImageResampling, "F.InterpolationMode"]] = None,
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[Union[int, float]] = None,
+ do_normalize: Optional[bool] = None,
+ do_convert_annotations: Optional[bool] = None,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ do_pad: Optional[bool] = None,
+ format: Optional[Union[str, AnnotationFormat]] = None,
+ return_tensors: Optional[Union[TensorType, str]] = None,
+ data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ pad_size: Optional[Dict[str, int]] = None,
+ **kwargs,
+ ) -> BatchFeature:
+ """
+ Preprocess an image or a batch of images so that it can be used by the model.
+
+ Args:
+ images (`ImageInput`):
+ Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
+ from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+ annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
+ List of annotations associated with the image or batch of images. If annotation is for object
+ detection, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
+ dictionary. An image can have no annotations, in which case the list should be empty.
+ If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
+ An image can have no segments, in which case the list should be empty.
+ - "file_name" (`str`): The file name of the image.
+ return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks):
+ Whether to return segmentation masks.
+ masks_path (`str` or `pathlib.Path`, *optional*):
+ Path to the directory containing the segmentation masks.
+ do_resize (`bool`, *optional*, defaults to self.do_resize):
+ Whether to resize the image.
+ size (`Dict[str, int]`, *optional*, defaults to self.size):
+ Size of the image's `(height, width)` dimensions after resizing. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to self.resample):
+ Resampling filter to use when resizing the image.
+ do_rescale (`bool`, *optional*, defaults to self.do_rescale):
+ Whether to rescale the image.
+ rescale_factor (`float`, *optional*, defaults to self.rescale_factor):
+ Rescale factor to use when rescaling the image.
+ do_normalize (`bool`, *optional*, defaults to self.do_normalize):
+ Whether to normalize the image.
+ do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+ Whether to convert the annotations to the format expected by the model. Converts the bounding
+ boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+ and in relative coordinates.
+ image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
+ Mean to use when normalizing the image.
+ image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
+ Standard deviation to use when normalizing the image.
+ do_pad (`bool`, *optional*, defaults to self.do_pad):
+ Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+ the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+ dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
+ format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
+ Format of the annotations.
+ return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
+ Type of tensors to return. If `None`, will return the list of images.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+ do_resize = self.do_resize if do_resize is None else do_resize
+ size = self.size if size is None else size
+ size = get_size_dict(size=size, default_to_square=True)
+ resample = self.resample if resample is None else resample
+ do_rescale = self.do_rescale if do_rescale is None else do_rescale
+ rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
+ do_normalize = self.do_normalize if do_normalize is None else do_normalize
+ image_mean = self.image_mean if image_mean is None else image_mean
+ image_std = self.image_std if image_std is None else image_std
+ do_convert_annotations = (
+ self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
+ )
+ do_pad = self.do_pad if do_pad is None else do_pad
+ pad_size = self.pad_size if pad_size is None else pad_size
+ format = self.format if format is None else format
+ return_tensors = "pt" if return_tensors is None else return_tensors
+ device = kwargs.pop("device", None)
+
+ # Make hashable for cache
+ size = SizeDict(**size)
+ image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean
+ image_std = tuple(image_std) if isinstance(image_std, list) else image_std
+
+ images = make_list_of_images(images)
+ image_type = get_image_type(images[0])
+
+ if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
+ raise ValueError(f"Unsupported input image type {image_type}")
+
+ self._validate_input_arguments(
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ return_tensors=return_tensors,
+ data_format=data_format,
+ )
+
+ if annotations is not None and isinstance(annotations, dict):
+ annotations = [annotations]
+
+ if annotations is not None and len(images) != len(annotations):
+ raise ValueError(
+ f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+ )
+
+ format = AnnotationFormat(format)
+ if annotations is not None:
+ validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+ data = {}
+ if image_type == ImageType.PIL:
+ images = [F.pil_to_tensor(image) for image in images]
+ elif image_type == ImageType.NUMPY:
+ # not using F.to_tensor as it doesn't handle (C, H, W) numpy arrays
+ images = [torch.from_numpy(image).contiguous() for image in images]
+
+ if device is not None:
+ images = [image.to(device) for image in images]
+
+ # We assume that all images have the same channel dimension format.
+ if input_data_format is None:
+ input_data_format = infer_channel_dimension_format(images[0])
+ if input_data_format == ChannelDimension.LAST:
+ images = [image.permute(2, 0, 1).contiguous() for image in images]
+ input_data_format = ChannelDimension.FIRST
+
+ if do_rescale and do_normalize:
+ # fused rescale and normalize
+ new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor)
+ new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor)
+
+ processed_images = []
+ processed_annotations = []
+ pixel_masks = [] # Initialize pixel_masks here
+ for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+ # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+ if annotations is not None:
+ annotation = self.prepare_annotation(
+ image,
+ annotation,
+ format,
+ return_segmentation_masks=return_segmentation_masks,
+ masks_path=masks_path,
+ input_data_format=input_data_format,
+ )
+
+ if do_resize:
+ interpolation = (
+ pil_torch_interpolation_mapping[resample]
+ if isinstance(resample, (PILImageResampling, int))
+ else resample
+ )
+ resized_image = self.resize(image, size=size, interpolation=interpolation)
+ if annotations is not None:
+ annotation = self.resize_annotation(
+ annotation,
+ orig_size=image.size()[-2:],
+ target_size=resized_image.size()[-2:],
+ )
+ image = resized_image
+
+ if do_rescale and do_normalize:
+ # fused rescale and normalize
+ image = F.normalize(image.to(dtype=torch.float32), new_mean, new_std)
+ elif do_rescale:
+ image = image * rescale_factor
+ elif do_normalize:
+ image = F.normalize(image, image_mean, image_std)
+
+ if do_convert_annotations and annotations is not None:
+ annotation = self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+
+ processed_images.append(image)
+ processed_annotations.append(annotation)
+ images = processed_images
+ annotations = processed_annotations if annotations is not None else None
+
+ if do_pad:
+ # depends on all resized image shapes so we need another loop
+ if pad_size is not None:
+ padded_size = (pad_size["height"], pad_size["width"])
+ else:
+ padded_size = get_max_height_width(images)
+
+ padded_images = []
+ padded_annotations = []
+ for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+ # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+ if padded_size == image.size()[-2:]:
+ padded_images.append(image)
+ pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
+ padded_annotations.append(annotation)
+ continue
+ image, pixel_mask, annotation = self.pad(
+ image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
+ )
+ padded_images.append(image)
+ padded_annotations.append(annotation)
+ pixel_masks.append(pixel_mask)
+ images = padded_images
+ annotations = padded_annotations if annotations is not None else None
+ data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})
+
+ data.update({"pixel_values": torch.stack(images, dim=0)})
+ encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
+ if annotations is not None:
+ encoded_inputs["labels"] = [
+ BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+ ]
+ return encoded_inputs
+
+ def post_process_object_detection(
+ self,
+ outputs,
+ threshold: float = 0.5,
+ target_sizes: Union[TensorType, List[Tuple]] = None,
+ use_focal_loss: bool = True,
+ ):
+ """
+ Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+ bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+ Args:
+ outputs ([`DetrObjectDetectionOutput`]):
+ Raw outputs of the model.
+ threshold (`float`, *optional*, defaults to 0.5):
+ Score threshold to keep object detection predictions.
+ target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+ Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+ `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+ use_focal_loss (`bool` defaults to `True`):
+ Variable informing if the focal loss was used to predict the outputs. If `True`, a sigmoid is applied
+ to compute the scores of each detection, otherwise, a softmax function is used.
+
+ Returns:
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+ in the batch as predicted by the model.
+ """
+ requires_backends(self, ["torch"])
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+ # convert from relative cxcywh to absolute xyxy
+ boxes = center_to_corners_format(out_bbox)
+ if target_sizes is not None:
+ if len(out_logits) != len(target_sizes):
+ raise ValueError(
+ "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+ )
+ if isinstance(target_sizes, List):
+ img_h, img_w = torch.as_tensor(target_sizes).unbind(1)
+ else:
+ img_h, img_w = target_sizes.unbind(1)
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+ boxes = boxes * scale_fct[:, None, :]
+
+ num_top_queries = out_logits.shape[1]
+ num_classes = out_logits.shape[2]
+
+ if use_focal_loss:
+ scores = torch.nn.functional.sigmoid(out_logits)
+ scores, index = torch.topk(scores.flatten(1), num_top_queries, axis=-1)
+ labels = index % num_classes
+ index = index // num_classes
+ boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))
+ else:
+ scores = torch.nn.functional.softmax(out_logits)[:, :, :-1]
+ scores, labels = scores.max(dim=-1)
+ if scores.shape[1] > num_top_queries:
+ scores, index = torch.topk(scores, num_top_queries, dim=-1)
+ labels = torch.gather(labels, dim=1, index=index)
+ boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))
+
+ results = []
+ for score, label, box in zip(scores, labels, boxes):
+ results.append(
+ {
+ "scores": score[score > threshold],
+ "labels": label[score > threshold],
+ "boxes": box[score > threshold],
+ }
+ )
+
+ return results
+
+
+__all__ = ["RTDetrImageProcessorFast"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/modeling_rt_detr.py b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/modeling_rt_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..da395743f60d0026e9df50b3219ae10ec19ad4f5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/modeling_rt_detr.py
@@ -0,0 +1,2171 @@
+# coding=utf-8
+# Copyright 2024 Baidu Inc and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch RT-DETR model."""
+
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from functools import lru_cache, partial, wraps
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+from ...activations import ACT2CLS, ACT2FN
+from ...image_transforms import center_to_corners_format, corners_to_center_format
+from ...modeling_outputs import BaseModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ is_ninja_available,
+ is_torch_cuda_available,
+ logging,
+ replace_return_docstrings,
+)
+from ...utils.backbone_utils import load_backbone
+from .configuration_rt_detr import RTDetrConfig
+
+
+logger = logging.get_logger(__name__)
+
+MultiScaleDeformableAttention = None
+
+
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.load_cuda_kernels
+def load_cuda_kernels():
+ from torch.utils.cpp_extension import load
+
+ global MultiScaleDeformableAttention
+
+ root = Path(__file__).resolve().parent.parent.parent / "kernels" / "deformable_detr"
+ src_files = [
+ root / filename
+ for filename in [
+ "vision.cpp",
+ os.path.join("cpu", "ms_deform_attn_cpu.cpp"),
+ os.path.join("cuda", "ms_deform_attn_cuda.cu"),
+ ]
+ ]
+
+ MultiScaleDeformableAttention = load(
+ "MultiScaleDeformableAttention",
+ src_files,
+ with_cuda=True,
+ extra_include_paths=[str(root)],
+ extra_cflags=["-DWITH_CUDA=1"],
+ extra_cuda_cflags=[
+ "-DCUDA_HAS_FP16=1",
+ "-D__CUDA_NO_HALF_OPERATORS__",
+ "-D__CUDA_NO_HALF_CONVERSIONS__",
+ "-D__CUDA_NO_HALF2_OPERATORS__",
+ ],
+ )
+
+
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttentionFunction
+class MultiScaleDeformableAttentionFunction(Function):
+ @staticmethod
+ def forward(
+ context,
+ value,
+ value_spatial_shapes,
+ value_level_start_index,
+ sampling_locations,
+ attention_weights,
+ im2col_step,
+ ):
+ context.im2col_step = im2col_step
+ output = MultiScaleDeformableAttention.ms_deform_attn_forward(
+ value,
+ value_spatial_shapes,
+ value_level_start_index,
+ sampling_locations,
+ attention_weights,
+ context.im2col_step,
+ )
+ context.save_for_backward(
+ value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights
+ )
+ return output
+
+ @staticmethod
+ @once_differentiable
+ def backward(context, grad_output):
+ (
+ value,
+ value_spatial_shapes,
+ value_level_start_index,
+ sampling_locations,
+ attention_weights,
+ ) = context.saved_tensors
+ grad_value, grad_sampling_loc, grad_attn_weight = MultiScaleDeformableAttention.ms_deform_attn_backward(
+ value,
+ value_spatial_shapes,
+ value_level_start_index,
+ sampling_locations,
+ attention_weights,
+ grad_output,
+ context.im2col_step,
+ )
+
+ return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "RTDetrConfig"
+# TODO: Replace all occurrences of the checkpoint with the final one
+_CHECKPOINT_FOR_DOC = "PekingU/rtdetr_r50vd"
+
+
+@dataclass
+class RTDetrDecoderOutput(ModelOutput):
+ """
+ Base class for outputs of the RTDetrDecoder. This class adds two attributes to
+ BaseModelOutputWithCrossAttentions, namely:
+ - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
+ - a stacked tensor of intermediate reference points.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+ Stacked intermediate hidden states (output of each layer of the decoder).
+ intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
+ Stacked intermediate logits (logits of each layer of the decoder).
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
+ Stacked intermediate reference points (reference points of each layer of the decoder).
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
+ used to compute the weighted average in the cross-attention heads.
+ """
+
+ last_hidden_state: torch.FloatTensor = None
+ intermediate_hidden_states: torch.FloatTensor = None
+ intermediate_logits: torch.FloatTensor = None
+ intermediate_reference_points: torch.FloatTensor = None
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class RTDetrModelOutput(ModelOutput):
+ """
+ Base class for outputs of the RT-DETR encoder-decoder model.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+ Stacked intermediate hidden states (output of each layer of the decoder).
+ intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
+ Stacked intermediate logits (logits of each layer of the decoder).
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+ Stacked intermediate reference points (reference points of each layer of the decoder).
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
+ plus the initial embedding outputs.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
+ num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
+ average in the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+ layer plus the initial embedding outputs.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+ Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Initial reference points sent through the Transformer decoder.
+ enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
+ Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+ picked as region proposals in the encoder stage. Output of bounding box binary classification (i.e.
+ foreground and background).
+ enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`):
+ Logits of predicted bounding boxes coordinates in the encoder stage.
+ enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+ picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+ foreground and background).
+ enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Logits of predicted bounding boxes coordinates in the first stage.
+ denoising_meta_values (`dict`):
+ Extra dictionary for the denoising related values
+ """
+
+ last_hidden_state: torch.FloatTensor = None
+ intermediate_hidden_states: torch.FloatTensor = None
+ intermediate_logits: torch.FloatTensor = None
+ intermediate_reference_points: torch.FloatTensor = None
+ decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+ encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ init_reference_points: torch.FloatTensor = None
+ enc_topk_logits: Optional[torch.FloatTensor] = None
+ enc_topk_bboxes: Optional[torch.FloatTensor] = None
+ enc_outputs_class: Optional[torch.FloatTensor] = None
+ enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+ denoising_meta_values: Optional[Dict] = None
+
+
+@dataclass
+class RTDetrObjectDetectionOutput(ModelOutput):
+ """
+ Output type of [`RTDetrForObjectDetection`].
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
+ Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
+ bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+ scale-invariant IoU loss.
+ loss_dict (`Dict`, *optional*):
+ A dictionary containing the individual losses. Useful for logging.
+ logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+ Classification logits (including no-object) for all queries.
+ pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+ values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+ possible padding). You can use [`~RTDetrImageProcessor.post_process_object_detection`] to retrieve the
+ unnormalized (absolute) bounding boxes.
+ auxiliary_outputs (`list[Dict]`, *optional*):
+ Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+ and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+ `pred_boxes`) for each decoder layer.
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+ Stacked intermediate hidden states (output of each layer of the decoder).
+ intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, config.num_labels)`):
+ Stacked intermediate logits (logits of each layer of the decoder).
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+ Stacked intermediate reference points (reference points of each layer of the decoder).
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
+ plus the initial embedding outputs.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
+ num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
+ average in the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+ layer plus the initial embedding outputs.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+ Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Initial reference points sent through the Transformer decoder.
+ enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Logits of predicted bounding boxes coordinates in the encoder.
+ enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Logits of predicted bounding boxes coordinates in the encoder.
+ enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+ picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+ foreground and background).
+ enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Logits of predicted bounding boxes coordinates in the first stage.
+ denoising_meta_values (`dict`):
+ Extra dictionary for the denoising related values
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ loss_dict: Optional[Dict] = None
+ logits: torch.FloatTensor = None
+ pred_boxes: torch.FloatTensor = None
+ auxiliary_outputs: Optional[List[Dict]] = None
+ last_hidden_state: torch.FloatTensor = None
+ intermediate_hidden_states: torch.FloatTensor = None
+ intermediate_logits: torch.FloatTensor = None
+ intermediate_reference_points: torch.FloatTensor = None
+ decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+ encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ init_reference_points: Optional[Tuple[torch.FloatTensor]] = None
+ enc_topk_logits: Optional[torch.FloatTensor] = None
+ enc_topk_bboxes: Optional[torch.FloatTensor] = None
+ enc_outputs_class: Optional[torch.FloatTensor] = None
+ enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+ denoising_meta_values: Optional[Dict] = None
+
+
+def _get_clones(partial_module, N):
+ return nn.ModuleList([partial_module() for i in range(N)])
+
+
+# Copied from transformers.models.conditional_detr.modeling_conditional_detr.inverse_sigmoid
+def inverse_sigmoid(x, eps=1e-5):
+ x = x.clamp(min=0, max=1)
+ x1 = x.clamp(min=eps)
+ x2 = (1 - x).clamp(min=eps)
+ return torch.log(x1 / x2)
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->RTDetr
+class RTDetrFrozenBatchNorm2d(nn.Module):
+ """
+ BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+ Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
+ torchvision.models.resnet[18,34,50,101] produce nans.
+ """
+
+ def __init__(self, n):
+ super().__init__()
+ self.register_buffer("weight", torch.ones(n))
+ self.register_buffer("bias", torch.zeros(n))
+ self.register_buffer("running_mean", torch.zeros(n))
+ self.register_buffer("running_var", torch.ones(n))
+
+ def _load_from_state_dict(
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+ ):
+ num_batches_tracked_key = prefix + "num_batches_tracked"
+ if num_batches_tracked_key in state_dict:
+ del state_dict[num_batches_tracked_key]
+
+ super()._load_from_state_dict(
+ state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+ )
+
+ def forward(self, x):
+ # move reshapes to the beginning
+ # to make it user-friendly
+ weight = self.weight.reshape(1, -1, 1, 1)
+ bias = self.bias.reshape(1, -1, 1, 1)
+ running_var = self.running_var.reshape(1, -1, 1, 1)
+ running_mean = self.running_mean.reshape(1, -1, 1, 1)
+ epsilon = 1e-5
+ scale = weight * (running_var + epsilon).rsqrt()
+ bias = bias - running_mean * scale
+ return x * scale + bias
+
+
+# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->RTDetr
+def replace_batch_norm(model):
+ r"""
+ Recursively replace all `torch.nn.BatchNorm2d` with `RTDetrFrozenBatchNorm2d`.
+
+ Args:
+ model (torch.nn.Module):
+ input model
+ """
+ for name, module in model.named_children():
+ if isinstance(module, nn.BatchNorm2d):
+ new_module = RTDetrFrozenBatchNorm2d(module.num_features)
+
+ if not module.weight.device == torch.device("meta"):
+ new_module.weight.data.copy_(module.weight)
+ new_module.bias.data.copy_(module.bias)
+ new_module.running_mean.data.copy_(module.running_mean)
+ new_module.running_var.data.copy_(module.running_var)
+
+ model._modules[name] = new_module
+
+ if len(list(module.children())) > 0:
+ replace_batch_norm(module)
+
+
+def get_contrastive_denoising_training_group(
+ targets,
+ num_classes,
+ num_queries,
+ class_embed,
+ num_denoising_queries=100,
+ label_noise_ratio=0.5,
+ box_noise_scale=1.0,
+):
+ """
+ Creates a contrastive denoising training group using ground-truth samples. It adds noise to labels and boxes.
+
+ Args:
+ targets (`List[dict]`):
+ The target objects, each containing 'class_labels' and 'boxes' for objects in an image.
+ num_classes (`int`):
+ Total number of classes in the dataset.
+ num_queries (`int`):
+ Number of query slots in the transformer.
+ class_embed (`callable`):
+ A function or a model layer to embed class labels.
+ num_denoising_queries (`int`, *optional*, defaults to 100):
+ Number of denoising queries.
+ label_noise_ratio (`float`, *optional*, defaults to 0.5):
+ Ratio of noise applied to labels.
+ box_noise_scale (`float`, *optional*, defaults to 1.0):
+ Scale of noise applied to bounding boxes.
+ Returns:
+ `tuple` comprising various elements:
+ - **input_query_class** (`torch.FloatTensor`) --
+ Class queries with applied label noise.
+ - **input_query_bbox** (`torch.FloatTensor`) --
+ Bounding box queries with applied box noise.
+ - **attn_mask** (`torch.FloatTensor`) --
+ Attention mask for separating denoising and reconstruction queries.
+ - **denoising_meta_values** (`dict`) --
+ Metadata including denoising positive indices, number of groups, and split sizes.
+ """
+
+ if num_denoising_queries <= 0:
+ return None, None, None, None
+
+ num_ground_truths = [len(t["class_labels"]) for t in targets]
+ device = targets[0]["class_labels"].device
+
+ max_gt_num = max(num_ground_truths)
+ if max_gt_num == 0:
+ return None, None, None, None
+
+ num_groups_denoising_queries = num_denoising_queries // max_gt_num
+ num_groups_denoising_queries = 1 if num_groups_denoising_queries == 0 else num_groups_denoising_queries
+ # pad gt to max_num of a batch
+ batch_size = len(num_ground_truths)
+
+ input_query_class = torch.full([batch_size, max_gt_num], num_classes, dtype=torch.int32, device=device)
+ input_query_bbox = torch.zeros([batch_size, max_gt_num, 4], device=device)
+ pad_gt_mask = torch.zeros([batch_size, max_gt_num], dtype=torch.bool, device=device)
+
+ for i in range(batch_size):
+ num_gt = num_ground_truths[i]
+ if num_gt > 0:
+ input_query_class[i, :num_gt] = targets[i]["class_labels"]
+ input_query_bbox[i, :num_gt] = targets[i]["boxes"]
+ pad_gt_mask[i, :num_gt] = 1
+ # each group has positive and negative queries.
+ input_query_class = input_query_class.tile([1, 2 * num_groups_denoising_queries])
+ input_query_bbox = input_query_bbox.tile([1, 2 * num_groups_denoising_queries, 1])
+ pad_gt_mask = pad_gt_mask.tile([1, 2 * num_groups_denoising_queries])
+ # positive and negative mask
+ negative_gt_mask = torch.zeros([batch_size, max_gt_num * 2, 1], device=device)
+ negative_gt_mask[:, max_gt_num:] = 1
+ negative_gt_mask = negative_gt_mask.tile([1, num_groups_denoising_queries, 1])
+ positive_gt_mask = 1 - negative_gt_mask
+ # contrastive denoising training positive index
+ positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
+ denoise_positive_idx = torch.nonzero(positive_gt_mask)[:, 1]
+ denoise_positive_idx = torch.split(
+ denoise_positive_idx, [n * num_groups_denoising_queries for n in num_ground_truths]
+ )
+ # total denoising queries
+ num_denoising_queries = int(max_gt_num * 2 * num_groups_denoising_queries)
+
+ if label_noise_ratio > 0:
+ mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5)
+ # randomly put a new one here
+ new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype)
+ input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class)
+
+ if box_noise_scale > 0:
+ known_bbox = center_to_corners_format(input_query_bbox)
+ diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale
+ rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0
+ rand_part = torch.rand_like(input_query_bbox)
+ rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask)
+ rand_part *= rand_sign
+ known_bbox += rand_part * diff
+ known_bbox.clip_(min=0.0, max=1.0)
+ input_query_bbox = corners_to_center_format(known_bbox)
+ input_query_bbox = inverse_sigmoid(input_query_bbox)
+
+ input_query_class = class_embed(input_query_class)
+
+ target_size = num_denoising_queries + num_queries
+ attn_mask = torch.full([target_size, target_size], False, dtype=torch.bool, device=device)
+ # match query cannot see the reconstruction
+ attn_mask[num_denoising_queries:, :num_denoising_queries] = True
+
+ # reconstructions cannot see each other
+ for i in range(num_groups_denoising_queries):
+ idx_block_start = max_gt_num * 2 * i
+ idx_block_end = max_gt_num * 2 * (i + 1)
+ attn_mask[idx_block_start:idx_block_end, :idx_block_start] = True
+ attn_mask[idx_block_start:idx_block_end, idx_block_end:num_denoising_queries] = True
+
+ denoising_meta_values = {
+ "dn_positive_idx": denoise_positive_idx,
+ "dn_num_group": num_groups_denoising_queries,
+ "dn_num_split": [num_denoising_queries, num_queries],
+ }
+
+ return input_query_class, input_query_bbox, attn_mask, denoising_meta_values
+
+
+class RTDetrConvEncoder(nn.Module):
+ """
+ Convolutional backbone using the modeling_rt_detr_resnet.py.
+
+ nn.BatchNorm2d layers are replaced by RTDetrFrozenBatchNorm2d as defined above.
+ https://github.com/lyuwenyu/RT-DETR/blob/main/rtdetr_pytorch/src/nn/backbone/presnet.py#L142
+ """
+
+ def __init__(self, config):
+ super().__init__()
+
+ backbone = load_backbone(config)
+
+ if config.freeze_backbone_batch_norms:
+ # replace batch norm by frozen batch norm
+ with torch.no_grad():
+ replace_batch_norm(backbone)
+ self.model = backbone
+ self.intermediate_channel_sizes = self.model.channels
+
+ def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
+ # send pixel_values through the model to get list of feature maps
+ features = self.model(pixel_values).feature_maps
+
+ out = []
+ for feature_map in features:
+ # downsample pixel_mask to match shape of corresponding feature_map
+ mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
+ out.append((feature_map, mask))
+ return out
+
+
+class RTDetrConvNormLayer(nn.Module):
+ def __init__(self, config, in_channels, out_channels, kernel_size, stride, padding=None, activation=None):
+ super().__init__()
+ self.conv = nn.Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride,
+ padding=(kernel_size - 1) // 2 if padding is None else padding,
+ bias=False,
+ )
+ self.norm = nn.BatchNorm2d(out_channels, config.batch_norm_eps)
+ self.activation = nn.Identity() if activation is None else ACT2CLS[activation]()
+
+ def forward(self, hidden_state):
+ hidden_state = self.conv(hidden_state)
+ hidden_state = self.norm(hidden_state)
+ hidden_state = self.activation(hidden_state)
+ return hidden_state
+
+
+class RTDetrEncoderLayer(nn.Module):
+ def __init__(self, config: RTDetrConfig):
+ super().__init__()
+ self.normalize_before = config.normalize_before
+
+ # self-attention
+ self.self_attn = RTDetrMultiheadAttention(
+ embed_dim=config.encoder_hidden_dim,
+ num_heads=config.num_attention_heads,
+ dropout=config.dropout,
+ )
+ self.self_attn_layer_norm = nn.LayerNorm(config.encoder_hidden_dim, eps=config.layer_norm_eps)
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.encoder_activation_function]
+ self.activation_dropout = config.activation_dropout
+ self.fc1 = nn.Linear(config.encoder_hidden_dim, config.encoder_ffn_dim)
+ self.fc2 = nn.Linear(config.encoder_ffn_dim, config.encoder_hidden_dim)
+ self.final_layer_norm = nn.LayerNorm(config.encoder_hidden_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ position_embeddings: torch.Tensor = None,
+ output_attentions: bool = False,
+ **kwargs,
+ ):
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+ values.
+ position_embeddings (`torch.FloatTensor`, *optional*):
+ Object queries (also called content embeddings), to be added to the hidden states.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+ residual = hidden_states
+ if self.normalize_before:
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_embeddings=position_embeddings,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ if not self.normalize_before:
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ if self.normalize_before:
+ hidden_states = self.final_layer_norm(hidden_states)
+ residual = hidden_states
+
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+
+ hidden_states = self.fc2(hidden_states)
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ hidden_states = residual + hidden_states
+ if not self.normalize_before:
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ if self.training:
+ if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+ hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
+
+class RTDetrRepVggBlock(nn.Module):
+ """
+ RepVGG architecture block introduced by the work "RepVGG: Making VGG-style ConvNets Great Again".
+ """
+
+ def __init__(self, config: RTDetrConfig):
+ super().__init__()
+
+ activation = config.activation_function
+ hidden_channels = int(config.encoder_hidden_dim * config.hidden_expansion)
+ self.conv1 = RTDetrConvNormLayer(config, hidden_channels, hidden_channels, 3, 1, padding=1)
+ self.conv2 = RTDetrConvNormLayer(config, hidden_channels, hidden_channels, 1, 1, padding=0)
+ self.activation = nn.Identity() if activation is None else ACT2CLS[activation]()
+
+ def forward(self, x):
+ y = self.conv1(x) + self.conv2(x)
+ return self.activation(y)
+
+
+class RTDetrCSPRepLayer(nn.Module):
+ """
+ Cross Stage Partial (CSP) network layer with RepVGG blocks.
+ """
+
+ def __init__(self, config: RTDetrConfig):
+ super().__init__()
+
+ in_channels = config.encoder_hidden_dim * 2
+ out_channels = config.encoder_hidden_dim
+ num_blocks = 3
+ activation = config.activation_function
+
+ hidden_channels = int(out_channels * config.hidden_expansion)
+ self.conv1 = RTDetrConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation)
+ self.conv2 = RTDetrConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation)
+ self.bottlenecks = nn.Sequential(*[RTDetrRepVggBlock(config) for _ in range(num_blocks)])
+ if hidden_channels != out_channels:
+ self.conv3 = RTDetrConvNormLayer(config, hidden_channels, out_channels, 1, 1, activation=activation)
+ else:
+ self.conv3 = nn.Identity()
+
+ def forward(self, hidden_state):
+ device = hidden_state.device
+ hidden_state_1 = self.conv1(hidden_state)
+ hidden_state_1 = self.bottlenecks(hidden_state_1).to(device)
+ hidden_state_2 = self.conv2(hidden_state).to(device)
+ return self.conv3(hidden_state_1 + hidden_state_2)
+
+
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention
+def multi_scale_deformable_attention(
+ value: Tensor,
+ value_spatial_shapes: Union[Tensor, List[Tuple]],
+ sampling_locations: Tensor,
+ attention_weights: Tensor,
+) -> Tensor:
+ batch_size, _, num_heads, hidden_dim = value.shape
+ _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+ value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1)
+ sampling_grids = 2 * sampling_locations - 1
+ sampling_value_list = []
+ for level_id, (height, width) in enumerate(value_spatial_shapes):
+ # batch_size, height*width, num_heads, hidden_dim
+ # -> batch_size, height*width, num_heads*hidden_dim
+ # -> batch_size, num_heads*hidden_dim, height*width
+ # -> batch_size*num_heads, hidden_dim, height, width
+ value_l_ = (
+ value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width)
+ )
+ # batch_size, num_queries, num_heads, num_points, 2
+ # -> batch_size, num_heads, num_queries, num_points, 2
+ # -> batch_size*num_heads, num_queries, num_points, 2
+ sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
+ # batch_size*num_heads, hidden_dim, num_queries, num_points
+ sampling_value_l_ = nn.functional.grid_sample(
+ value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
+ )
+ sampling_value_list.append(sampling_value_l_)
+ # (batch_size, num_queries, num_heads, num_levels, num_points)
+ # -> (batch_size, num_heads, num_queries, num_levels, num_points)
+ # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points)
+ attention_weights = attention_weights.transpose(1, 2).reshape(
+ batch_size * num_heads, 1, num_queries, num_levels * num_points
+ )
+ output = (
+ (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
+ .sum(-1)
+ .view(batch_size, num_heads * hidden_dim, num_queries)
+ )
+ return output.transpose(1, 2).contiguous()
+
+
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->RTDetr
+class RTDetrMultiscaleDeformableAttention(nn.Module):
+ """
+ Multiscale deformable attention as proposed in Deformable DETR.
+ """
+
+ def __init__(self, config: RTDetrConfig, num_heads: int, n_points: int):
+ super().__init__()
+
+ kernel_loaded = MultiScaleDeformableAttention is not None
+ if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded:
+ try:
+ load_cuda_kernels()
+ except Exception as e:
+ logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}")
+
+ if config.d_model % num_heads != 0:
+ raise ValueError(
+ f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}"
+ )
+ dim_per_head = config.d_model // num_heads
+ # check if dim_per_head is power of 2
+ if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
+ warnings.warn(
+ "You'd better set embed_dim (d_model) in RTDetrMultiscaleDeformableAttention to make the"
+ " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA"
+ " implementation."
+ )
+
+ self.im2col_step = 64
+
+ self.d_model = config.d_model
+ self.n_levels = config.num_feature_levels
+ self.n_heads = num_heads
+ self.n_points = n_points
+
+ self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2)
+ self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points)
+ self.value_proj = nn.Linear(config.d_model, config.d_model)
+ self.output_proj = nn.Linear(config.d_model, config.d_model)
+
+ self.disable_custom_kernels = config.disable_custom_kernels
+
+ def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+ return tensor if position_embeddings is None else tensor + position_embeddings
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ position_embeddings: Optional[torch.Tensor] = None,
+ reference_points=None,
+ spatial_shapes=None,
+ spatial_shapes_list=None,
+ level_start_index=None,
+ output_attentions: bool = False,
+ ):
+ # add position embeddings to the hidden states before projecting to queries and keys
+ if position_embeddings is not None:
+ hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+ batch_size, num_queries, _ = hidden_states.shape
+ batch_size, sequence_length, _ = encoder_hidden_states.shape
+ total_elements = sum(height * width for height, width in spatial_shapes_list)
+ if total_elements != sequence_length:
+ raise ValueError(
+ "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
+ )
+
+ value = self.value_proj(encoder_hidden_states)
+ if attention_mask is not None:
+ # we invert the attention_mask
+ value = value.masked_fill(~attention_mask[..., None], float(0))
+ value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)
+ sampling_offsets = self.sampling_offsets(hidden_states).view(
+ batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2
+ )
+ attention_weights = self.attention_weights(hidden_states).view(
+ batch_size, num_queries, self.n_heads, self.n_levels * self.n_points
+ )
+ attention_weights = F.softmax(attention_weights, -1).view(
+ batch_size, num_queries, self.n_heads, self.n_levels, self.n_points
+ )
+ # batch_size, num_queries, n_heads, n_levels, n_points, 2
+ num_coordinates = reference_points.shape[-1]
+ if num_coordinates == 2:
+ offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+ sampling_locations = (
+ reference_points[:, :, None, :, None, :]
+ + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+ )
+ elif num_coordinates == 4:
+ sampling_locations = (
+ reference_points[:, :, None, :, None, :2]
+ + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+ )
+ else:
+ raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
+
+ if self.disable_custom_kernels or MultiScaleDeformableAttention is None:
+ # PyTorch implementation
+ output = multi_scale_deformable_attention(
+ value, spatial_shapes_list, sampling_locations, attention_weights
+ )
+ else:
+ try:
+ # custom kernel
+ output = MultiScaleDeformableAttentionFunction.apply(
+ value,
+ spatial_shapes,
+ level_start_index,
+ sampling_locations,
+ attention_weights,
+ self.im2col_step,
+ )
+ except Exception:
+ # PyTorch implementation
+ output = multi_scale_deformable_attention(
+ value, spatial_shapes_list, sampling_locations, attention_weights
+ )
+ output = self.output_proj(output)
+
+ return output, attention_weights
+
+
+class RTDetrMultiheadAttention(nn.Module):
+ """
+ Multi-headed attention from 'Attention Is All You Need' paper.
+
+ Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper).
+ """
+
+ def __init__(
+ self,
+ embed_dim: int,
+ num_heads: int,
+ dropout: float = 0.0,
+ bias: bool = True,
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = embed_dim // num_heads
+ if self.head_dim * num_heads != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+ f" {num_heads})."
+ )
+ self.scaling = self.head_dim**-0.5
+
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+ def _reshape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
+ return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+ return tensor if position_embeddings is None else tensor + position_embeddings
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_embeddings: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+
+ batch_size, target_len, embed_dim = hidden_states.size()
+ # add position embeddings to the hidden states before projecting to queries and keys
+ if position_embeddings is not None:
+ hidden_states_original = hidden_states
+ hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+ # get queries, keys and values
+ query_states = self.q_proj(hidden_states) * self.scaling
+ key_states = self._reshape(self.k_proj(hidden_states), -1, batch_size)
+ value_states = self._reshape(self.v_proj(hidden_states_original), -1, batch_size)
+
+ proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
+ query_states = self._reshape(query_states, target_len, batch_size).view(*proj_shape)
+ key_states = key_states.view(*proj_shape)
+ value_states = value_states.view(*proj_shape)
+
+ source_len = key_states.size(1)
+
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+ if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
+ raise ValueError(
+ f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ # expand attention_mask
+ if attention_mask is not None:
+ # [seq_len, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+ attention_mask = attention_mask.expand(batch_size, 1, *attention_mask.size())
+
+ if attention_mask is not None:
+ if attention_mask.size() != (batch_size, 1, target_len, source_len):
+ raise ValueError(
+ f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is"
+ f" {attention_mask.size()}"
+ )
+ attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask
+ attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len)
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+ if output_attentions:
+ # this operation is a bit awkward, but it's required to
+ # make sure that attn_weights keeps its gradient.
+ # In order to do so, attn_weights have to reshaped
+ # twice and have to be reused in the following
+ attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len)
+ attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len)
+ else:
+ attn_weights_reshaped = None
+
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+ attn_output = torch.bmm(attn_probs, value_states)
+
+ if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim)
+ attn_output = attn_output.transpose(1, 2)
+ attn_output = attn_output.reshape(batch_size, target_len, embed_dim)
+
+ attn_output = self.out_proj(attn_output)
+
+ return attn_output, attn_weights_reshaped
+
+
+class RTDetrDecoderLayer(nn.Module):
+ def __init__(self, config: RTDetrConfig):
+ super().__init__()
+ # self-attention
+ self.self_attn = RTDetrMultiheadAttention(
+ embed_dim=config.d_model,
+ num_heads=config.decoder_attention_heads,
+ dropout=config.attention_dropout,
+ )
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.decoder_activation_function]
+ self.activation_dropout = config.activation_dropout
+
+ self.self_attn_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
+ # cross-attention
+ self.encoder_attn = RTDetrMultiscaleDeformableAttention(
+ config,
+ num_heads=config.decoder_attention_heads,
+ n_points=config.decoder_n_points,
+ )
+ self.encoder_attn_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
+ # feedforward neural networks
+ self.fc1 = nn.Linear(config.d_model, config.decoder_ffn_dim)
+ self.fc2 = nn.Linear(config.decoder_ffn_dim, config.d_model)
+ self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_embeddings: Optional[torch.Tensor] = None,
+ reference_points=None,
+ spatial_shapes=None,
+ spatial_shapes_list=None,
+ level_start_index=None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = False,
+ ):
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`):
+ Input to the layer of shape `(seq_len, batch, embed_dim)`.
+ position_embeddings (`torch.FloatTensor`, *optional*):
+ Position embeddings that are added to the queries and keys in the self-attention layer.
+ reference_points (`torch.FloatTensor`, *optional*):
+ Reference points.
+ spatial_shapes (`torch.LongTensor`, *optional*):
+ Spatial shapes.
+ level_start_index (`torch.LongTensor`, *optional*):
+ Level start index.
+ encoder_hidden_states (`torch.FloatTensor`):
+ cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+ values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+ residual = hidden_states
+
+ # Self Attention
+ hidden_states, self_attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=encoder_attention_mask,
+ position_embeddings=position_embeddings,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ second_residual = hidden_states
+
+ # Cross-Attention
+ cross_attn_weights = None
+ hidden_states, cross_attn_weights = self.encoder_attn(
+ hidden_states=hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ position_embeddings=position_embeddings,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = second_residual + hidden_states
+
+ hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+ hidden_states = self.fc2(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights, cross_attn_weights)
+
+ return outputs
+
+
+class RTDetrPreTrainedModel(PreTrainedModel):
+ config_class = RTDetrConfig
+ base_model_prefix = "rt_detr"
+ main_input_name = "pixel_values"
+ _no_split_modules = [r"RTDetrConvEncoder", r"RTDetrEncoderLayer", r"RTDetrDecoderLayer"]
+
+ def _init_weights(self, module):
+ """Initalize the weights"""
+
+ """initialize linear layer bias value according to a given probability value."""
+ if isinstance(module, (RTDetrForObjectDetection, RTDetrDecoder)):
+ if module.class_embed is not None:
+ for layer in module.class_embed:
+ prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
+ bias = float(-math.log((1 - prior_prob) / prior_prob))
+ nn.init.xavier_uniform_(layer.weight)
+ nn.init.constant_(layer.bias, bias)
+
+ if module.bbox_embed is not None:
+ for layer in module.bbox_embed:
+ nn.init.constant_(layer.layers[-1].weight, 0)
+ nn.init.constant_(layer.layers[-1].bias, 0)
+
+ if isinstance(module, RTDetrMultiscaleDeformableAttention):
+ nn.init.constant_(module.sampling_offsets.weight.data, 0.0)
+ default_dtype = torch.get_default_dtype()
+ thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * (
+ 2.0 * math.pi / module.n_heads
+ )
+ grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+ grid_init = (
+ (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
+ .view(module.n_heads, 1, 1, 2)
+ .repeat(1, module.n_levels, module.n_points, 1)
+ )
+ for i in range(module.n_points):
+ grid_init[:, :, i, :] *= i + 1
+ with torch.no_grad():
+ module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+ nn.init.constant_(module.attention_weights.weight.data, 0.0)
+ nn.init.constant_(module.attention_weights.bias.data, 0.0)
+ nn.init.xavier_uniform_(module.value_proj.weight.data)
+ nn.init.constant_(module.value_proj.bias.data, 0.0)
+ nn.init.xavier_uniform_(module.output_proj.weight.data)
+ nn.init.constant_(module.output_proj.bias.data, 0.0)
+
+ if isinstance(module, RTDetrModel):
+ prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
+ bias = float(-math.log((1 - prior_prob) / prior_prob))
+ nn.init.xavier_uniform_(module.enc_score_head.weight)
+ nn.init.constant_(module.enc_score_head.bias, bias)
+
+ if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+
+ if hasattr(module, "weight_embedding") and self.config.learn_initial_query:
+ nn.init.xavier_uniform_(module.weight_embedding.weight)
+ if hasattr(module, "denoising_class_embed") and self.config.num_denoising > 0:
+ nn.init.xavier_uniform_(module.denoising_class_embed.weight)
+
+
+RTDETR_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`RTDetrConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+RTDETR_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`RTDetrImageProcessor.__call__`] for details.
+ pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+ Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
+
+ - 1 for pixels that are real (i.e. **not masked**),
+ - 0 for pixels that are padding (i.e. **masked**).
+
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+ can choose to directly pass a flattened representation of an image.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+ Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+ embedded representation.
+ labels (`List[Dict]` of len `(batch_size,)`, *optional*):
+ Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+ following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+ in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class RTDetrEncoder(nn.Module):
+ def __init__(self, config: RTDetrConfig):
+ super().__init__()
+
+ self.layers = nn.ModuleList([RTDetrEncoderLayer(config) for _ in range(config.encoder_layers)])
+
+ def forward(self, src, src_mask=None, pos_embed=None, output_attentions: bool = False) -> torch.Tensor:
+ hidden_states = src
+ for layer in self.layers:
+ hidden_states = layer(
+ hidden_states,
+ attention_mask=src_mask,
+ position_embeddings=pos_embed,
+ output_attentions=output_attentions,
+ )
+ return hidden_states
+
+
+class RTDetrHybridEncoder(nn.Module):
+ """
+ Decoder consisting of a projection layer, a set of `RTDetrEncoder`, a top-down Feature Pyramid Network
+ (FPN) and a bottom-up Path Aggregation Network (PAN). More details on the paper: https://arxiv.org/abs/2304.08069
+
+ Args:
+ config: RTDetrConfig
+ """
+
+ def __init__(self, config: RTDetrConfig):
+ super().__init__()
+ self.config = config
+ self.in_channels = config.encoder_in_channels
+ self.feat_strides = config.feat_strides
+ self.encoder_hidden_dim = config.encoder_hidden_dim
+ self.encode_proj_layers = config.encode_proj_layers
+ self.positional_encoding_temperature = config.positional_encoding_temperature
+ self.eval_size = config.eval_size
+ self.out_channels = [self.encoder_hidden_dim for _ in self.in_channels]
+ self.out_strides = self.feat_strides
+ activation_function = config.activation_function
+
+ # encoder transformer
+ self.encoder = nn.ModuleList([RTDetrEncoder(config) for _ in range(len(self.encode_proj_layers))])
+ # top-down fpn
+ self.lateral_convs = nn.ModuleList()
+ self.fpn_blocks = nn.ModuleList()
+ for _ in range(len(self.in_channels) - 1, 0, -1):
+ self.lateral_convs.append(
+ RTDetrConvNormLayer(
+ config, self.encoder_hidden_dim, self.encoder_hidden_dim, 1, 1, activation=activation_function
+ )
+ )
+ self.fpn_blocks.append(RTDetrCSPRepLayer(config))
+
+ # bottom-up pan
+ self.downsample_convs = nn.ModuleList()
+ self.pan_blocks = nn.ModuleList()
+ for _ in range(len(self.in_channels) - 1):
+ self.downsample_convs.append(
+ RTDetrConvNormLayer(
+ config, self.encoder_hidden_dim, self.encoder_hidden_dim, 3, 2, activation=activation_function
+ )
+ )
+ self.pan_blocks.append(RTDetrCSPRepLayer(config))
+
+ @staticmethod
+ def build_2d_sincos_position_embedding(
+ width, height, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
+ ):
+ grid_w = torch.arange(int(width), dtype=dtype, device=device)
+ grid_h = torch.arange(int(height), dtype=dtype, device=device)
+ grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
+ if embed_dim % 4 != 0:
+ raise ValueError("Embed dimension must be divisible by 4 for 2D sin-cos position embedding")
+ pos_dim = embed_dim // 4
+ omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
+ omega = 1.0 / (temperature**omega)
+
+ out_w = grid_w.flatten()[..., None] @ omega[None]
+ out_h = grid_h.flatten()[..., None] @ omega[None]
+
+ return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :]
+
+ def forward(
+ self,
+ inputs_embeds=None,
+ attention_mask=None,
+ position_embeddings=None,
+ spatial_shapes=None,
+ level_start_index=None,
+ valid_ratios=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
+ - 1 for pixel features that are real (i.e. **not masked**),
+ - 0 for pixel features that are padding (i.e. **masked**).
+ [What are attention masks?](../glossary#attention-mask)
+ position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Position embeddings that are added to the queries and keys in each self-attention layer.
+ spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+ Spatial shapes of each feature map.
+ level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`):
+ Starting index of each feature map.
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
+ Ratio of valid area in each feature level.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ hidden_states = inputs_embeds
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+ # encoder
+ if self.config.encoder_layers > 0:
+ for i, enc_ind in enumerate(self.encode_proj_layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states[enc_ind],)
+ height, width = hidden_states[enc_ind].shape[2:]
+ # flatten [batch, channel, height, width] to [batch, height*width, channel]
+ src_flatten = hidden_states[enc_ind].flatten(2).permute(0, 2, 1)
+ if self.training or self.eval_size is None:
+ pos_embed = self.build_2d_sincos_position_embedding(
+ width,
+ height,
+ self.encoder_hidden_dim,
+ self.positional_encoding_temperature,
+ device=src_flatten.device,
+ dtype=src_flatten.dtype,
+ )
+ else:
+ pos_embed = None
+
+ layer_outputs = self.encoder[i](
+ src_flatten,
+ pos_embed=pos_embed,
+ output_attentions=output_attentions,
+ )
+ hidden_states[enc_ind] = (
+ layer_outputs[0].permute(0, 2, 1).reshape(-1, self.encoder_hidden_dim, height, width).contiguous()
+ )
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states[enc_ind],)
+
+ # broadcasting and fusion
+ fpn_feature_maps = [hidden_states[-1]]
+ for idx in range(len(self.in_channels) - 1, 0, -1):
+ feat_high = fpn_feature_maps[0]
+ feat_low = hidden_states[idx - 1]
+ feat_high = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_high)
+ fpn_feature_maps[0] = feat_high
+ upsample_feat = F.interpolate(feat_high, scale_factor=2.0, mode="nearest")
+ fps_map = self.fpn_blocks[len(self.in_channels) - 1 - idx](torch.concat([upsample_feat, feat_low], dim=1))
+ fpn_feature_maps.insert(0, fps_map)
+
+ fpn_states = [fpn_feature_maps[0]]
+ for idx in range(len(self.in_channels) - 1):
+ feat_low = fpn_states[-1]
+ feat_high = fpn_feature_maps[idx + 1]
+ downsample_feat = self.downsample_convs[idx](feat_low)
+ hidden_states = self.pan_blocks[idx](
+ torch.concat([downsample_feat, feat_high.to(downsample_feat.device)], dim=1)
+ )
+ fpn_states.append(hidden_states)
+
+ if not return_dict:
+ return tuple(v for v in [fpn_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(last_hidden_state=fpn_states, hidden_states=encoder_states, attentions=all_attentions)
+
+
+class RTDetrDecoder(RTDetrPreTrainedModel):
+ def __init__(self, config: RTDetrConfig):
+ super().__init__(config)
+
+ self.dropout = config.dropout
+ self.layers = nn.ModuleList([RTDetrDecoderLayer(config) for _ in range(config.decoder_layers)])
+ self.query_pos_head = RTDetrMLPPredictionHead(config, 4, 2 * config.d_model, config.d_model, num_layers=2)
+
+ # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
+ self.bbox_embed = None
+ self.class_embed = None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ inputs_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ position_embeddings=None,
+ reference_points=None,
+ spatial_shapes=None,
+ spatial_shapes_list=None,
+ level_start_index=None,
+ valid_ratios=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+ The query embeddings that are passed into the decoder.
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+ of the decoder.
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+ in `[0, 1]`:
+ - 1 for pixels that are real (i.e. **not masked**),
+ - 0 for pixels that are padding (i.e. **masked**).
+ position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+ Position embeddings that are added to the queries and keys in each self-attention layer.
+ reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*):
+ Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
+ spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
+ Spatial shapes of the feature maps.
+ level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+ Indexes for the start of each feature level. In range `[0, sequence_length]`.
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+ Ratio of valid area in each feature level.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if inputs_embeds is not None:
+ hidden_states = inputs_embeds
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+ intermediate = ()
+ intermediate_reference_points = ()
+ intermediate_logits = ()
+
+ reference_points = F.sigmoid(reference_points)
+
+ # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L252
+ for idx, decoder_layer in enumerate(self.layers):
+ reference_points_input = reference_points.unsqueeze(2)
+ position_embeddings = self.query_pos_head(reference_points)
+
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ layer_outputs = decoder_layer(
+ hidden_states,
+ position_embeddings=position_embeddings,
+ encoder_hidden_states=encoder_hidden_states,
+ reference_points=reference_points_input,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ encoder_attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ # hack implementation for iterative bounding box refinement
+ if self.bbox_embed is not None:
+ tmp = self.bbox_embed[idx](hidden_states)
+ new_reference_points = F.sigmoid(tmp + inverse_sigmoid(reference_points))
+ reference_points = new_reference_points.detach()
+
+ intermediate += (hidden_states,)
+ intermediate_reference_points += (
+ (new_reference_points,) if self.bbox_embed is not None else (reference_points,)
+ )
+
+ if self.class_embed is not None:
+ logits = self.class_embed[idx](hidden_states)
+ intermediate_logits += (logits,)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ if encoder_hidden_states is not None:
+ all_cross_attentions += (layer_outputs[2],)
+
+ # Keep batch_size as first dimension
+ intermediate = torch.stack(intermediate, dim=1)
+ intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
+ if self.class_embed is not None:
+ intermediate_logits = torch.stack(intermediate_logits, dim=1)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ intermediate,
+ intermediate_logits,
+ intermediate_reference_points,
+ all_hidden_states,
+ all_self_attns,
+ all_cross_attentions,
+ ]
+ if v is not None
+ )
+ return RTDetrDecoderOutput(
+ last_hidden_state=hidden_states,
+ intermediate_hidden_states=intermediate,
+ intermediate_logits=intermediate_logits,
+ intermediate_reference_points=intermediate_reference_points,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+def compile_compatible_lru_cache(*lru_args, **lru_kwargs):
+ def decorator(func):
+ @wraps(func)
+ def wrapper(self, *args, **kwargs):
+ if not torch.compiler.is_compiling():
+ # Cache the function only if the model is not being compiled
+ # check if the function is already cached, otherwise create it
+ if not hasattr(self, f"_cached_{func.__name__}"):
+ self.__setattr__(
+ f"_cached_{func.__name__}", lru_cache(*lru_args, **lru_kwargs)(func.__get__(self))
+ )
+ return self.__getattribute__(f"_cached_{func.__name__}")(*args, **kwargs)
+ else:
+ # Otherwise, just call the original function
+ return func(self, *args, **kwargs)
+
+ return wrapper
+
+ return decorator
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+class RTDetrMLPPredictionHead(nn.Module):
+ """
+ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
+ height and width of a bounding box w.r.t. an image.
+
+ Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+ Origin from https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_paddle/ppdet/modeling/transformers/utils.py#L453
+
+ """
+
+ def __init__(self, config, input_dim, d_model, output_dim, num_layers):
+ super().__init__()
+ self.num_layers = num_layers
+ h = [d_model] * (num_layers - 1)
+ self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+ def forward(self, x):
+ for i, layer in enumerate(self.layers):
+ x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+ return x
+
+
+@add_start_docstrings(
+ """
+ RT-DETR Model (consisting of a backbone and encoder-decoder) outputting raw hidden states without any head on top.
+ """,
+ RTDETR_START_DOCSTRING,
+)
+class RTDetrModel(RTDetrPreTrainedModel):
+ def __init__(self, config: RTDetrConfig):
+ super().__init__(config)
+
+ # Create backbone
+ self.backbone = RTDetrConvEncoder(config)
+ intermediate_channel_sizes = self.backbone.intermediate_channel_sizes
+
+ # Create encoder input projection layers
+ # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/hybrid_encoder.py#L212
+ num_backbone_outs = len(intermediate_channel_sizes)
+ encoder_input_proj_list = []
+ for _ in range(num_backbone_outs):
+ in_channels = intermediate_channel_sizes[_]
+ encoder_input_proj_list.append(
+ nn.Sequential(
+ nn.Conv2d(in_channels, config.encoder_hidden_dim, kernel_size=1, bias=False),
+ nn.BatchNorm2d(config.encoder_hidden_dim),
+ )
+ )
+ self.encoder_input_proj = nn.ModuleList(encoder_input_proj_list)
+
+ # Create encoder
+ self.encoder = RTDetrHybridEncoder(config)
+
+ # denoising part
+ if config.num_denoising > 0:
+ self.denoising_class_embed = nn.Embedding(
+ config.num_labels + 1, config.d_model, padding_idx=config.num_labels
+ )
+
+ # decoder embedding
+ if config.learn_initial_query:
+ self.weight_embedding = nn.Embedding(config.num_queries, config.d_model)
+
+ # encoder head
+ self.enc_output = nn.Sequential(
+ nn.Linear(config.d_model, config.d_model),
+ nn.LayerNorm(config.d_model, eps=config.layer_norm_eps),
+ )
+ self.enc_score_head = nn.Linear(config.d_model, config.num_labels)
+ self.enc_bbox_head = RTDetrMLPPredictionHead(config, config.d_model, config.d_model, 4, num_layers=3)
+
+ # init encoder output anchors and valid_mask
+ if config.anchor_image_size:
+ self.anchors, self.valid_mask = self.generate_anchors(dtype=self.dtype)
+
+ # Create decoder input projection layers
+ # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L412
+ num_backbone_outs = len(config.decoder_in_channels)
+ decoder_input_proj_list = []
+ for _ in range(num_backbone_outs):
+ in_channels = config.decoder_in_channels[_]
+ decoder_input_proj_list.append(
+ nn.Sequential(
+ nn.Conv2d(in_channels, config.d_model, kernel_size=1, bias=False),
+ nn.BatchNorm2d(config.d_model, config.batch_norm_eps),
+ )
+ )
+ for _ in range(config.num_feature_levels - num_backbone_outs):
+ decoder_input_proj_list.append(
+ nn.Sequential(
+ nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1, bias=False),
+ nn.BatchNorm2d(config.d_model, config.batch_norm_eps),
+ )
+ )
+ in_channels = config.d_model
+ self.decoder_input_proj = nn.ModuleList(decoder_input_proj_list)
+
+ # decoder
+ self.decoder = RTDetrDecoder(config)
+
+ self.post_init()
+
+ def get_encoder(self):
+ return self.encoder
+
+ def get_decoder(self):
+ return self.decoder
+
+ def freeze_backbone(self):
+ for param in self.backbone.parameters():
+ param.requires_grad_(False)
+
+ def unfreeze_backbone(self):
+ for param in self.backbone.parameters():
+ param.requires_grad_(True)
+
+ @compile_compatible_lru_cache(maxsize=32)
+ def generate_anchors(self, spatial_shapes=None, grid_size=0.05, device="cpu", dtype=torch.float32):
+ if spatial_shapes is None:
+ spatial_shapes = [
+ [int(self.config.anchor_image_size[0] / s), int(self.config.anchor_image_size[1] / s)]
+ for s in self.config.feat_strides
+ ]
+ anchors = []
+ for level, (height, width) in enumerate(spatial_shapes):
+ grid_y, grid_x = torch.meshgrid(
+ torch.arange(end=height, dtype=dtype, device=device),
+ torch.arange(end=width, dtype=dtype, device=device),
+ indexing="ij",
+ )
+ grid_xy = torch.stack([grid_x, grid_y], -1)
+ valid_wh = torch.tensor([width, height], device=device).to(dtype)
+ grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_wh
+ wh = torch.ones_like(grid_xy) * grid_size * (2.0**level)
+ anchors.append(torch.concat([grid_xy, wh], -1).reshape(-1, height * width, 4))
+ # define the valid range for anchor coordinates
+ eps = 1e-2
+ anchors = torch.concat(anchors, 1)
+ valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True)
+ anchors = torch.log(anchors / (1 - anchors))
+ anchors = torch.where(valid_mask, anchors, torch.tensor(torch.finfo(dtype).max, dtype=dtype, device=device))
+
+ return anchors, valid_mask
+
+ @add_start_docstrings_to_model_forward(RTDETR_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=RTDetrModelOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ pixel_mask: Optional[torch.LongTensor] = None,
+ encoder_outputs: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[List[dict]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.FloatTensor], RTDetrModelOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, RTDetrModel
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
+ >>> model = RTDetrModel.from_pretrained("PekingU/rtdetr_r50vd")
+
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+
+ >>> last_hidden_states = outputs.last_hidden_state
+ >>> list(last_hidden_states.shape)
+ [1, 300, 256]
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ batch_size, num_channels, height, width = pixel_values.shape
+ device = pixel_values.device
+
+ if pixel_mask is None:
+ pixel_mask = torch.ones(((batch_size, height, width)), device=device)
+
+ features = self.backbone(pixel_values, pixel_mask)
+
+ proj_feats = [self.encoder_input_proj[level](source) for level, (source, mask) in enumerate(features)]
+
+ if encoder_outputs is None:
+ encoder_outputs = self.encoder(
+ proj_feats,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+ encoder_outputs = BaseModelOutput(
+ last_hidden_state=encoder_outputs[0],
+ hidden_states=encoder_outputs[1] if output_hidden_states else None,
+ attentions=encoder_outputs[2]
+ if len(encoder_outputs) > 2
+ else encoder_outputs[1]
+ if output_attentions
+ else None,
+ )
+
+ # Equivalent to def _get_encoder_input
+ # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L412
+ sources = []
+ for level, source in enumerate(encoder_outputs[0]):
+ sources.append(self.decoder_input_proj[level](source))
+
+ # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage
+ if self.config.num_feature_levels > len(sources):
+ _len_sources = len(sources)
+ sources.append(self.decoder_input_proj[_len_sources](encoder_outputs[0])[-1])
+ for i in range(_len_sources + 1, self.config.num_feature_levels):
+ sources.append(self.decoder_input_proj[i](encoder_outputs[0][-1]))
+
+ # Prepare encoder inputs (by flattening)
+ source_flatten = []
+ spatial_shapes_list = []
+ for level, source in enumerate(sources):
+ batch_size, num_channels, height, width = source.shape
+ spatial_shape = (height, width)
+ spatial_shapes_list.append(spatial_shape)
+ source = source.flatten(2).transpose(1, 2)
+ source_flatten.append(source)
+ source_flatten = torch.cat(source_flatten, 1)
+ spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device)
+ level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+
+ # prepare denoising training
+ if self.training and self.config.num_denoising > 0 and labels is not None:
+ (
+ denoising_class,
+ denoising_bbox_unact,
+ attention_mask,
+ denoising_meta_values,
+ ) = get_contrastive_denoising_training_group(
+ targets=labels,
+ num_classes=self.config.num_labels,
+ num_queries=self.config.num_queries,
+ class_embed=self.denoising_class_embed,
+ num_denoising_queries=self.config.num_denoising,
+ label_noise_ratio=self.config.label_noise_ratio,
+ box_noise_scale=self.config.box_noise_scale,
+ )
+ else:
+ denoising_class, denoising_bbox_unact, attention_mask, denoising_meta_values = None, None, None, None
+
+ batch_size = len(source_flatten)
+ device = source_flatten.device
+ dtype = source_flatten.dtype
+
+ # prepare input for decoder
+ if self.training or self.config.anchor_image_size is None:
+ # Pass spatial_shapes as tuple to make it hashable and make sure
+ # lru_cache is working for generate_anchors()
+ spatial_shapes_tuple = tuple(spatial_shapes_list)
+ anchors, valid_mask = self.generate_anchors(spatial_shapes_tuple, device=device, dtype=dtype)
+ else:
+ anchors, valid_mask = self.anchors, self.valid_mask
+
+ anchors, valid_mask = anchors.to(device, dtype), valid_mask.to(device, dtype)
+
+ # use the valid_mask to selectively retain values in the feature map where the mask is `True`
+ memory = valid_mask.to(source_flatten.dtype) * source_flatten
+
+ output_memory = self.enc_output(memory)
+
+ enc_outputs_class = self.enc_score_head(output_memory)
+ enc_outputs_coord_logits = self.enc_bbox_head(output_memory) + anchors
+
+ _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.config.num_queries, dim=1)
+
+ reference_points_unact = enc_outputs_coord_logits.gather(
+ dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_coord_logits.shape[-1])
+ )
+
+ enc_topk_bboxes = F.sigmoid(reference_points_unact)
+ if denoising_bbox_unact is not None:
+ reference_points_unact = torch.concat([denoising_bbox_unact, reference_points_unact], 1)
+
+ enc_topk_logits = enc_outputs_class.gather(
+ dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1])
+ )
+
+ # extract region features
+ if self.config.learn_initial_query:
+ target = self.weight_embedding.tile([batch_size, 1, 1])
+ else:
+ target = output_memory.gather(dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, output_memory.shape[-1]))
+ target = target.detach()
+
+ if denoising_class is not None:
+ target = torch.concat([denoising_class, target], 1)
+
+ init_reference_points = reference_points_unact.detach()
+
+ # decoder
+ decoder_outputs = self.decoder(
+ inputs_embeds=target,
+ encoder_hidden_states=source_flatten,
+ encoder_attention_mask=attention_mask,
+ reference_points=init_reference_points,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ if not return_dict:
+ enc_outputs = tuple(
+ value
+ for value in [enc_topk_logits, enc_topk_bboxes, enc_outputs_class, enc_outputs_coord_logits]
+ if value is not None
+ )
+ dn_outputs = tuple(value if value is not None else None for value in [denoising_meta_values])
+ tuple_outputs = decoder_outputs + encoder_outputs + (init_reference_points,) + enc_outputs + dn_outputs
+
+ return tuple_outputs
+
+ return RTDetrModelOutput(
+ last_hidden_state=decoder_outputs.last_hidden_state,
+ intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+ intermediate_logits=decoder_outputs.intermediate_logits,
+ intermediate_reference_points=decoder_outputs.intermediate_reference_points,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_attentions=decoder_outputs.attentions,
+ cross_attentions=decoder_outputs.cross_attentions,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ init_reference_points=init_reference_points,
+ enc_topk_logits=enc_topk_logits,
+ enc_topk_bboxes=enc_topk_bboxes,
+ enc_outputs_class=enc_outputs_class,
+ enc_outputs_coord_logits=enc_outputs_coord_logits,
+ denoising_meta_values=denoising_meta_values,
+ )
+
+
+@add_start_docstrings(
+ """
+ RT-DETR Model (consisting of a backbone and encoder-decoder) outputting bounding boxes and logits to be further
+ decoded into scores and classes.
+ """,
+ RTDETR_START_DOCSTRING,
+)
+class RTDetrForObjectDetection(RTDetrPreTrainedModel):
+ # When using clones, all layers > 0 will be clones, but layer 0 *is* required
+ _tied_weights_keys = ["bbox_embed", "class_embed"]
+ # We can't initialize the model on meta device as some weights are modified during the initialization
+ _no_split_modules = None
+
+ def __init__(self, config: RTDetrConfig):
+ super().__init__(config)
+
+ # RTDETR encoder-decoder model
+ self.model = RTDetrModel(config)
+
+ # Detection heads on top
+ self.class_embed = partial(nn.Linear, config.d_model, config.num_labels)
+ self.bbox_embed = partial(RTDetrMLPPredictionHead, config, config.d_model, config.d_model, 4, num_layers=3)
+
+ # if two-stage, the last class_embed and bbox_embed is for region proposal generation
+ num_pred = config.decoder_layers
+ if config.with_box_refine:
+ self.class_embed = _get_clones(self.class_embed, num_pred)
+ self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
+ else:
+ self.class_embed = nn.ModuleList([self.class_embed() for _ in range(num_pred)])
+ self.bbox_embed = nn.ModuleList([self.bbox_embed() for _ in range(num_pred)])
+
+ # hack implementation for iterative bounding box refinement
+ self.model.decoder.class_embed = self.class_embed
+ self.model.decoder.bbox_embed = self.bbox_embed
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @torch.jit.unused
+ def _set_aux_loss(self, outputs_class, outputs_coord):
+ # this is a workaround to make torchscript happy, as torchscript
+ # doesn't support dictionary with non-homogeneous values, such
+ # as a dict having both a Tensor and a list.
+ return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class, outputs_coord)]
+
+ @add_start_docstrings_to_model_forward(RTDETR_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=RTDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ pixel_mask: Optional[torch.LongTensor] = None,
+ encoder_outputs: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[List[dict]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **loss_kwargs,
+ ) -> Union[Tuple[torch.FloatTensor], RTDetrObjectDetectionOutput]:
+ r"""
+ labels (`List[Dict]` of len `(batch_size,)`, *optional*):
+ Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+ following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+ in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import RTDetrImageProcessor, RTDetrForObjectDetection
+ >>> from PIL import Image
+ >>> import requests
+ >>> import torch
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
+ >>> model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd")
+
+ >>> # prepare image for the model
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+
+ >>> # forward pass
+ >>> outputs = model(**inputs)
+
+ >>> logits = outputs.logits
+ >>> list(logits.shape)
+ [1, 300, 80]
+
+ >>> boxes = outputs.pred_boxes
+ >>> list(boxes.shape)
+ [1, 300, 4]
+
+ >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+ >>> target_sizes = torch.tensor([image.size[::-1]])
+ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
+ ... 0
+ ... ]
+
+ >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+ ... box = [round(i, 2) for i in box.tolist()]
+ ... print(
+ ... f"Detected {model.config.id2label[label.item()]} with confidence "
+ ... f"{round(score.item(), 3)} at location {box}"
+ ... )
+ Detected sofa with confidence 0.97 at location [0.14, 0.38, 640.13, 476.21]
+ Detected cat with confidence 0.96 at location [343.38, 24.28, 640.14, 371.5]
+ Detected cat with confidence 0.958 at location [13.23, 54.18, 318.98, 472.22]
+ Detected remote with confidence 0.951 at location [40.11, 73.44, 175.96, 118.48]
+ Detected remote with confidence 0.924 at location [333.73, 76.58, 369.97, 186.99]
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.model(
+ pixel_values,
+ pixel_mask=pixel_mask,
+ encoder_outputs=encoder_outputs,
+ inputs_embeds=inputs_embeds,
+ decoder_inputs_embeds=decoder_inputs_embeds,
+ labels=labels,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ denoising_meta_values = (
+ outputs.denoising_meta_values if return_dict else outputs[-1] if self.training else None
+ )
+
+ outputs_class = outputs.intermediate_logits if return_dict else outputs[2]
+ outputs_coord = outputs.intermediate_reference_points if return_dict else outputs[3]
+
+ logits = outputs_class[:, -1]
+ pred_boxes = outputs_coord[:, -1]
+
+ loss, loss_dict, auxiliary_outputs, enc_topk_logits, enc_topk_bboxes = None, None, None, None, None
+ if labels is not None:
+ if self.training and denoising_meta_values is not None:
+ enc_topk_logits = outputs.enc_topk_logits if return_dict else outputs[-5]
+ enc_topk_bboxes = outputs.enc_topk_bboxes if return_dict else outputs[-4]
+ loss, loss_dict, auxiliary_outputs = self.loss_function(
+ logits,
+ labels,
+ self.device,
+ pred_boxes,
+ self.config,
+ outputs_class,
+ outputs_coord,
+ enc_topk_logits=enc_topk_logits,
+ enc_topk_bboxes=enc_topk_bboxes,
+ denoising_meta_values=denoising_meta_values,
+ **loss_kwargs,
+ )
+
+ if not return_dict:
+ if auxiliary_outputs is not None:
+ output = (logits, pred_boxes) + (auxiliary_outputs,) + outputs
+ else:
+ output = (logits, pred_boxes) + outputs
+ return ((loss, loss_dict) + output) if loss is not None else output
+
+ return RTDetrObjectDetectionOutput(
+ loss=loss,
+ loss_dict=loss_dict,
+ logits=logits,
+ pred_boxes=pred_boxes,
+ auxiliary_outputs=auxiliary_outputs,
+ last_hidden_state=outputs.last_hidden_state,
+ intermediate_hidden_states=outputs.intermediate_hidden_states,
+ intermediate_logits=outputs.intermediate_logits,
+ intermediate_reference_points=outputs.intermediate_reference_points,
+ decoder_hidden_states=outputs.decoder_hidden_states,
+ decoder_attentions=outputs.decoder_attentions,
+ cross_attentions=outputs.cross_attentions,
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+ encoder_hidden_states=outputs.encoder_hidden_states,
+ encoder_attentions=outputs.encoder_attentions,
+ init_reference_points=outputs.init_reference_points,
+ enc_topk_logits=outputs.enc_topk_logits,
+ enc_topk_bboxes=outputs.enc_topk_bboxes,
+ enc_outputs_class=outputs.enc_outputs_class,
+ enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
+ denoising_meta_values=outputs.denoising_meta_values,
+ )
+
+
+__all__ = [
+ "RTDetrForObjectDetection",
+ "RTDetrModel",
+ "RTDetrPreTrainedModel",
+]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/modeling_rt_detr_resnet.py b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/modeling_rt_detr_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..72ef9a0ac4db8b83fc695c61ea4a5612d94d057c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/modeling_rt_detr_resnet.py
@@ -0,0 +1,440 @@
+# coding=utf-8
+# Copyright 2024 Microsoft Research, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+PyTorch RTDetr specific ResNet model. The main difference between hugginface ResNet model is that this RTDetrResNet model forces to use shortcut at the first layer in the resnet-18/34 models.
+See https://github.com/lyuwenyu/RT-DETR/blob/5b628eaa0a2fc25bdafec7e6148d5296b144af85/rtdetr_pytorch/src/nn/backbone/presnet.py#L126 for details.
+"""
+
+import math
+from typing import Optional
+
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...modeling_outputs import (
+ BackboneOutput,
+ BaseModelOutputWithNoAttention,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from ...utils.backbone_utils import BackboneMixin
+from .configuration_rt_detr_resnet import RTDetrResNetConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "RTDetrResNetConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "microsoft/resnet-50"
+_EXPECTED_OUTPUT_SHAPE = [1, 2048, 7, 7]
+
+
+# Copied from transformers.models.resnet.modeling_resnet.ResNetConvLayer -> RTDetrResNetConvLayer
+class RTDetrResNetConvLayer(nn.Module):
+ def __init__(
+ self, in_channels: int, out_channels: int, kernel_size: int = 3, stride: int = 1, activation: str = "relu"
+ ):
+ super().__init__()
+ self.convolution = nn.Conv2d(
+ in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, bias=False
+ )
+ self.normalization = nn.BatchNorm2d(out_channels)
+ self.activation = ACT2FN[activation] if activation is not None else nn.Identity()
+
+ def forward(self, input: Tensor) -> Tensor:
+ hidden_state = self.convolution(input)
+ hidden_state = self.normalization(hidden_state)
+ hidden_state = self.activation(hidden_state)
+ return hidden_state
+
+
+class RTDetrResNetEmbeddings(nn.Module):
+ """
+ ResNet Embeddings (stem) composed of a deep aggressive convolution.
+ """
+
+ def __init__(self, config: RTDetrResNetConfig):
+ super().__init__()
+ self.embedder = nn.Sequential(
+ *[
+ RTDetrResNetConvLayer(
+ config.num_channels,
+ config.embedding_size // 2,
+ kernel_size=3,
+ stride=2,
+ activation=config.hidden_act,
+ ),
+ RTDetrResNetConvLayer(
+ config.embedding_size // 2,
+ config.embedding_size // 2,
+ kernel_size=3,
+ stride=1,
+ activation=config.hidden_act,
+ ),
+ RTDetrResNetConvLayer(
+ config.embedding_size // 2,
+ config.embedding_size,
+ kernel_size=3,
+ stride=1,
+ activation=config.hidden_act,
+ ),
+ ]
+ )
+ self.pooler = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+ self.num_channels = config.num_channels
+
+ def forward(self, pixel_values: Tensor) -> Tensor:
+ num_channels = pixel_values.shape[1]
+ if num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+ )
+ embedding = self.embedder(pixel_values)
+ embedding = self.pooler(embedding)
+ return embedding
+
+
+# Copied from transformers.models.resnet.modeling_resnet.ResNetShortCut -> RTDetrResNetChortCut
+class RTDetrResNetShortCut(nn.Module):
+ """
+ ResNet shortcut, used to project the residual features to the correct size. If needed, it is also used to
+ downsample the input using `stride=2`.
+ """
+
+ def __init__(self, in_channels: int, out_channels: int, stride: int = 2):
+ super().__init__()
+ self.convolution = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
+ self.normalization = nn.BatchNorm2d(out_channels)
+
+ def forward(self, input: Tensor) -> Tensor:
+ hidden_state = self.convolution(input)
+ hidden_state = self.normalization(hidden_state)
+ return hidden_state
+
+
+class RTDetrResNetBasicLayer(nn.Module):
+ """
+ A classic ResNet's residual layer composed by two `3x3` convolutions.
+ See https://github.com/lyuwenyu/RT-DETR/blob/5b628eaa0a2fc25bdafec7e6148d5296b144af85/rtdetr_pytorch/src/nn/backbone/presnet.py#L34.
+ """
+
+ def __init__(
+ self,
+ config: RTDetrResNetConfig,
+ in_channels: int,
+ out_channels: int,
+ stride: int = 1,
+ should_apply_shortcut: bool = False,
+ ):
+ super().__init__()
+ if in_channels != out_channels:
+ self.shortcut = (
+ nn.Sequential(
+ *[nn.AvgPool2d(2, 2, 0, ceil_mode=True), RTDetrResNetShortCut(in_channels, out_channels, stride=1)]
+ )
+ if should_apply_shortcut
+ else nn.Identity()
+ )
+ else:
+ self.shortcut = (
+ RTDetrResNetShortCut(in_channels, out_channels, stride=stride)
+ if should_apply_shortcut
+ else nn.Identity()
+ )
+ self.layer = nn.Sequential(
+ RTDetrResNetConvLayer(in_channels, out_channels, stride=stride),
+ RTDetrResNetConvLayer(out_channels, out_channels, activation=None),
+ )
+ self.activation = ACT2FN[config.hidden_act]
+
+ def forward(self, hidden_state):
+ residual = hidden_state
+ hidden_state = self.layer(hidden_state)
+ residual = self.shortcut(residual)
+ hidden_state += residual
+ hidden_state = self.activation(hidden_state)
+ return hidden_state
+
+
+class RTDetrResNetBottleNeckLayer(nn.Module):
+ """
+ A classic RTDetrResNet's bottleneck layer composed by three `3x3` convolutions.
+
+ The first `1x1` convolution reduces the input by a factor of `reduction` in order to make the second `3x3`
+ convolution faster. The last `1x1` convolution remaps the reduced features to `out_channels`. If
+ `downsample_in_bottleneck` is true, downsample will be in the first layer instead of the second layer.
+ """
+
+ def __init__(
+ self,
+ config: RTDetrResNetConfig,
+ in_channels: int,
+ out_channels: int,
+ stride: int = 1,
+ ):
+ super().__init__()
+ reduction = 4
+ should_apply_shortcut = in_channels != out_channels or stride != 1
+ reduces_channels = out_channels // reduction
+ if stride == 2:
+ self.shortcut = nn.Sequential(
+ *[
+ nn.AvgPool2d(2, 2, 0, ceil_mode=True),
+ RTDetrResNetShortCut(in_channels, out_channels, stride=1)
+ if should_apply_shortcut
+ else nn.Identity(),
+ ]
+ )
+ else:
+ self.shortcut = (
+ RTDetrResNetShortCut(in_channels, out_channels, stride=stride)
+ if should_apply_shortcut
+ else nn.Identity()
+ )
+ self.layer = nn.Sequential(
+ RTDetrResNetConvLayer(
+ in_channels, reduces_channels, kernel_size=1, stride=stride if config.downsample_in_bottleneck else 1
+ ),
+ RTDetrResNetConvLayer(
+ reduces_channels, reduces_channels, stride=stride if not config.downsample_in_bottleneck else 1
+ ),
+ RTDetrResNetConvLayer(reduces_channels, out_channels, kernel_size=1, activation=None),
+ )
+ self.activation = ACT2FN[config.hidden_act]
+
+ def forward(self, hidden_state):
+ residual = hidden_state
+ hidden_state = self.layer(hidden_state)
+ residual = self.shortcut(residual)
+ hidden_state += residual
+ hidden_state = self.activation(hidden_state)
+ return hidden_state
+
+
+class RTDetrResNetStage(nn.Module):
+ """
+ A RTDetrResNet stage composed by stacked layers.
+ """
+
+ def __init__(
+ self,
+ config: RTDetrResNetConfig,
+ in_channels: int,
+ out_channels: int,
+ stride: int = 2,
+ depth: int = 2,
+ ):
+ super().__init__()
+
+ layer = RTDetrResNetBottleNeckLayer if config.layer_type == "bottleneck" else RTDetrResNetBasicLayer
+
+ if config.layer_type == "bottleneck":
+ first_layer = layer(
+ config,
+ in_channels,
+ out_channels,
+ stride=stride,
+ )
+ else:
+ first_layer = layer(config, in_channels, out_channels, stride=stride, should_apply_shortcut=True)
+ self.layers = nn.Sequential(
+ first_layer, *[layer(config, out_channels, out_channels) for _ in range(depth - 1)]
+ )
+
+ def forward(self, input: Tensor) -> Tensor:
+ hidden_state = input
+ for layer in self.layers:
+ hidden_state = layer(hidden_state)
+ return hidden_state
+
+
+# Copied from transformers.models.resnet.modeling_resnet.ResNetEncoder with ResNet->RTDetrResNet
+class RTDetrResNetEncoder(nn.Module):
+ def __init__(self, config: RTDetrResNetConfig):
+ super().__init__()
+ self.stages = nn.ModuleList([])
+ # based on `downsample_in_first_stage` the first layer of the first stage may or may not downsample the input
+ self.stages.append(
+ RTDetrResNetStage(
+ config,
+ config.embedding_size,
+ config.hidden_sizes[0],
+ stride=2 if config.downsample_in_first_stage else 1,
+ depth=config.depths[0],
+ )
+ )
+ in_out_channels = zip(config.hidden_sizes, config.hidden_sizes[1:])
+ for (in_channels, out_channels), depth in zip(in_out_channels, config.depths[1:]):
+ self.stages.append(RTDetrResNetStage(config, in_channels, out_channels, depth=depth))
+
+ def forward(
+ self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True
+ ) -> BaseModelOutputWithNoAttention:
+ hidden_states = () if output_hidden_states else None
+
+ for stage_module in self.stages:
+ if output_hidden_states:
+ hidden_states = hidden_states + (hidden_state,)
+
+ hidden_state = stage_module(hidden_state)
+
+ if output_hidden_states:
+ hidden_states = hidden_states + (hidden_state,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_state, hidden_states] if v is not None)
+
+ return BaseModelOutputWithNoAttention(
+ last_hidden_state=hidden_state,
+ hidden_states=hidden_states,
+ )
+
+
+# Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->RTDetrResNet
+class RTDetrResNetPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = RTDetrResNetConfig
+ base_model_prefix = "resnet"
+ main_input_name = "pixel_values"
+ _no_split_modules = ["RTDetrResNetConvLayer", "RTDetrResNetShortCut"]
+
+ def _init_weights(self, module):
+ if isinstance(module, nn.Conv2d):
+ nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+ # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`.
+ elif isinstance(module, nn.Linear):
+ nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+ if module.bias is not None:
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+ bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+ nn.init.uniform_(module.bias, -bound, bound)
+ elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
+ nn.init.constant_(module.weight, 1)
+ nn.init.constant_(module.bias, 0)
+
+
+RTDETR_RESNET_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`RTDetrResNetConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+RTDETR_RESNET_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`RTDetrImageProcessor.__call__`] for details.
+
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ """
+ ResNet backbone, to be used with frameworks like RTDETR.
+ """,
+ RTDETR_RESNET_START_DOCSTRING,
+)
+class RTDetrResNetBackbone(RTDetrResNetPreTrainedModel, BackboneMixin):
+ def __init__(self, config):
+ super().__init__(config)
+ super()._init_backbone(config)
+
+ self.num_features = [config.embedding_size] + config.hidden_sizes
+ self.embedder = RTDetrResNetEmbeddings(config)
+ self.encoder = RTDetrResNetEncoder(config)
+
+ # initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(RTDETR_RESNET_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
+ ) -> BackboneOutput:
+ """
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import RTDetrResNetConfig, RTDetrResNetBackbone
+ >>> import torch
+
+ >>> config = RTDetrResNetConfig()
+ >>> model = RTDetrResNetBackbone(config)
+
+ >>> pixel_values = torch.randn(1, 3, 224, 224)
+
+ >>> with torch.no_grad():
+ ... outputs = model(pixel_values)
+
+ >>> feature_maps = outputs.feature_maps
+ >>> list(feature_maps[-1].shape)
+ [1, 2048, 7, 7]
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+
+ embedding_output = self.embedder(pixel_values)
+
+ outputs = self.encoder(embedding_output, output_hidden_states=True, return_dict=True)
+
+ hidden_states = outputs.hidden_states
+
+ feature_maps = ()
+ for idx, stage in enumerate(self.stage_names):
+ if stage in self.out_features:
+ feature_maps += (hidden_states[idx],)
+
+ if not return_dict:
+ output = (feature_maps,)
+ if output_hidden_states:
+ output += (outputs.hidden_states,)
+ return output
+
+ return BackboneOutput(
+ feature_maps=feature_maps,
+ hidden_states=outputs.hidden_states if output_hidden_states else None,
+ attentions=None,
+ )
+
+
+__all__ = [
+ "RTDetrResNetBackbone",
+ "RTDetrResNetPreTrainedModel",
+]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/modular_rt_detr.py b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/modular_rt_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..59f2ea566e974761d9f694c667f0b5406cc4a6c5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/rt_detr/modular_rt_detr.py
@@ -0,0 +1,577 @@
+import pathlib
+from typing import Dict, List, Optional, Tuple, Union
+
+from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast
+
+from ...image_processing_utils import BatchFeature, get_size_dict
+from ...image_processing_utils_fast import (
+ BaseImageProcessorFast,
+ SizeDict,
+ get_max_height_width,
+)
+from ...image_transforms import center_to_corners_format
+from ...image_utils import (
+ IMAGENET_DEFAULT_MEAN,
+ IMAGENET_DEFAULT_STD,
+ AnnotationFormat,
+ AnnotationType,
+ ChannelDimension,
+ ImageInput,
+ ImageType,
+ PILImageResampling,
+ get_image_size,
+ get_image_type,
+ infer_channel_dimension_format,
+ make_list_of_images,
+ validate_annotations,
+)
+from ...utils import (
+ TensorType,
+ filter_out_non_signature_kwargs,
+ is_torch_available,
+ is_torchvision_available,
+ is_torchvision_v2_available,
+ is_vision_available,
+ logging,
+ requires_backends,
+)
+
+
+if is_torch_available():
+ import torch
+
+if is_vision_available():
+ from ...image_utils import pil_torch_interpolation_mapping
+
+
+if is_torchvision_v2_available():
+ from torchvision.transforms.v2 import functional as F
+elif is_torchvision_available():
+ from torchvision.transforms import functional as F
+
+
+logger = logging.get_logger(__name__)
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,)
+
+
+def prepare_coco_detection_annotation(
+ image,
+ target,
+ return_segmentation_masks: bool = False,
+ input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+ """
+ Convert the target in COCO format into the format expected by RT-DETR.
+ """
+ image_height, image_width = image.size()[-2:]
+
+ image_id = target["image_id"]
+ image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device)
+
+ # Get all COCO annotations for the given image.
+ annotations = target["annotations"]
+ classes = []
+ area = []
+ boxes = []
+ keypoints = []
+ for obj in annotations:
+ if "iscrowd" not in obj or obj["iscrowd"] == 0:
+ classes.append(obj["category_id"])
+ area.append(obj["area"])
+ boxes.append(obj["bbox"])
+ if "keypoints" in obj:
+ keypoints.append(obj["keypoints"])
+
+ classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device)
+ area = torch.as_tensor(area, dtype=torch.float32, device=image.device)
+ iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device)
+ # guard against no boxes via resizing
+ boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4)
+ boxes[:, 2:] += boxes[:, :2]
+ boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+ boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+ new_target = {
+ "image_id": image_id,
+ "class_labels": classes[keep],
+ "boxes": boxes[keep],
+ "area": area[keep],
+ "iscrowd": iscrowd[keep],
+ "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device),
+ }
+
+ if keypoints:
+ keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device)
+ # Apply the keep mask here to filter the relevant annotations
+ keypoints = keypoints[keep]
+ num_keypoints = keypoints.shape[0]
+ keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+ new_target["keypoints"] = keypoints
+
+ return new_target
+
+
+class RTDetrImageProcessorFast(DetrImageProcessorFast, BaseImageProcessorFast):
+ r"""
+ Constructs a fast RTDetr image processor.
+
+ Args:
+ format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
+ Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Controls whether to resize the image's `(height, width)` dimensions to the specified `size`. Can be
+ overridden by the `do_resize` parameter in the `preprocess` method.
+ size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
+ Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+ in the `preprocess` method. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+ Resampling filter to use if resizing the image.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+ `do_rescale` parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+ `preprocess` method.
+ do_normalize (`bool`, *optional*, defaults to `False`):
+ Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
+ `preprocess` method.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
+ Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
+ channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
+ Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
+ for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ do_convert_annotations (`bool`, *optional*, defaults to `True`):
+ Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the
+ bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+ Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+ do_pad (`bool`, *optional*, defaults to `False`):
+ Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+ method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+ If `pad_size` is provided, the image will be padded to the specified dimensions.
+ Otherwise, the image will be padded to the maximum height and width of the batch.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+
+ def __init__(
+ self,
+ format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
+ do_resize: bool = True,
+ size: Dict[str, int] = None,
+ resample: Union[PILImageResampling, "F.InterpolationMode"] = PILImageResampling.BILINEAR,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = False,
+ image_mean: Union[float, List[float]] = None,
+ image_std: Union[float, List[float]] = None,
+ do_convert_annotations: bool = True,
+ do_pad: bool = False,
+ pad_size: Optional[Dict[str, int]] = None,
+ **kwargs,
+ ) -> None:
+ size = size if size is not None else {"height": 640, "width": 640}
+ size = get_size_dict(size, default_to_square=False)
+
+ if do_convert_annotations is None:
+ do_convert_annotations = do_normalize
+
+ BaseImageProcessorFast.__init__(**kwargs)
+ self.format = format
+ self.do_resize = do_resize
+ self.size = size
+ self.resample = resample
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_normalize = do_normalize
+ self.do_convert_annotations = do_convert_annotations
+ self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+ self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+ self.do_pad = do_pad
+ self.pad_size = pad_size
+
+ def prepare_annotation(
+ self,
+ image: torch.Tensor,
+ target: Dict,
+ format: Optional[AnnotationFormat] = None,
+ return_segmentation_masks: bool = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> Dict:
+ format = format if format is not None else self.format
+
+ if format == AnnotationFormat.COCO_DETECTION:
+ return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
+ target = prepare_coco_detection_annotation(
+ image, target, return_segmentation_masks, input_data_format=input_data_format
+ )
+ else:
+ raise ValueError(f"Format {format} is not supported.")
+ return target
+
+ @filter_out_non_signature_kwargs(extra=["device"])
+ def preprocess(
+ self,
+ images: ImageInput,
+ annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+ return_segmentation_masks: bool = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ do_resize: Optional[bool] = None,
+ size: Optional[Dict[str, int]] = None,
+ resample: Optional[Union[PILImageResampling, "F.InterpolationMode"]] = None,
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[Union[int, float]] = None,
+ do_normalize: Optional[bool] = None,
+ do_convert_annotations: Optional[bool] = None,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ do_pad: Optional[bool] = None,
+ format: Optional[Union[str, AnnotationFormat]] = None,
+ return_tensors: Optional[Union[TensorType, str]] = None,
+ data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ pad_size: Optional[Dict[str, int]] = None,
+ **kwargs,
+ ) -> BatchFeature:
+ """
+ Preprocess an image or a batch of images so that it can be used by the model.
+
+ Args:
+ images (`ImageInput`):
+ Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
+ from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+ annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
+ List of annotations associated with the image or batch of images. If annotation is for object
+ detection, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
+ dictionary. An image can have no annotations, in which case the list should be empty.
+ If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
+ An image can have no segments, in which case the list should be empty.
+ - "file_name" (`str`): The file name of the image.
+ return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks):
+ Whether to return segmentation masks.
+ masks_path (`str` or `pathlib.Path`, *optional*):
+ Path to the directory containing the segmentation masks.
+ do_resize (`bool`, *optional*, defaults to self.do_resize):
+ Whether to resize the image.
+ size (`Dict[str, int]`, *optional*, defaults to self.size):
+ Size of the image's `(height, width)` dimensions after resizing. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to self.resample):
+ Resampling filter to use when resizing the image.
+ do_rescale (`bool`, *optional*, defaults to self.do_rescale):
+ Whether to rescale the image.
+ rescale_factor (`float`, *optional*, defaults to self.rescale_factor):
+ Rescale factor to use when rescaling the image.
+ do_normalize (`bool`, *optional*, defaults to self.do_normalize):
+ Whether to normalize the image.
+ do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+ Whether to convert the annotations to the format expected by the model. Converts the bounding
+ boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+ and in relative coordinates.
+ image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
+ Mean to use when normalizing the image.
+ image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
+ Standard deviation to use when normalizing the image.
+ do_pad (`bool`, *optional*, defaults to self.do_pad):
+ Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+ the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+ dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
+ format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
+ Format of the annotations.
+ return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
+ Type of tensors to return. If `None`, will return the list of images.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+ do_resize = self.do_resize if do_resize is None else do_resize
+ size = self.size if size is None else size
+ size = get_size_dict(size=size, default_to_square=True)
+ resample = self.resample if resample is None else resample
+ do_rescale = self.do_rescale if do_rescale is None else do_rescale
+ rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
+ do_normalize = self.do_normalize if do_normalize is None else do_normalize
+ image_mean = self.image_mean if image_mean is None else image_mean
+ image_std = self.image_std if image_std is None else image_std
+ do_convert_annotations = (
+ self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
+ )
+ do_pad = self.do_pad if do_pad is None else do_pad
+ pad_size = self.pad_size if pad_size is None else pad_size
+ format = self.format if format is None else format
+ return_tensors = "pt" if return_tensors is None else return_tensors
+ device = kwargs.pop("device", None)
+
+ # Make hashable for cache
+ size = SizeDict(**size)
+ image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean
+ image_std = tuple(image_std) if isinstance(image_std, list) else image_std
+
+ images = make_list_of_images(images)
+ image_type = get_image_type(images[0])
+
+ if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
+ raise ValueError(f"Unsupported input image type {image_type}")
+
+ self._validate_input_arguments(
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ return_tensors=return_tensors,
+ data_format=data_format,
+ )
+
+ if annotations is not None and isinstance(annotations, dict):
+ annotations = [annotations]
+
+ if annotations is not None and len(images) != len(annotations):
+ raise ValueError(
+ f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+ )
+
+ format = AnnotationFormat(format)
+ if annotations is not None:
+ validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+ data = {}
+ if image_type == ImageType.PIL:
+ images = [F.pil_to_tensor(image) for image in images]
+ elif image_type == ImageType.NUMPY:
+ # not using F.to_tensor as it doesn't handle (C, H, W) numpy arrays
+ images = [torch.from_numpy(image).contiguous() for image in images]
+
+ if device is not None:
+ images = [image.to(device) for image in images]
+
+ # We assume that all images have the same channel dimension format.
+ if input_data_format is None:
+ input_data_format = infer_channel_dimension_format(images[0])
+ if input_data_format == ChannelDimension.LAST:
+ images = [image.permute(2, 0, 1).contiguous() for image in images]
+ input_data_format = ChannelDimension.FIRST
+
+ if do_rescale and do_normalize:
+ # fused rescale and normalize
+ new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor)
+ new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor)
+
+ processed_images = []
+ processed_annotations = []
+ pixel_masks = [] # Initialize pixel_masks here
+ for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+ # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+ if annotations is not None:
+ annotation = self.prepare_annotation(
+ image,
+ annotation,
+ format,
+ return_segmentation_masks=return_segmentation_masks,
+ masks_path=masks_path,
+ input_data_format=input_data_format,
+ )
+
+ if do_resize:
+ interpolation = (
+ pil_torch_interpolation_mapping[resample]
+ if isinstance(resample, (PILImageResampling, int))
+ else resample
+ )
+ resized_image = self.resize(image, size=size, interpolation=interpolation)
+ if annotations is not None:
+ annotation = self.resize_annotation(
+ annotation,
+ orig_size=image.size()[-2:],
+ target_size=resized_image.size()[-2:],
+ )
+ image = resized_image
+
+ if do_rescale and do_normalize:
+ # fused rescale and normalize
+ image = F.normalize(image.to(dtype=torch.float32), new_mean, new_std)
+ elif do_rescale:
+ image = image * rescale_factor
+ elif do_normalize:
+ image = F.normalize(image, image_mean, image_std)
+
+ if do_convert_annotations and annotations is not None:
+ annotation = self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+
+ processed_images.append(image)
+ processed_annotations.append(annotation)
+ images = processed_images
+ annotations = processed_annotations if annotations is not None else None
+
+ if do_pad:
+ # depends on all resized image shapes so we need another loop
+ if pad_size is not None:
+ padded_size = (pad_size["height"], pad_size["width"])
+ else:
+ padded_size = get_max_height_width(images)
+
+ padded_images = []
+ padded_annotations = []
+ for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+ # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+ if padded_size == image.size()[-2:]:
+ padded_images.append(image)
+ pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
+ padded_annotations.append(annotation)
+ continue
+ image, pixel_mask, annotation = self.pad(
+ image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
+ )
+ padded_images.append(image)
+ padded_annotations.append(annotation)
+ pixel_masks.append(pixel_mask)
+ images = padded_images
+ annotations = padded_annotations if annotations is not None else None
+ data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})
+
+ data.update({"pixel_values": torch.stack(images, dim=0)})
+ encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
+ if annotations is not None:
+ encoded_inputs["labels"] = [
+ BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+ ]
+ return encoded_inputs
+
+ def post_process_object_detection(
+ self,
+ outputs,
+ threshold: float = 0.5,
+ target_sizes: Union[TensorType, List[Tuple]] = None,
+ use_focal_loss: bool = True,
+ ):
+ """
+ Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+ bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+ Args:
+ outputs ([`DetrObjectDetectionOutput`]):
+ Raw outputs of the model.
+ threshold (`float`, *optional*, defaults to 0.5):
+ Score threshold to keep object detection predictions.
+ target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+ Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+ `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+ use_focal_loss (`bool` defaults to `True`):
+ Variable informing if the focal loss was used to predict the outputs. If `True`, a sigmoid is applied
+ to compute the scores of each detection, otherwise, a softmax function is used.
+
+ Returns:
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+ in the batch as predicted by the model.
+ """
+ requires_backends(self, ["torch"])
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+ # convert from relative cxcywh to absolute xyxy
+ boxes = center_to_corners_format(out_bbox)
+ if target_sizes is not None:
+ if len(out_logits) != len(target_sizes):
+ raise ValueError(
+ "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+ )
+ if isinstance(target_sizes, List):
+ img_h, img_w = torch.as_tensor(target_sizes).unbind(1)
+ else:
+ img_h, img_w = target_sizes.unbind(1)
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+ boxes = boxes * scale_fct[:, None, :]
+
+ num_top_queries = out_logits.shape[1]
+ num_classes = out_logits.shape[2]
+
+ if use_focal_loss:
+ scores = torch.nn.functional.sigmoid(out_logits)
+ scores, index = torch.topk(scores.flatten(1), num_top_queries, axis=-1)
+ labels = index % num_classes
+ index = index // num_classes
+ boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))
+ else:
+ scores = torch.nn.functional.softmax(out_logits)[:, :, :-1]
+ scores, labels = scores.max(dim=-1)
+ if scores.shape[1] > num_top_queries:
+ scores, index = torch.topk(scores, num_top_queries, dim=-1)
+ labels = torch.gather(labels, dim=1, index=index)
+ boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))
+
+ results = []
+ for score, label, box in zip(scores, labels, boxes):
+ results.append(
+ {
+ "scores": score[score > threshold],
+ "labels": label[score > threshold],
+ "boxes": box[score > threshold],
+ }
+ )
+
+ return results
+
+ def from_dict():
+ raise NotImplementedError("No need to override this method for RT-DETR yet.")
+
+ def post_process():
+ raise NotImplementedError("Post-processing is not implemented for RT-DETR yet.")
+
+ def post_process_segmentation():
+ raise NotImplementedError("Segmentation post-processing is not implemented for RT-DETR yet.")
+
+ def post_process_instance():
+ raise NotImplementedError("Instance post-processing is not implemented for RT-DETR yet.")
+
+ def post_process_panoptic():
+ raise NotImplementedError("Panoptic post-processing is not implemented for RT-DETR yet.")
+
+ def post_process_instance_segmentation():
+ raise NotImplementedError("Segmentation post-processing is not implemented for RT-DETR yet.")
+
+ def post_process_semantic_segmentation():
+ raise NotImplementedError("Semantic segmentation post-processing is not implemented for RT-DETR yet.")
+
+ def post_process_panoptic_segmentation():
+ raise NotImplementedError("Panoptic segmentation post-processing is not implemented for RT-DETR yet.")
+
+
+__all__ = ["RTDetrImageProcessorFast"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/swin/__init__.py b/.venv/lib/python3.11/site-packages/transformers/models/swin/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dc5871b0375e8a5553e30dce1e5dde807a1b493
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/swin/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_swin import *
+ from .modeling_swin import *
+ from .modeling_tf_swin import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/swin/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/swin/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7de1c6cead3d0d99680a0e2b22567a9553b9162e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/swin/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/swin/__pycache__/configuration_swin.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/swin/__pycache__/configuration_swin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e22832b1b3e0133e2e6bfffbbae2205ee755163f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/swin/__pycache__/configuration_swin.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/swin/__pycache__/modeling_swin.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/swin/__pycache__/modeling_swin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9aeaa53eef965c73364aa11ab6f62450ddc5055f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/swin/__pycache__/modeling_swin.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/swin/__pycache__/modeling_tf_swin.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/swin/__pycache__/modeling_tf_swin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34e61c3bd0e536e6b621c2abc408b41cbeecf193
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/swin/__pycache__/modeling_tf_swin.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/swin/configuration_swin.py b/.venv/lib/python3.11/site-packages/transformers/models/swin/configuration_swin.py
new file mode 100644
index 0000000000000000000000000000000000000000..da6ba9871407a644b99435e69c6ec46e26fd8735
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/swin/configuration_swin.py
@@ -0,0 +1,179 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Swin Transformer model configuration"""
+
+from collections import OrderedDict
+from typing import Mapping
+
+from packaging import version
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class SwinConfig(BackboneConfigMixin, PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`SwinModel`]. It is used to instantiate a Swin
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the Swin
+ [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224)
+ architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 4):
+ The size (resolution) of each patch.
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ embed_dim (`int`, *optional*, defaults to 96):
+ Dimensionality of patch embedding.
+ depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`):
+ Depth of each layer in the Transformer encoder.
+ num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`):
+ Number of attention heads in each layer of the Transformer encoder.
+ window_size (`int`, *optional*, defaults to 7):
+ Size of windows.
+ mlp_ratio (`float`, *optional*, defaults to 4.0):
+ Ratio of MLP hidden dimensionality to embedding dimensionality.
+ qkv_bias (`bool`, *optional*, defaults to `True`):
+ Whether or not a learnable bias should be added to the queries, keys and values.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings and encoder.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ drop_path_rate (`float`, *optional*, defaults to 0.1):
+ Stochastic depth rate.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
+ `"selu"` and `"gelu_new"` are supported.
+ use_absolute_embeddings (`bool`, *optional*, defaults to `False`):
+ Whether or not to add absolute position embeddings to the patch embeddings.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+ The epsilon used by the layer normalization layers.
+ encoder_stride (`int`, *optional*, defaults to 32):
+ Factor to increase the spatial resolution by in the decoder head for masked image modeling.
+ out_features (`List[str]`, *optional*):
+ If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+ (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+ corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+ same order as defined in the `stage_names` attribute.
+ out_indices (`List[int]`, *optional*):
+ If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+ many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+ If unset and `out_features` is unset, will default to the last stage. Must be in the
+ same order as defined in the `stage_names` attribute.
+
+ Example:
+
+ ```python
+ >>> from transformers import SwinConfig, SwinModel
+
+ >>> # Initializing a Swin microsoft/swin-tiny-patch4-window7-224 style configuration
+ >>> configuration = SwinConfig()
+
+ >>> # Initializing a model (with random weights) from the microsoft/swin-tiny-patch4-window7-224 style configuration
+ >>> model = SwinModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "swin"
+
+ attribute_map = {
+ "num_attention_heads": "num_heads",
+ "num_hidden_layers": "num_layers",
+ }
+
+ def __init__(
+ self,
+ image_size=224,
+ patch_size=4,
+ num_channels=3,
+ embed_dim=96,
+ depths=[2, 2, 6, 2],
+ num_heads=[3, 6, 12, 24],
+ window_size=7,
+ mlp_ratio=4.0,
+ qkv_bias=True,
+ hidden_dropout_prob=0.0,
+ attention_probs_dropout_prob=0.0,
+ drop_path_rate=0.1,
+ hidden_act="gelu",
+ use_absolute_embeddings=False,
+ initializer_range=0.02,
+ layer_norm_eps=1e-5,
+ encoder_stride=32,
+ out_features=None,
+ out_indices=None,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.embed_dim = embed_dim
+ self.depths = depths
+ self.num_layers = len(depths)
+ self.num_heads = num_heads
+ self.window_size = window_size
+ self.mlp_ratio = mlp_ratio
+ self.qkv_bias = qkv_bias
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.drop_path_rate = drop_path_rate
+ self.hidden_act = hidden_act
+ self.use_absolute_embeddings = use_absolute_embeddings
+ self.layer_norm_eps = layer_norm_eps
+ self.initializer_range = initializer_range
+ self.encoder_stride = encoder_stride
+ # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
+ # this indicates the channel dimension after the last stage of the model
+ self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
+ self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+ self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+ out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+ )
+
+
+class SwinOnnxConfig(OnnxConfig):
+ torch_onnx_minimum_version = version.parse("1.11")
+
+ @property
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
+ return OrderedDict(
+ [
+ ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+ ]
+ )
+
+ @property
+ def atol_for_validation(self) -> float:
+ return 1e-4
+
+
+__all__ = ["SwinConfig", "SwinOnnxConfig"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/swin/modeling_swin.py b/.venv/lib/python3.11/site-packages/transformers/models/swin/modeling_swin.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4262491366a0e111ca59f4fc86fde7f4c624fe3
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/swin/modeling_swin.py
@@ -0,0 +1,1425 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Swin Transformer model."""
+
+import collections.abc
+import math
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BackboneOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
+from ...utils import (
+ ModelOutput,
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+ torch_int,
+)
+from ...utils.backbone_utils import BackboneMixin
+from .configuration_swin import SwinConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "SwinConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"
+_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "microsoft/swin-tiny-patch4-window7-224"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
+
+
+# drop_path, SwinPatchEmbeddings, SwinPatchMerging and SwinDropPath are from the timm library.
+
+
+@dataclass
+class SwinEncoderOutput(ModelOutput):
+ """
+ Swin encoder's outputs, with potential hidden states and attentions.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, hidden_size, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+ include the spatial dimensions.
+ """
+
+ last_hidden_state: torch.FloatTensor = None
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+ reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+class SwinModelOutput(ModelOutput):
+ """
+ Swin model's outputs that also contains a pooling of the last hidden states.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
+ Average pooling of the last layer hidden-state.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, hidden_size, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+ include the spatial dimensions.
+ """
+
+ last_hidden_state: torch.FloatTensor = None
+ pooler_output: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+ reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+class SwinMaskedImageModelingOutput(ModelOutput):
+ """
+ Swin masked image model outputs.
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
+ Masked image modeling (MLM) loss.
+ reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Reconstructed pixel values.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, hidden_size, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+ include the spatial dimensions.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ reconstruction: torch.FloatTensor = None
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+ reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+
+ @property
+ def logits(self):
+ warnings.warn(
+ "logits attribute is deprecated and will be removed in version 5 of Transformers."
+ " Please use the reconstruction attribute to retrieve the final output instead.",
+ FutureWarning,
+ )
+ return self.reconstruction
+
+
+@dataclass
+class SwinImageClassifierOutput(ModelOutput):
+ """
+ Swin outputs for image classification.
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Classification (or regression if config.num_labels==1) loss.
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, hidden_size, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+ include the spatial dimensions.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+ reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+
+
+def window_partition(input_feature, window_size):
+ """
+ Partitions the given input into windows.
+ """
+ batch_size, height, width, num_channels = input_feature.shape
+ input_feature = input_feature.view(
+ batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
+ )
+ windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
+ return windows
+
+
+def window_reverse(windows, window_size, height, width):
+ """
+ Merges windows to produce higher resolution features.
+ """
+ num_channels = windows.shape[-1]
+ windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
+ windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
+ return windows
+
+
+class SwinEmbeddings(nn.Module):
+ """
+ Construct the patch and position embeddings. Optionally, also the mask token.
+ """
+
+ def __init__(self, config, use_mask_token=False):
+ super().__init__()
+
+ self.patch_embeddings = SwinPatchEmbeddings(config)
+ num_patches = self.patch_embeddings.num_patches
+ self.patch_grid = self.patch_embeddings.grid_size
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None
+
+ if config.use_absolute_embeddings:
+ self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))
+ else:
+ self.position_embeddings = None
+
+ self.norm = nn.LayerNorm(config.embed_dim)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.patch_size = config.patch_size
+ self.config = config
+
+ # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+ """
+ This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+ images. This method is also adapted to support torch.jit tracing.
+
+ Adapted from:
+ - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+ - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+ """
+
+ num_patches = embeddings.shape[1] - 1
+ num_positions = self.position_embeddings.shape[1] - 1
+
+ # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+ if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+ return self.position_embeddings
+
+ class_pos_embed = self.position_embeddings[:, :1]
+ patch_pos_embed = self.position_embeddings[:, 1:]
+
+ dim = embeddings.shape[-1]
+
+ new_height = height // self.patch_size
+ new_width = width // self.patch_size
+
+ sqrt_num_positions = torch_int(num_positions**0.5)
+ patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed,
+ size=(new_height, new_width),
+ mode="bicubic",
+ align_corners=False,
+ )
+
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+ return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor],
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
+ interpolate_pos_encoding: bool = False,
+ ) -> Tuple[torch.Tensor]:
+ _, num_channels, height, width = pixel_values.shape
+ embeddings, output_dimensions = self.patch_embeddings(pixel_values)
+ embeddings = self.norm(embeddings)
+ batch_size, seq_len, _ = embeddings.size()
+
+ if bool_masked_pos is not None:
+ mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
+ # replace the masked visual tokens by mask_tokens
+ mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+ embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+ if self.position_embeddings is not None:
+ if interpolate_pos_encoding:
+ embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+ else:
+ embeddings = embeddings + self.position_embeddings
+
+ embeddings = self.dropout(embeddings)
+
+ return embeddings, output_dimensions
+
+
+class SwinPatchEmbeddings(nn.Module):
+ """
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+ Transformer.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ image_size, patch_size = config.image_size, config.patch_size
+ num_channels, hidden_size = config.num_channels, config.embed_dim
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.num_patches = num_patches
+ self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
+
+ self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+ def maybe_pad(self, pixel_values, height, width):
+ if width % self.patch_size[1] != 0:
+ pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
+ pixel_values = nn.functional.pad(pixel_values, pad_values)
+ if height % self.patch_size[0] != 0:
+ pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
+ pixel_values = nn.functional.pad(pixel_values, pad_values)
+ return pixel_values
+
+ def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
+ _, num_channels, height, width = pixel_values.shape
+ # pad the input to be divisible by self.patch_size, if needed
+ pixel_values = self.maybe_pad(pixel_values, height, width)
+ embeddings = self.projection(pixel_values)
+ _, _, height, width = embeddings.shape
+ output_dimensions = (height, width)
+ embeddings = embeddings.flatten(2).transpose(1, 2)
+
+ return embeddings, output_dimensions
+
+
+class SwinPatchMerging(nn.Module):
+ """
+ Patch Merging Layer.
+
+ Args:
+ input_resolution (`Tuple[int]`):
+ Resolution of input feature.
+ dim (`int`):
+ Number of input channels.
+ norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
+ Normalization layer class.
+ """
+
+ def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
+ super().__init__()
+ self.input_resolution = input_resolution
+ self.dim = dim
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+ self.norm = norm_layer(4 * dim)
+
+ def maybe_pad(self, input_feature, height, width):
+ should_pad = (height % 2 == 1) or (width % 2 == 1)
+ if should_pad:
+ pad_values = (0, 0, 0, width % 2, 0, height % 2)
+ input_feature = nn.functional.pad(input_feature, pad_values)
+
+ return input_feature
+
+ def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
+ height, width = input_dimensions
+ # `dim` is height * width
+ batch_size, dim, num_channels = input_feature.shape
+
+ input_feature = input_feature.view(batch_size, height, width, num_channels)
+ # pad input to be disible by width and height, if needed
+ input_feature = self.maybe_pad(input_feature, height, width)
+ # [batch_size, height/2, width/2, num_channels]
+ input_feature_0 = input_feature[:, 0::2, 0::2, :]
+ # [batch_size, height/2, width/2, num_channels]
+ input_feature_1 = input_feature[:, 1::2, 0::2, :]
+ # [batch_size, height/2, width/2, num_channels]
+ input_feature_2 = input_feature[:, 0::2, 1::2, :]
+ # [batch_size, height/2, width/2, num_channels]
+ input_feature_3 = input_feature[:, 1::2, 1::2, :]
+ # batch_size height/2 width/2 4*num_channels
+ input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
+ input_feature = input_feature.view(batch_size, -1, 4 * num_channels) # batch_size height/2*width/2 4*C
+
+ input_feature = self.norm(input_feature)
+ input_feature = self.reduction(input_feature)
+
+ return input_feature
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+ """
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+ argument.
+ """
+ if drop_prob == 0.0 or not training:
+ return input
+ keep_prob = 1 - drop_prob
+ shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+ random_tensor.floor_() # binarize
+ output = input.div(keep_prob) * random_tensor
+ return output
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Swin
+class SwinDropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
+ super().__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ return drop_path(hidden_states, self.drop_prob, self.training)
+
+ def extra_repr(self) -> str:
+ return "p={}".format(self.drop_prob)
+
+
+class SwinSelfAttention(nn.Module):
+ def __init__(self, config, dim, num_heads, window_size):
+ super().__init__()
+ if dim % num_heads != 0:
+ raise ValueError(
+ f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
+ )
+
+ self.num_attention_heads = num_heads
+ self.attention_head_size = int(dim / num_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+ self.window_size = (
+ window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
+ )
+
+ self.relative_position_bias_table = nn.Parameter(
+ torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
+ )
+
+ # get pair-wise relative position index for each token inside the window
+ coords_h = torch.arange(self.window_size[0])
+ coords_w = torch.arange(self.window_size[1])
+ coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
+ coords_flatten = torch.flatten(coords, 1)
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+ relative_coords[:, :, 0] += self.window_size[0] - 1
+ relative_coords[:, :, 1] += self.window_size[1] - 1
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+ relative_position_index = relative_coords.sum(-1)
+ self.register_buffer("relative_position_index", relative_position_index)
+
+ self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+ self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+ self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ def transpose_for_scores(self, x):
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.Tensor]:
+ batch_size, dim, num_channels = hidden_states.shape
+ mixed_query_layer = self.query(hidden_states)
+
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
+ relative_position_bias = relative_position_bias.view(
+ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
+ )
+
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
+ attention_scores = attention_scores + relative_position_bias.unsqueeze(0)
+
+ if attention_mask is not None:
+ # Apply the attention mask is (precomputed for all layers in SwinModel forward() function)
+ mask_shape = attention_mask.shape[0]
+ attention_scores = attention_scores.view(
+ batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
+ )
+ attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
+ attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ return outputs
+
+
+class SwinSelfOutput(nn.Module):
+ def __init__(self, config, dim):
+ super().__init__()
+ self.dense = nn.Linear(dim, dim)
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ return hidden_states
+
+
+class SwinAttention(nn.Module):
+ def __init__(self, config, dim, num_heads, window_size):
+ super().__init__()
+ self.self = SwinSelfAttention(config, dim, num_heads, window_size)
+ self.output = SwinSelfOutput(config, dim)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.self.query = prune_linear_layer(self.self.query, index)
+ self.self.key = prune_linear_layer(self.self.key, index)
+ self.self.value = prune_linear_layer(self.self.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.Tensor]:
+ self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
+ attention_output = self.output(self_outputs[0], hidden_states)
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+class SwinIntermediate(nn.Module):
+ def __init__(self, config, dim):
+ super().__init__()
+ self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class SwinOutput(nn.Module):
+ def __init__(self, config, dim):
+ super().__init__()
+ self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ return hidden_states
+
+
+class SwinLayer(nn.Module):
+ def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0):
+ super().__init__()
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.shift_size = shift_size
+ self.window_size = config.window_size
+ self.input_resolution = input_resolution
+ self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+ self.attention = SwinAttention(config, dim, num_heads, window_size=self.window_size)
+ self.drop_path = SwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+ self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+ self.intermediate = SwinIntermediate(config, dim)
+ self.output = SwinOutput(config, dim)
+
+ def set_shift_and_window_size(self, input_resolution):
+ if min(input_resolution) <= self.window_size:
+ # if window size is larger than input resolution, we don't partition windows
+ self.shift_size = torch_int(0)
+ self.window_size = (
+ torch.min(torch.tensor(input_resolution)) if torch.jit.is_tracing() else min(input_resolution)
+ )
+
+ def get_attn_mask(self, height, width, dtype, device):
+ if self.shift_size > 0:
+ # calculate attention mask for SW-MSA
+ img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device=device)
+ height_slices = (
+ slice(0, -self.window_size),
+ slice(-self.window_size, -self.shift_size),
+ slice(-self.shift_size, None),
+ )
+ width_slices = (
+ slice(0, -self.window_size),
+ slice(-self.window_size, -self.shift_size),
+ slice(-self.shift_size, None),
+ )
+ count = 0
+ for height_slice in height_slices:
+ for width_slice in width_slices:
+ img_mask[:, height_slice, width_slice, :] = count
+ count += 1
+
+ mask_windows = window_partition(img_mask, self.window_size)
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+ else:
+ attn_mask = None
+ return attn_mask
+
+ def maybe_pad(self, hidden_states, height, width):
+ pad_right = (self.window_size - width % self.window_size) % self.window_size
+ pad_bottom = (self.window_size - height % self.window_size) % self.window_size
+ pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
+ hidden_states = nn.functional.pad(hidden_states, pad_values)
+ return hidden_states, pad_values
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ input_dimensions: Tuple[int, int],
+ head_mask: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = False,
+ always_partition: Optional[bool] = False,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ if not always_partition:
+ self.set_shift_and_window_size(input_dimensions)
+ else:
+ pass
+ height, width = input_dimensions
+ batch_size, _, channels = hidden_states.size()
+ shortcut = hidden_states
+
+ hidden_states = self.layernorm_before(hidden_states)
+
+ hidden_states = hidden_states.view(batch_size, height, width, channels)
+
+ # pad hidden_states to multiples of window size
+ hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
+
+ _, height_pad, width_pad, _ = hidden_states.shape
+ # cyclic shift
+ if self.shift_size > 0:
+ shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+ else:
+ shifted_hidden_states = hidden_states
+
+ # partition windows
+ hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
+ hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
+ attn_mask = self.get_attn_mask(
+ height_pad, width_pad, dtype=hidden_states.dtype, device=hidden_states_windows.device
+ )
+
+ attention_outputs = self.attention(
+ hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
+ )
+
+ attention_output = attention_outputs[0]
+
+ attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
+ shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)
+
+ # reverse cyclic shift
+ if self.shift_size > 0:
+ attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+ else:
+ attention_windows = shifted_windows
+
+ was_padded = pad_values[3] > 0 or pad_values[5] > 0
+ if was_padded:
+ attention_windows = attention_windows[:, :height, :width, :].contiguous()
+
+ attention_windows = attention_windows.view(batch_size, height * width, channels)
+
+ hidden_states = shortcut + self.drop_path(attention_windows)
+
+ layer_output = self.layernorm_after(hidden_states)
+ layer_output = self.intermediate(layer_output)
+ layer_output = hidden_states + self.output(layer_output)
+
+ layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
+ return layer_outputs
+
+
+class SwinStage(nn.Module):
+ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
+ super().__init__()
+ self.config = config
+ self.dim = dim
+ self.blocks = nn.ModuleList(
+ [
+ SwinLayer(
+ config=config,
+ dim=dim,
+ input_resolution=input_resolution,
+ num_heads=num_heads,
+ drop_path_rate=drop_path[i],
+ shift_size=0 if (i % 2 == 0) else config.window_size // 2,
+ )
+ for i in range(depth)
+ ]
+ )
+
+ # patch merging layer
+ if downsample is not None:
+ self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
+ else:
+ self.downsample = None
+
+ self.pointing = False
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ input_dimensions: Tuple[int, int],
+ head_mask: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = False,
+ always_partition: Optional[bool] = False,
+ ) -> Tuple[torch.Tensor]:
+ height, width = input_dimensions
+ for i, layer_module in enumerate(self.blocks):
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ layer_outputs = layer_module(
+ hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
+ )
+
+ hidden_states = layer_outputs[0]
+
+ hidden_states_before_downsampling = hidden_states
+ if self.downsample is not None:
+ height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
+ output_dimensions = (height, width, height_downsampled, width_downsampled)
+ hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
+ else:
+ output_dimensions = (height, width, height, width)
+
+ stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)
+
+ if output_attentions:
+ stage_outputs += layer_outputs[1:]
+ return stage_outputs
+
+
+class SwinEncoder(nn.Module):
+ def __init__(self, config, grid_size):
+ super().__init__()
+ self.num_layers = len(config.depths)
+ self.config = config
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+ self.layers = nn.ModuleList(
+ [
+ SwinStage(
+ config=config,
+ dim=int(config.embed_dim * 2**i_layer),
+ input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
+ depth=config.depths[i_layer],
+ num_heads=config.num_heads[i_layer],
+ drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
+ downsample=SwinPatchMerging if (i_layer < self.num_layers - 1) else None,
+ )
+ for i_layer in range(self.num_layers)
+ ]
+ )
+
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ input_dimensions: Tuple[int, int],
+ head_mask: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = False,
+ output_hidden_states: Optional[bool] = False,
+ output_hidden_states_before_downsampling: Optional[bool] = False,
+ always_partition: Optional[bool] = False,
+ return_dict: Optional[bool] = True,
+ ) -> Union[Tuple, SwinEncoderOutput]:
+ all_hidden_states = () if output_hidden_states else None
+ all_reshaped_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ if output_hidden_states:
+ batch_size, _, hidden_size = hidden_states.shape
+ # rearrange b (h w) c -> b c h w
+ reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
+ reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
+ all_hidden_states += (hidden_states,)
+ all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+ for i, layer_module in enumerate(self.layers):
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ layer_module.__call__,
+ hidden_states,
+ input_dimensions,
+ layer_head_mask,
+ output_attentions,
+ always_partition,
+ )
+ else:
+ layer_outputs = layer_module(
+ hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
+ )
+
+ hidden_states = layer_outputs[0]
+ hidden_states_before_downsampling = layer_outputs[1]
+ output_dimensions = layer_outputs[2]
+
+ input_dimensions = (output_dimensions[-2], output_dimensions[-1])
+
+ if output_hidden_states and output_hidden_states_before_downsampling:
+ batch_size, _, hidden_size = hidden_states_before_downsampling.shape
+ # rearrange b (h w) c -> b c h w
+ # here we use the original (not downsampled) height and width
+ reshaped_hidden_state = hidden_states_before_downsampling.view(
+ batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
+ )
+ reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
+ all_hidden_states += (hidden_states_before_downsampling,)
+ all_reshaped_hidden_states += (reshaped_hidden_state,)
+ elif output_hidden_states and not output_hidden_states_before_downsampling:
+ batch_size, _, hidden_size = hidden_states.shape
+ # rearrange b (h w) c -> b c h w
+ reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
+ reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
+ all_hidden_states += (hidden_states,)
+ all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+ if output_attentions:
+ all_self_attentions += layer_outputs[3:]
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+
+ return SwinEncoderOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ reshaped_hidden_states=all_reshaped_hidden_states,
+ )
+
+
+class SwinPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = SwinConfig
+ base_model_prefix = "swin"
+ main_input_name = "pixel_values"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["SwinStage"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+SWIN_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`SwinConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+SWIN_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
+ for details.
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Swin Model transformer outputting raw hidden-states without any specific head on top.",
+ SWIN_START_DOCSTRING,
+ """
+ add_pooling_layer (`bool`, *optional*, defaults to `True`):
+ Whether or not to apply pooling layer.
+ use_mask_token (`bool`, *optional*, defaults to `False`):
+ Whether or not to create and apply mask tokens in the embedding layer.
+ """,
+)
+class SwinModel(SwinPreTrainedModel):
+ def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
+ super().__init__(config)
+ self.config = config
+ self.num_layers = len(config.depths)
+ self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))
+
+ self.embeddings = SwinEmbeddings(config, use_mask_token=use_mask_token)
+ self.encoder = SwinEncoder(config, self.embeddings.patch_grid)
+
+ self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
+ self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.patch_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=SwinModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SwinModelOutput]:
+ r"""
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, len(self.config.depths))
+
+ embedding_output, input_dimensions = self.embeddings(
+ pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
+ )
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ input_dimensions,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = encoder_outputs[0]
+ sequence_output = self.layernorm(sequence_output)
+
+ pooled_output = None
+ if self.pooler is not None:
+ pooled_output = self.pooler(sequence_output.transpose(1, 2))
+ pooled_output = torch.flatten(pooled_output, 1)
+
+ if not return_dict:
+ output = (sequence_output, pooled_output) + encoder_outputs[1:]
+
+ return output
+
+ return SwinModelOutput(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
+ )
+
+
+@add_start_docstrings(
+ """Swin Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).
+
+
+
+ Note that we provide a script to pre-train this model on custom data in our [examples
+ directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).
+
+
+ """,
+ SWIN_START_DOCSTRING,
+)
+class SwinForMaskedImageModeling(SwinPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.swin = SwinModel(config, add_pooling_layer=False, use_mask_token=True)
+
+ num_features = int(config.embed_dim * 2 ** (config.num_layers - 1))
+ self.decoder = nn.Sequential(
+ nn.Conv2d(
+ in_channels=num_features, out_channels=config.encoder_stride**2 * config.num_channels, kernel_size=1
+ ),
+ nn.PixelShuffle(config.encoder_stride),
+ )
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=SwinMaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SwinMaskedImageModelingOutput]:
+ r"""
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+
+ Returns:
+
+ Examples:
+ ```python
+ >>> from transformers import AutoImageProcessor, SwinForMaskedImageModeling
+ >>> import torch
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-base-simmim-window6-192")
+ >>> model = SwinForMaskedImageModeling.from_pretrained("microsoft/swin-base-simmim-window6-192")
+
+ >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
+ >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
+ >>> # create random boolean mask of shape (batch_size, num_patches)
+ >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
+
+ >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
+ >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
+ >>> list(reconstructed_pixel_values.shape)
+ [1, 3, 192, 192]
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.swin(
+ pixel_values,
+ bool_masked_pos=bool_masked_pos,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+ # Reshape to (batch_size, num_channels, height, width)
+ sequence_output = sequence_output.transpose(1, 2)
+ batch_size, num_channels, sequence_length = sequence_output.shape
+ height = width = math.floor(sequence_length**0.5)
+ sequence_output = sequence_output.reshape(batch_size, num_channels, height, width)
+
+ # Reconstruct pixel values
+ reconstructed_pixel_values = self.decoder(sequence_output)
+
+ masked_im_loss = None
+ if bool_masked_pos is not None:
+ size = self.config.image_size // self.config.patch_size
+ bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
+ mask = (
+ bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
+ .repeat_interleave(self.config.patch_size, 2)
+ .unsqueeze(1)
+ .contiguous()
+ )
+ reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
+ masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels
+
+ if not return_dict:
+ output = (reconstructed_pixel_values,) + outputs[2:]
+ return ((masked_im_loss,) + output) if masked_im_loss is not None else output
+
+ return SwinMaskedImageModelingOutput(
+ loss=masked_im_loss,
+ reconstruction=reconstructed_pixel_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ reshaped_hidden_states=outputs.reshaped_hidden_states,
+ )
+
+
+@add_start_docstrings(
+ """
+ Swin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
+ the [CLS] token) e.g. for ImageNet.
+
+
+
+ Note that it's possible to fine-tune Swin on higher resolution images than the ones it has been trained on, by
+ setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
+ position embeddings to the higher resolution.
+
+
+ """,
+ SWIN_START_DOCSTRING,
+)
+class SwinForImageClassification(SwinPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.swin = SwinModel(config)
+
+ # Classifier head
+ self.classifier = (
+ nn.Linear(self.swin.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
+ )
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=SwinImageClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SwinImageClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.swin(
+ pixel_values,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ return_dict=return_dict,
+ )
+
+ pooled_output = outputs[1]
+
+ logits = self.classifier(pooled_output)
+
+ loss = None
+ if labels is not None:
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SwinImageClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ reshaped_hidden_states=outputs.reshaped_hidden_states,
+ )
+
+
+@add_start_docstrings(
+ """
+ Swin backbone, to be used with frameworks like DETR and MaskFormer.
+ """,
+ SWIN_START_DOCSTRING,
+)
+class SwinBackbone(SwinPreTrainedModel, BackboneMixin):
+ def __init__(self, config: SwinConfig):
+ super().__init__(config)
+ super()._init_backbone(config)
+
+ self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
+ self.embeddings = SwinEmbeddings(config)
+ self.encoder = SwinEncoder(config, self.embeddings.patch_grid)
+
+ # Add layer norms to hidden states of out_features
+ hidden_states_norms = {}
+ for stage, num_channels in zip(self._out_features, self.channels):
+ hidden_states_norms[stage] = nn.LayerNorm(num_channels)
+ self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.patch_embeddings
+
+ def forward(
+ self,
+ pixel_values: torch.Tensor,
+ output_hidden_states: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> BackboneOutput:
+ """
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, AutoBackbone
+ >>> import torch
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
+ >>> model = AutoBackbone.from_pretrained(
+ ... "microsoft/swin-tiny-patch4-window7-224", out_features=["stage1", "stage2", "stage3", "stage4"]
+ ... )
+
+ >>> inputs = processor(image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> feature_maps = outputs.feature_maps
+ >>> list(feature_maps[-1].shape)
+ [1, 768, 7, 7]
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+ embedding_output, input_dimensions = self.embeddings(pixel_values)
+
+ outputs = self.encoder(
+ embedding_output,
+ input_dimensions,
+ head_mask=None,
+ output_attentions=output_attentions,
+ output_hidden_states=True,
+ output_hidden_states_before_downsampling=True,
+ always_partition=True,
+ return_dict=True,
+ )
+
+ hidden_states = outputs.reshaped_hidden_states
+
+ feature_maps = ()
+ for stage, hidden_state in zip(self.stage_names, hidden_states):
+ if stage in self.out_features:
+ batch_size, num_channels, height, width = hidden_state.shape
+ hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
+ hidden_state = hidden_state.view(batch_size, height * width, num_channels)
+ hidden_state = self.hidden_states_norms[stage](hidden_state)
+ hidden_state = hidden_state.view(batch_size, height, width, num_channels)
+ hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+ feature_maps += (hidden_state,)
+
+ if not return_dict:
+ output = (feature_maps,)
+ if output_hidden_states:
+ output += (outputs.hidden_states,)
+ return output
+
+ return BackboneOutput(
+ feature_maps=feature_maps,
+ hidden_states=outputs.hidden_states if output_hidden_states else None,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = [
+ "SwinForImageClassification",
+ "SwinForMaskedImageModeling",
+ "SwinModel",
+ "SwinPreTrainedModel",
+ "SwinBackbone",
+]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/swin/modeling_tf_swin.py b/.venv/lib/python3.11/site-packages/transformers/models/swin/modeling_tf_swin.py
new file mode 100644
index 0000000000000000000000000000000000000000..865444f081a64fd137b53367e8c1c1c8f5381e90
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/swin/modeling_tf_swin.py
@@ -0,0 +1,1638 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 Swin Transformer model."""
+
+from __future__ import annotations
+
+import collections.abc
+import math
+import warnings
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+
+import tensorflow as tf
+
+from ...activations_tf import ACT2FN
+from ...modeling_tf_utils import (
+ TFPreTrainedModel,
+ TFSequenceClassificationLoss,
+ get_initializer,
+ keras,
+ keras_serializable,
+ unpack_inputs,
+)
+from ...tf_utils import shape_list
+from ...utils import (
+ ModelOutput,
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_swin import SwinConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "SwinConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"
+_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "microsoft/swin-tiny-patch4-window7-224"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
+
+
+# drop_path, TFSwinPatchEmbeddings, TFSwinPatchMerging and TFSwinDropPath are tensorflow
+# implementations of PyTorch functionalities in the timm library.
+
+
+@dataclass
+class TFSwinEncoderOutput(ModelOutput):
+ """
+ Swin encoder's outputs, with potential hidden states and attentions.
+
+ Args:
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
+ `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
+ `(batch_size, hidden_size, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+ include the spatial dimensions.
+ """
+
+ last_hidden_state: tf.Tensor = None
+ hidden_states: Tuple[tf.Tensor, ...] | None = None
+ attentions: Tuple[tf.Tensor, ...] | None = None
+ reshaped_hidden_states: Tuple[tf.Tensor, ...] | None = None
+
+
+@dataclass
+class TFSwinModelOutput(ModelOutput):
+ """
+ Swin model's outputs that also contains a pooling of the last hidden states.
+
+ Args:
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
+ Average pooling of the last layer hidden-state.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
+ `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
+ `(batch_size, hidden_size, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+ include the spatial dimensions.
+ """
+
+ last_hidden_state: tf.Tensor = None
+ pooler_output: tf.Tensor | None = None
+ hidden_states: Tuple[tf.Tensor, ...] | None = None
+ attentions: Tuple[tf.Tensor, ...] | None = None
+ reshaped_hidden_states: Tuple[tf.Tensor, ...] | None = None
+
+
+@dataclass
+class TFSwinMaskedImageModelingOutput(ModelOutput):
+ """
+ Swin masked image model outputs.
+
+ Args:
+ loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
+ Masked image modeling (MLM) loss.
+ reconstruction (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
+ Reconstructed pixel values.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
+ `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
+ `(batch_size, hidden_size, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+ include the spatial dimensions.
+ """
+
+ loss: tf.Tensor | None = None
+ reconstruction: tf.Tensor = None
+ hidden_states: Tuple[tf.Tensor, ...] | None = None
+ attentions: Tuple[tf.Tensor, ...] | None = None
+ reshaped_hidden_states: Tuple[tf.Tensor, ...] | None = None
+
+ @property
+ def logits(self):
+ warnings.warn(
+ "logits attribute is deprecated and will be removed in version 5 of Transformers."
+ " Please use the reconstruction attribute to retrieve the final output instead.",
+ FutureWarning,
+ )
+ return self.reconstruction
+
+
+@dataclass
+class TFSwinImageClassifierOutput(ModelOutput):
+ """
+ Swin outputs for image classification.
+
+ Args:
+ loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Classification (or regression if config.num_labels==1) loss.
+ logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
+ `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
+ `(batch_size, hidden_size, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+ include the spatial dimensions.
+ """
+
+ loss: tf.Tensor | None = None
+ logits: tf.Tensor = None
+ hidden_states: Tuple[tf.Tensor, ...] | None = None
+ attentions: Tuple[tf.Tensor, ...] | None = None
+ reshaped_hidden_states: Tuple[tf.Tensor, ...] | None = None
+
+
+def window_partition(input_feature: tf.Tensor, window_size: int) -> tf.Tensor:
+ """
+ Partitions the given input into windows.
+ """
+ batch_size, height, width, num_channels = shape_list(input_feature)
+ input_feature = tf.reshape(
+ input_feature,
+ (batch_size, height // window_size, window_size, width // window_size, window_size, num_channels),
+ )
+ windows = tf.transpose(input_feature, (0, 1, 3, 2, 4, 5))
+ windows = tf.reshape(windows, (-1, window_size, window_size, num_channels))
+ return windows
+
+
+def window_reverse(windows: tf.Tensor, window_size: int, height: int, width: int) -> tf.Tensor:
+ """
+ Merges windows to produce higher resolution features.
+ """
+ x = tf.shape(windows)[0]
+ y = tf.cast(height * width / (window_size * window_size), tf.int32)
+ batch_size = tf.math.floordiv(x, y)
+ windows = tf.reshape(
+ windows, (batch_size, height // window_size, width // window_size, window_size, window_size, -1)
+ )
+ windows = tf.transpose(windows, (0, 1, 3, 2, 4, 5))
+ windows = tf.reshape(windows, (batch_size, height, width, -1))
+ return windows
+
+
+def drop_path(
+ input: tf.Tensor, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
+) -> tf.Tensor:
+ """
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ """
+ if drop_prob == 0.0 or not training:
+ return input
+ keep_prob = 1 - drop_prob
+ input_shape = shape_list(input)
+ ndim = len(input_shape)
+ shape = [input_shape[0]] + [1] * (ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = tf.random.uniform(shape)
+ random_tensor = tf.where(random_tensor <= keep_prob, 1.0, 0.0)
+ if keep_prob > 0.0 and scale_by_keep:
+ random_tensor /= keep_prob
+ return input * random_tensor
+
+
+class TFSwinEmbeddings(keras.layers.Layer):
+ """
+ Construct the patch and position embeddings. Optionally, also the mask token.
+ """
+
+ def __init__(self, config: SwinConfig, use_mask_token: bool = False, **kwargs) -> None:
+ super().__init__(**kwargs)
+ self.patch_embeddings = TFSwinPatchEmbeddings(config, name="patch_embeddings")
+ self.num_patches = self.patch_embeddings.num_patches
+ self.patch_grid = self.patch_embeddings.grid_size
+ self.embed_dim = config.embed_dim
+ self.use_mask_token = use_mask_token
+ self.use_absolute_embeddings = config.use_absolute_embeddings
+
+ self.norm = keras.layers.LayerNormalization(name="norm", epsilon=1e-5)
+ self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
+ self.config = config
+
+ def build(self, input_shape: tf.TensorShape) -> None:
+ if self.use_mask_token:
+ self.mask_token = self.add_weight(shape=(1, 1, self.embed_dim), initializer="zeros", name="mask_token")
+ else:
+ self.mask_token = None
+
+ if self.use_absolute_embeddings:
+ self.position_embeddings = self.add_weight(
+ (1, self.num_patches + 1, self.embed_dim), initializer="zeros", name="positional_embeddings"
+ )
+ else:
+ self.position_embeddings = None
+
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "patch_embeddings", None) is not None:
+ with tf.name_scope(self.patch_embeddings.name):
+ self.patch_embeddings.build(None)
+ if getattr(self, "norm", None) is not None:
+ with tf.name_scope(self.norm.name):
+ self.norm.build([None, None, self.config.embed_dim])
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+
+ def call(
+ self, pixel_values: tf.Tensor, bool_masked_pos: bool = None, training: bool = False
+ ) -> Tuple[tf.Tensor, Tuple[int, int]]:
+ embeddings, output_dimensions = self.patch_embeddings(pixel_values, training=training)
+ embeddings = self.norm(embeddings, training=training)
+ batch_size, seq_len, _ = shape_list(embeddings)
+
+ if bool_masked_pos is not None:
+ mask_tokens = tf.repeat(self.mask_token, batch_size, 0)
+ mask_tokens = tf.repeat(mask_tokens, seq_len, 1)
+ # replace the masked visual tokens by mask_tokens
+ mask = tf.expand_dims(bool_masked_pos, -1)
+ mask = tf.cast(mask, mask_tokens.dtype)
+
+ embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+ if self.position_embeddings is not None:
+ embeddings = embeddings + self.position_embeddings
+
+ embeddings = self.dropout(embeddings, training=training)
+
+ return embeddings, output_dimensions
+
+
+class TFSwinPatchEmbeddings(keras.layers.Layer):
+ """
+ Image to Patch Embedding.
+ """
+
+ def __init__(self, config, **kwargs):
+ super().__init__(**kwargs)
+ image_size, patch_size = config.image_size, config.patch_size
+ num_channels, hidden_size = config.num_channels, config.embed_dim
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.num_patches = num_patches
+ self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
+
+ self.projection = keras.layers.Conv2D(
+ filters=hidden_size,
+ kernel_size=self.patch_size,
+ strides=self.patch_size,
+ padding="valid",
+ name="projection",
+ )
+
+ def maybe_pad(self, pixel_values: tf.Tensor, height: int, width: int) -> tf.Tensor:
+ if width % self.patch_size[1] != 0:
+ pad_values = ((0, 0), (0, 0), (0, 0), (0, self.patch_size[1] - width % self.patch_size[1]))
+ pixel_values = tf.pad(pixel_values, pad_values)
+ if height % self.patch_size[0] != 0:
+ pad_values = ((0, 0), (0, 0), (0, self.patch_size[0] - height % self.patch_size[0]), (0, 0))
+ pixel_values = tf.pad(pixel_values, pad_values)
+ return pixel_values
+
+ def call(self, pixel_values: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor, Tuple[int, int]]:
+ _, num_channels, height, width = shape_list(pixel_values)
+ if tf.executing_eagerly() and num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+ )
+ # pad the input to be divisible by self.patch_size, if needed
+ pixel_values = self.maybe_pad(pixel_values, height, width)
+
+ # B,C,H,W -> B,H,W,C
+ pixel_values = tf.transpose(pixel_values, (0, 2, 3, 1))
+
+ embeddings = self.projection(pixel_values, training=training)
+
+ # B,H,W,C -> B,C,H,W
+ embeddings = tf.transpose(embeddings, (0, 3, 1, 2))
+
+ batch_size, channels, height, width = shape_list(embeddings)
+ output_dimensions = (height, width)
+
+ embeddings = tf.reshape(embeddings, (batch_size, channels, -1))
+ embeddings = tf.transpose(embeddings, (0, 2, 1))
+ return embeddings, output_dimensions
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "projection", None) is not None:
+ with tf.name_scope(self.projection.name):
+ self.projection.build([None, None, None, self.num_channels])
+
+
+class TFSwinPatchMerging(keras.layers.Layer):
+ """
+ Patch Merging Layer.
+
+ Args:
+ input_resolution (`Tuple[int]`):
+ Resolution of input feature.
+ dim (`int`):
+ Number of input channels.
+ norm_layer (`keras.layer.Layer`, *optional*, defaults to `keras.layers.LayerNormalization`):
+ Normalization layer class.
+ """
+
+ def __init__(
+ self, input_resolution: Tuple[int, int], dim: int, norm_layer: Optional[Callable] = None, **kwargs
+ ) -> None:
+ super().__init__(**kwargs)
+ self.input_resolution = input_resolution
+ self.dim = dim
+ self.reduction = keras.layers.Dense(2 * dim, use_bias=False, name="reduction")
+ if norm_layer is None:
+ # Use same default epsilon as PyTorch
+ self.norm = keras.layers.LayerNormalization(epsilon=1e-5, name="norm")
+ else:
+ self.norm = norm_layer(name="norm")
+
+ def maybe_pad(self, input_feature: tf.Tensor, height: int, width: int) -> tf.Tensor:
+ should_pad = (height % 2 == 1) or (width % 2 == 1)
+ if should_pad:
+ pad_values = ((0, 0), (0, height % 2), (0, width % 2), (0, 0))
+ input_feature = tf.pad(input_feature, pad_values)
+
+ return input_feature
+
+ def call(self, input_feature: tf.Tensor, input_dimensions: Tuple[int, int], training: bool = False) -> tf.Tensor:
+ height, width = input_dimensions
+ # `dim` is height * width
+ batch_size, _, num_channels = shape_list(input_feature)
+
+ input_feature = tf.reshape(input_feature, (batch_size, height, width, num_channels))
+ # pad input to be disible by width and height, if needed
+ input_feature = self.maybe_pad(input_feature, height, width)
+ # [batch_size, height/2, width/2, num_channels]
+ input_feature_0 = input_feature[:, 0::2, 0::2, :]
+ # [batch_size, height/2, width/2, num_channels]
+ input_feature_1 = input_feature[:, 1::2, 0::2, :]
+ # [batch_size, height/2, width/2, num_channels]
+ input_feature_2 = input_feature[:, 0::2, 1::2, :]
+ # [batch_size, height/2, width/2, num_channels]
+ input_feature_3 = input_feature[:, 1::2, 1::2, :]
+ # batch_size height/2 width/2 4*num_channels
+ input_feature = tf.concat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
+ input_feature = tf.reshape(
+ input_feature, (batch_size, -1, 4 * num_channels)
+ ) # batch_size height/2*width/2 4*C
+
+ input_feature = self.norm(input_feature, training=training)
+ input_feature = self.reduction(input_feature, training=training)
+
+ return input_feature
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "reduction", None) is not None:
+ with tf.name_scope(self.reduction.name):
+ self.reduction.build([None, None, 4 * self.dim])
+ if getattr(self, "norm", None) is not None:
+ with tf.name_scope(self.norm.name):
+ self.norm.build([None, None, 4 * self.dim])
+
+
+class TFSwinDropPath(keras.layers.Layer):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob: float = None, scale_by_keep: bool = True, **kwargs) -> None:
+ super(TFSwinDropPath, self).__init__(**kwargs)
+ self.drop_prob = drop_prob
+ self.scale_by_keep = scale_by_keep
+
+ def call(self, input: tf.Tensor, training: bool = False) -> tf.Tensor:
+ return drop_path(input, self.drop_prob, training, self.scale_by_keep)
+
+
+class TFSwinSelfAttention(keras.layers.Layer):
+ def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> None:
+ super().__init__(**kwargs)
+ if dim % num_heads != 0:
+ raise ValueError(
+ f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
+ )
+
+ self.num_attention_heads = num_heads
+ self.attention_head_size = int(dim / num_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+ window_size = config.window_size
+ self.window_size = (
+ window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
+ )
+
+ self.query = keras.layers.Dense(
+ self.all_head_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=config.qkv_bias,
+ name="query",
+ )
+ self.key = keras.layers.Dense(
+ self.all_head_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=config.qkv_bias,
+ name="key",
+ )
+ self.value = keras.layers.Dense(
+ self.all_head_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=config.qkv_bias,
+ name="value",
+ )
+
+ self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
+
+ def build(self, input_shape: tf.TensorShape) -> None:
+ self.relative_position_bias_table = self.add_weight(
+ shape=(((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1)), self.num_attention_heads),
+ initializer="zeros",
+ name="relative_position_bias_table",
+ )
+ self.relative_position_index = self.add_weight(
+ shape=(self.window_size[0] ** 2, self.window_size[1] ** 2),
+ trainable=False,
+ dtype=tf.int32,
+ name="relative_position_index",
+ )
+
+ # get pair-wise relative position index for each token inside the window
+ coords_h = tf.range(self.window_size[0])
+ coords_w = tf.range(self.window_size[1])
+ coords = tf.stack(tf.meshgrid(coords_h, coords_w, indexing="ij"))
+ coords_flatten = tf.reshape(coords, (shape_list(coords)[0], -1))
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
+ relative_coords = tf.transpose(relative_coords, (1, 2, 0))
+
+ stack_0, stack_1 = tf.unstack(relative_coords, axis=2)
+ stack_0 += self.window_size[0] - 1
+ stack_0 *= 2 * self.window_size[1] - 1
+ stack_1 += self.window_size[1] - 1
+ relative_coords = tf.stack([stack_0, stack_1], axis=2)
+
+ self.relative_position_index.assign(tf.cast(tf.reduce_sum(relative_coords, axis=-1), tf.int32))
+
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "query", None) is not None:
+ with tf.name_scope(self.query.name):
+ self.query.build([None, None, self.all_head_size])
+ if getattr(self, "key", None) is not None:
+ with tf.name_scope(self.key.name):
+ self.key.build([None, None, self.all_head_size])
+ if getattr(self, "value", None) is not None:
+ with tf.name_scope(self.value.name):
+ self.value.build([None, None, self.all_head_size])
+
+ def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor:
+ new_x_shape = shape_list(x)[:-1] + [self.num_attention_heads, self.attention_head_size]
+ x = tf.reshape(x, new_x_shape)
+ return tf.transpose(x, (0, 2, 1, 3))
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ attention_mask: tf.Tensor | None = None,
+ head_mask: tf.Tensor | None = None,
+ output_attentions: bool = False,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor, ...]:
+ batch_size, dim, _ = shape_list(hidden_states)
+ mixed_query_layer = self.query(hidden_states)
+
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = tf.matmul(query_layer, tf.transpose(key_layer, (0, 1, 3, 2)))
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ relative_position_bias = tf.gather(
+ self.relative_position_bias_table, tf.reshape(self.relative_position_index, (-1,))
+ )
+ relative_position_bias = tf.reshape(
+ relative_position_bias,
+ (self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1),
+ )
+
+ relative_position_bias = tf.transpose(relative_position_bias, (2, 0, 1))
+ attention_scores = attention_scores + tf.expand_dims(relative_position_bias, 0)
+
+ if attention_mask is not None:
+ # Apply the attention mask is (precomputed for all layers in SwinModel call() function)
+ mask_shape = shape_list(attention_mask)[0]
+ attention_scores = tf.reshape(
+ attention_scores, (batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim)
+ )
+ attention_mask = tf.expand_dims(attention_mask, 1)
+ attention_mask = tf.expand_dims(attention_mask, 0)
+ attention_scores = attention_scores + attention_mask
+ attention_scores = tf.reshape(attention_scores, (-1, self.num_attention_heads, dim, dim))
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = tf.nn.softmax(attention_scores, axis=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs, training=training)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = tf.matmul(attention_probs, value_layer)
+ context_layer = tf.transpose(context_layer, (0, 2, 1, 3))
+ new_context_layer_shape = shape_list(context_layer)[:-2] + [
+ self.all_head_size,
+ ]
+ context_layer = tf.reshape(context_layer, new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ return outputs
+
+
+class TFSwinSelfOutput(keras.layers.Layer):
+ def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None:
+ super().__init__(**kwargs)
+ self.dense = keras.layers.Dense(dim, name="dense")
+ self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob, name="dropout")
+ self.dim = dim
+
+ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states, training=training)
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.dim])
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+
+
+class TFSwinAttention(keras.layers.Layer):
+ def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> None:
+ super().__init__(**kwargs)
+ self.self = TFSwinSelfAttention(config, dim, num_heads, name="self")
+ self.self_output = TFSwinSelfOutput(config, dim, name="output")
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ """
+ Prunes heads of the model. See base class PreTrainedModel heads: dict of {layer_num: list of heads to prune in
+ this layer}
+ """
+ raise NotImplementedError
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ attention_mask: tf.Tensor | None = None,
+ head_mask: tf.Tensor | None = None,
+ output_attentions: bool = False,
+ training: bool = False,
+ ) -> tf.Tensor:
+ self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions, training=training)
+ attention_output = self.self_output(self_outputs[0], hidden_states, training=training)
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "self", None) is not None:
+ with tf.name_scope(self.self.name):
+ self.self.build(None)
+ if getattr(self, "self_output", None) is not None:
+ with tf.name_scope(self.self_output.name):
+ self.self_output.build(None)
+
+
+class TFSwinIntermediate(keras.layers.Layer):
+ def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None:
+ super().__init__(**kwargs)
+ self.dense = keras.layers.Dense(int(config.mlp_ratio * dim), name="dense")
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+ self.dim = dim
+
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.dim])
+
+
+class TFSwinOutput(keras.layers.Layer):
+ def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None:
+ super().__init__(**kwargs)
+ self.dense = keras.layers.Dense(dim, name="dense")
+ self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, "dropout")
+ self.config = config
+ self.dim = dim
+
+ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states, training=training)
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, int(self.config.mlp_ratio * self.dim)])
+
+
+class TFSwinLayer(keras.layers.Layer):
+ def __init__(
+ self,
+ config,
+ dim,
+ input_resolution: Tuple[int, int],
+ num_heads: int,
+ drop_path_rate: float = 0.0,
+ shift_size: int = 0,
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ min_res = tf.reduce_min(input_resolution)
+ self.window_size = min_res if min_res <= config.window_size else config.window_size
+ self.shift_size = 0 if min_res <= self.window_size else shift_size
+ self.input_resolution = input_resolution
+
+ self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before")
+ self.attention = TFSwinAttention(config, dim, num_heads, name="attention")
+ self.drop_path = (
+ TFSwinDropPath(drop_path_rate, name="drop_path")
+ if drop_path_rate > 0.0
+ else keras.layers.Activation("linear", name="drop_path")
+ )
+ self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after")
+ self.intermediate = TFSwinIntermediate(config, dim, name="intermediate")
+ self.swin_output = TFSwinOutput(config, dim, name="output")
+ self.dim = dim
+
+ def get_attn_mask(self, height: int, width: int, window_size: int, shift_size: int) -> tf.Tensor | None:
+ img_mask = tf.zeros((height, width))
+ height_slices = ((0, -window_size), (-window_size, -shift_size), (-shift_size, -1))
+ width_slices = ((0, -window_size), (-window_size, -shift_size), (-shift_size, -1))
+
+ # calculate attention mask for SW-MSA
+ if shift_size > 0:
+ count = 0
+ for height_slice in height_slices:
+ for width_slice in width_slices:
+ height_inds = tf.range(height_slice[0] % height, height_slice[1] % height + 1)
+ width_inds = tf.range(width_slice[0] % width, width_slice[1] % width + 1)
+ indices = tf.reshape(tf.stack(tf.meshgrid(height_inds, width_inds), axis=-1), (-1, 2))
+ if len(indices) >= 1:
+ updates = tf.ones((len(indices),), dtype=img_mask.dtype) * count
+ img_mask = tf.tensor_scatter_nd_update(img_mask, indices, updates)
+ count += 1
+
+ img_mask = tf.expand_dims(img_mask, -1)
+ img_mask = tf.expand_dims(img_mask, 0)
+
+ mask_windows = window_partition(img_mask, window_size)
+ mask_windows = tf.reshape(mask_windows, (-1, window_size * window_size))
+ attn_mask = tf.expand_dims(mask_windows, 1) - tf.expand_dims(mask_windows, 2)
+ attn_mask = tf.where(attn_mask != 0, float(-100.0), attn_mask)
+ attn_mask = tf.where(attn_mask == 0, float(0.0), attn_mask)
+ return attn_mask
+
+ def maybe_pad(
+ self, hidden_states: tf.Tensor, window_size: int, height: int, width: int
+ ) -> Tuple[tf.Tensor, tf.Tensor]:
+ pad_right = (window_size - width % window_size) % window_size
+ pad_bottom = (window_size - height % window_size) % window_size
+ pad_values = [[0, 0], [0, pad_bottom], [0, pad_right], [0, 0]]
+ hidden_states = tf.pad(hidden_states, pad_values)
+ pad_values = tf.reshape(pad_values, (-1,))
+ return hidden_states, pad_values
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ input_dimensions: Tuple[int, int],
+ head_mask: tf.Tensor | None = None,
+ output_attentions: bool = False,
+ training: bool = False,
+ ) -> tf.Tensor:
+ # if window size is larger than input resolution, we don't partition windows
+ min_res = tf.reduce_min(input_dimensions)
+ shift_size = 0 if min_res <= self.window_size else self.shift_size
+ window_size = min_res if min_res <= self.window_size else self.window_size
+
+ height, width = input_dimensions
+ batch_size, _, channels = shape_list(hidden_states)
+ shortcut = hidden_states
+
+ hidden_states = self.layernorm_before(hidden_states, training=training)
+ hidden_states = tf.reshape(hidden_states, (batch_size, height, width, channels))
+ # pad hidden_states to multiples of window size
+ hidden_states, pad_values = self.maybe_pad(hidden_states, window_size, height, width)
+
+ _, height_pad, width_pad, _ = shape_list(hidden_states)
+ # cyclic shift
+ if shift_size > 0:
+ shifted_hidden_states = tf.roll(hidden_states, shift=(-shift_size, -shift_size), axis=(1, 2))
+ else:
+ shifted_hidden_states = hidden_states
+
+ # partition windows
+ hidden_states_windows = window_partition(shifted_hidden_states, window_size)
+ hidden_states_windows = tf.reshape(hidden_states_windows, (-1, window_size * window_size, channels))
+ attn_mask = self.get_attn_mask(
+ height=height_pad, width=width_pad, window_size=window_size, shift_size=shift_size
+ )
+
+ attention_outputs = self.attention(
+ hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions, training=training
+ )
+
+ attention_output = attention_outputs[0]
+
+ attention_windows = tf.reshape(attention_output, (-1, window_size, window_size, channels))
+ shifted_windows = window_reverse(attention_windows, window_size, height_pad, width_pad)
+
+ # reverse cyclic shift
+ if shift_size > 0:
+ attention_windows = tf.roll(shifted_windows, shift=(shift_size, shift_size), axis=(1, 2))
+ else:
+ attention_windows = shifted_windows
+
+ was_padded = pad_values[3] > 0 or pad_values[5] > 0
+ if was_padded:
+ attention_windows = attention_windows[:, :height, :width, :]
+
+ attention_windows = tf.reshape(attention_windows, (batch_size, height * width, channels))
+
+ hidden_states = shortcut + self.drop_path(attention_windows, training=training)
+
+ layer_output = self.layernorm_after(hidden_states, training=training)
+ layer_output = self.intermediate(layer_output)
+ layer_output = hidden_states + self.swin_output(layer_output, training=training)
+
+ layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
+ return layer_outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "layernorm_before", None) is not None:
+ with tf.name_scope(self.layernorm_before.name):
+ self.layernorm_before.build([None, None, self.dim])
+ if getattr(self, "attention", None) is not None:
+ with tf.name_scope(self.attention.name):
+ self.attention.build(None)
+ if getattr(self, "drop_path", None) is not None:
+ with tf.name_scope(self.drop_path.name):
+ self.drop_path.build(None)
+ if getattr(self, "layernorm_after", None) is not None:
+ with tf.name_scope(self.layernorm_after.name):
+ self.layernorm_after.build([None, None, self.dim])
+ if getattr(self, "intermediate", None) is not None:
+ with tf.name_scope(self.intermediate.name):
+ self.intermediate.build(None)
+ if getattr(self, "swin_output", None) is not None:
+ with tf.name_scope(self.swin_output.name):
+ self.swin_output.build(None)
+
+
+class TFSwinStage(keras.layers.Layer):
+ def __init__(
+ self,
+ config: SwinConfig,
+ dim: int,
+ input_resolution: Tuple[int, int],
+ depth: int,
+ num_heads: int,
+ drop_path: List[float],
+ downsample: Optional[Callable],
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+ self.config = config
+ self.dim = dim
+ self.blocks = [
+ TFSwinLayer(
+ config=config,
+ dim=dim,
+ input_resolution=input_resolution,
+ num_heads=num_heads,
+ shift_size=0 if (i % 2 == 0) else config.window_size // 2,
+ drop_path_rate=drop_path[i],
+ name=f"blocks.{i}",
+ )
+ for i in range(depth)
+ ]
+
+ # patch merging layer
+ if downsample is not None:
+ self.downsample = downsample(
+ input_resolution,
+ dim=dim,
+ norm_layer=partial(keras.layers.LayerNormalization, epsilon=1e-5),
+ name="downsample",
+ )
+ else:
+ self.downsample = None
+
+ self.pointing = False
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ input_dimensions: Tuple[int, int],
+ head_mask: tf.Tensor | None = None,
+ output_attentions: Optional[bool] = False,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor, ...]:
+ height, width = input_dimensions
+ for i, layer_module in enumerate(self.blocks):
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ layer_outputs = layer_module(
+ hidden_states, input_dimensions, layer_head_mask, output_attentions, training=training
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if self.downsample is not None:
+ height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
+ output_dimensions = (height, width, height_downsampled, width_downsampled)
+ hidden_states = self.downsample(layer_outputs[0], input_dimensions, training=training)
+ else:
+ output_dimensions = (height, width, height, width)
+
+ stage_outputs = (hidden_states, output_dimensions)
+
+ if output_attentions:
+ stage_outputs += layer_outputs[1:]
+ return stage_outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "downsample", None) is not None:
+ with tf.name_scope(self.downsample.name):
+ self.downsample.build(None)
+ if getattr(self, "blocks", None) is not None:
+ for layer in self.blocks:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+
+class TFSwinEncoder(keras.layers.Layer):
+ def __init__(self, config: SwinConfig, grid_size: Tuple[int, int], **kwargs):
+ super().__init__(**kwargs)
+ self.num_layers = len(config.depths)
+ self.config = config
+ dpr = list((tf.linspace(0, 1, sum(config.depths)) * config.drop_path_rate).numpy())
+ self.layers = [
+ TFSwinStage(
+ config=config,
+ dim=int(config.embed_dim * 2**i_layer),
+ input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
+ depth=config.depths[i_layer],
+ num_heads=config.num_heads[i_layer],
+ drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
+ downsample=TFSwinPatchMerging if (i_layer < self.num_layers - 1) else None,
+ name=f"layers.{i_layer}",
+ )
+ for i_layer in range(self.num_layers)
+ ]
+
+ self.gradient_checkpointing = False
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ input_dimensions: Tuple[int, int],
+ head_mask: tf.Tensor | None = None,
+ output_attentions: bool = False,
+ output_hidden_states: bool = False,
+ return_dict: bool = True,
+ training: bool = False,
+ ) -> Union[Tuple[tf.Tensor, ...], TFSwinEncoderOutput]:
+ all_input_dimensions = ()
+ all_hidden_states = () if output_hidden_states else None
+ all_reshaped_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ if output_hidden_states:
+ batch_size, _, hidden_size = shape_list(hidden_states)
+ # rearrange b (h w) c -> b c h w
+ reshaped_hidden_state = tf.reshape(hidden_states, (batch_size, *input_dimensions, hidden_size))
+ reshaped_hidden_state = tf.transpose(reshaped_hidden_state, (0, 3, 1, 2))
+ all_hidden_states += (hidden_states,)
+ all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+ for i, layer_module in enumerate(self.layers):
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ layer_outputs = layer_module(
+ hidden_states, input_dimensions, layer_head_mask, output_attentions, training=training
+ )
+
+ hidden_states = layer_outputs[0]
+ output_dimensions = layer_outputs[1]
+
+ input_dimensions = (output_dimensions[-2], output_dimensions[-1])
+ all_input_dimensions += (input_dimensions,)
+
+ if output_hidden_states:
+ batch_size, _, hidden_size = shape_list(hidden_states)
+ # rearrange b (h w) c -> b c h w
+ reshaped_hidden_state = tf.reshape(hidden_states, (batch_size, *input_dimensions, hidden_size))
+ reshaped_hidden_state = tf.transpose(reshaped_hidden_state, (0, 3, 1, 2))
+ all_hidden_states += (hidden_states,)
+ all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+ if output_attentions:
+ all_self_attentions += layer_outputs[2:]
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+
+ return TFSwinEncoderOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ reshaped_hidden_states=all_reshaped_hidden_states,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "layers", None) is not None:
+ for layer in self.layers:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+
+class TFSwinPreTrainedModel(TFPreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = SwinConfig
+ base_model_prefix = "swin"
+ main_input_name = "pixel_values"
+
+
+SWIN_START_DOCSTRING = r"""
+ This model is a Tensorflow
+ [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) sub-class. Use it as a
+ regular Tensorflow Module and refer to the Tensorflow documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`SwinConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+SWIN_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
+ for details.
+ head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+def normalize_data_format(value: str) -> str:
+ """
+ From tensorflow addons
+ https://github.com/tensorflow/addons/blob/8cec33fcaaf1cf90aec7bdd55a0fcdbb251ce5c2/tensorflow_addons/utils/keras_utils.py#L71
+ """
+ if value is None:
+ value = keras.backend.image_data_format()
+ data_format = value.lower()
+ if data_format not in {"channels_first", "channels_last"}:
+ raise ValueError(
+ 'The `data_format` argument must be one of "channels_first", "channels_last". Received: ' + str(value)
+ )
+ return data_format
+
+
+class AdaptiveAveragePooling1D(keras.layers.Layer):
+ """
+ Args:
+ Average 1D Pooling with adaptive kernel size.
+ output_size: An integer or tuple/list of a single integer, specifying pooled_features.
+ The new size of output channels.
+ data_format: A string,
+ one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs.
+ `channels_last` corresponds to inputs with shape `(batch, steps, channels)` while `channels_first` corresponds
+ to inputs with shape `(batch, channels, steps)`.
+ Input shape:
+ - If `data_format='channels_last'`: 3D tensor with shape `(batch, steps, channels)`.
+ - If `data_format='channels_first'`: 3D tensor with shape `(batch, channels, steps)`.
+ Output shape:
+ - If `data_format='channels_last'`: 3D tensor with shape `(batch_size, pooled_steps, channels)`.
+ - If `data_format='channels_first'`: 3D tensor with shape `(batch_size, channels, pooled_steps)`.
+
+ Adapted from [tensorflow-addon's adaptive pooling.py](
+ https://github.com/tensorflow/addons/blob/8cec33fcaaf1cf90aec7bdd55a0fcdbb251ce5c2/tensorflow_addons/layers/adaptive_pooling.py#L90-L120
+ )
+ """
+
+ def __init__(
+ self,
+ output_size: Union[int, Iterable[int]],
+ reduce_function: Callable = tf.reduce_mean,
+ data_format: Optional[str] = None,
+ **kwargs,
+ ) -> None:
+ self.data_format = normalize_data_format(data_format)
+ self.reduce_function = reduce_function
+ self.output_size = (output_size,) if isinstance(output_size, int) else tuple(output_size)
+ super().__init__(**kwargs)
+
+ def call(self, inputs: tf.Tensor, *args) -> None:
+ bins = self.output_size[0]
+ if self.data_format == "channels_last":
+ splits = tf.split(inputs, bins, axis=1)
+ splits = tf.stack(splits, axis=1)
+ out_vect = self.reduce_function(splits, axis=2)
+ else:
+ splits = tf.split(inputs, bins, axis=2)
+ splits = tf.stack(splits, axis=2)
+ out_vect = self.reduce_function(splits, axis=3)
+ return out_vect
+
+ def compute_output_shape(self, input_shape: Iterable[int]) -> tf.TensorShape:
+ input_shape = tf.TensorShape(input_shape).as_list()
+ if self.data_format == "channels_last":
+ shape = tf.TensorShape([input_shape[0], self.output_size[0], input_shape[2]])
+ else:
+ shape = tf.TensorShape([input_shape[0], input_shape[1], self.output_size[0]])
+ return shape
+
+ def get_config(self) -> Dict[str, Any]:
+ config = {
+ "output_size": self.output_size,
+ "data_format": self.data_format,
+ }
+ base_config = super().get_config()
+ return {**base_config, **config}
+
+
+@keras_serializable
+class TFSwinMainLayer(keras.layers.Layer):
+ config_class = SwinConfig
+
+ def __init__(
+ self, config: SwinConfig, add_pooling_layer: bool = True, use_mask_token: bool = False, **kwargs
+ ) -> None:
+ super().__init__(**kwargs)
+ self.config = config
+ self.num_layers = len(config.depths)
+ self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))
+
+ self.embeddings = TFSwinEmbeddings(config, use_mask_token=use_mask_token, name="embeddings")
+ self.encoder = TFSwinEncoder(config, self.embeddings.patch_grid, name="encoder")
+
+ self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+ self.pooler = AdaptiveAveragePooling1D(output_size=(1,)) if add_pooling_layer else None
+
+ def get_input_embeddings(self) -> TFSwinPatchEmbeddings:
+ return self.embeddings.patch_embeddings
+
+ def _prune_heads(self, heads_to_prune: Dict[int, List]):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ def get_head_mask(self, head_mask: Optional[Any]) -> List:
+ if head_mask is not None:
+ raise NotImplementedError
+ return [None] * len(self.config.depths)
+
+ @unpack_inputs
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ bool_masked_pos: tf.Tensor | None = None,
+ head_mask: tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[TFSwinModelOutput, Tuple[tf.Tensor, ...]]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask)
+ embedding_output, input_dimensions = self.embeddings(
+ pixel_values, bool_masked_pos=bool_masked_pos, training=training
+ )
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ input_dimensions,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ sequence_output = encoder_outputs[0]
+ sequence_output = self.layernorm(sequence_output, training=training)
+
+ pooled_output = None
+ if self.pooler is not None:
+ batch_size, _, num_features = shape_list(sequence_output)
+ pooled_output = self.pooler(sequence_output)
+ pooled_output = tf.reshape(pooled_output, (batch_size, num_features))
+
+ if not return_dict:
+ output = (sequence_output, pooled_output) + encoder_outputs[1:]
+ return output
+
+ return TFSwinModelOutput(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "embeddings", None) is not None:
+ with tf.name_scope(self.embeddings.name):
+ self.embeddings.build(None)
+ if getattr(self, "encoder", None) is not None:
+ with tf.name_scope(self.encoder.name):
+ self.encoder.build(None)
+ if getattr(self, "layernorm", None) is not None:
+ with tf.name_scope(self.layernorm.name):
+ self.layernorm.build([None, None, self.num_features])
+
+
+@add_start_docstrings(
+ "The bare Swin Model transformer outputting raw hidden-states without any specific head on top.",
+ SWIN_START_DOCSTRING,
+)
+class TFSwinModel(TFSwinPreTrainedModel):
+ def __init__(
+ self, config: SwinConfig, add_pooling_layer: bool = True, use_mask_token: bool = False, **kwargs
+ ) -> None:
+ super().__init__(config, **kwargs)
+ self.config = config
+ self.swin = TFSwinMainLayer(config, name="swin")
+
+ @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFSwinModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ @unpack_inputs
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ bool_masked_pos: tf.Tensor | None = None,
+ head_mask: tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[TFSwinModelOutput, Tuple[tf.Tensor, ...]]:
+ r"""
+ bool_masked_pos (`tf.Tensor` of shape `(batch_size, num_patches)`, *optional*):
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ swin_outputs = self.swin(
+ pixel_values=pixel_values,
+ bool_masked_pos=bool_masked_pos,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ return swin_outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "swin", None) is not None:
+ with tf.name_scope(self.swin.name):
+ self.swin.build(None)
+
+
+class TFSwinPixelShuffle(keras.layers.Layer):
+ """TF layer implementation of torch.nn.PixelShuffle"""
+
+ def __init__(self, upscale_factor: int, **kwargs) -> None:
+ super().__init__(**kwargs)
+ if not isinstance(upscale_factor, int) or upscale_factor < 2:
+ raise ValueError(f"upscale_factor must be an integer value >= 2 got {upscale_factor}")
+ self.upscale_factor = upscale_factor
+
+ def call(self, x: tf.Tensor) -> tf.Tensor:
+ hidden_states = x
+ batch_size, _, _, num_input_channels = shape_list(hidden_states)
+ block_size_squared = self.upscale_factor**2
+ output_depth = int(num_input_channels / block_size_squared)
+ # When the number of output channels >= 2, PyTorch's PixelShuffle and
+ # TF's depth_to_space differ in their output as the order of channels selected for combining
+ # is a permutation of the other c.f.
+ # https://stackoverflow.com/questions/68272502/tf-depth-to-space-not-same-as-torchs-pixelshuffle-when-output-channels-1
+ permutation = tf.constant(
+ [[i + j * block_size_squared for i in range(block_size_squared) for j in range(output_depth)]]
+ )
+ hidden_states = tf.gather(params=hidden_states, indices=tf.tile(permutation, [batch_size, 1]), batch_dims=-1)
+ hidden_states = tf.nn.depth_to_space(hidden_states, block_size=self.upscale_factor, data_format="NHWC")
+ return hidden_states
+
+
+class TFSwinDecoder(keras.layers.Layer):
+ def __init__(self, config: SwinConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.conv2d = keras.layers.Conv2D(
+ filters=config.encoder_stride**2 * config.num_channels, kernel_size=1, strides=1, name="0"
+ )
+ self.pixel_shuffle = TFSwinPixelShuffle(config.encoder_stride, name="1")
+ self.config = config
+
+ def call(self, x: tf.Tensor) -> tf.Tensor:
+ hidden_states = x
+ # B,C,H,W -> B,H,W,C
+ hidden_states = tf.transpose(hidden_states, (0, 2, 3, 1))
+ hidden_states = self.conv2d(hidden_states)
+ hidden_states = self.pixel_shuffle(hidden_states)
+ # B,H,W,C -> B,C,H,W
+ hidden_states = tf.transpose(hidden_states, (0, 3, 1, 2))
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "conv2d", None) is not None:
+ with tf.name_scope(self.conv2d.name):
+ self.conv2d.build([None, None, None, self.config.hidden_size])
+ if getattr(self, "pixel_shuffle", None) is not None:
+ with tf.name_scope(self.pixel_shuffle.name):
+ self.pixel_shuffle.build(None)
+
+
+@add_start_docstrings(
+ "Swin Model with a decoder on top for masked image modeling, as proposed in"
+ " [SimMIM](https://arxiv.org/abs/2111.09886).",
+ SWIN_START_DOCSTRING,
+)
+class TFSwinForMaskedImageModeling(TFSwinPreTrainedModel):
+ def __init__(self, config: SwinConfig):
+ super().__init__(config)
+
+ self.swin = TFSwinMainLayer(config, add_pooling_layer=False, use_mask_token=True, name="swin")
+
+ self.decoder = TFSwinDecoder(config, name="decoder")
+
+ @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TFSwinMaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
+ @unpack_inputs
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ bool_masked_pos: tf.Tensor | None = None,
+ head_mask: tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[Tuple, TFSwinMaskedImageModelingOutput]:
+ r"""
+ bool_masked_pos (`tf.Tensor` of shape `(batch_size, num_patches)`):
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+
+ Returns:
+
+ Examples:
+ ```python
+ >>> from transformers import AutoImageProcessor, TFSwinForMaskedImageModeling
+ >>> import tensorflow as tf
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
+ >>> model = TFSwinForMaskedImageModeling.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
+
+ >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
+ >>> pixel_values = image_processor(images=image, return_tensors="tf").pixel_values
+ >>> # create random boolean mask of shape (batch_size, num_patches)
+ >>> bool_masked_pos = tf.random.uniform((1, num_patches)) >= 0.5
+
+ >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
+ >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
+ >>> list(reconstructed_pixel_values.shape)
+ [1, 3, 224, 224]
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.swin(
+ pixel_values,
+ bool_masked_pos=bool_masked_pos,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ sequence_output = outputs[0]
+ # Reshape to (batch_size, num_channels, height, width)
+ sequence_output = tf.transpose(sequence_output, (0, 2, 1))
+ batch_size, num_channels, sequence_length = shape_list(sequence_output)
+ height = width = int(sequence_length**0.5)
+ sequence_output = tf.reshape(sequence_output, (batch_size, num_channels, height, width))
+
+ # Reconstruct pixel values
+ reconstructed_pixel_values = self.decoder(sequence_output)
+
+ masked_im_loss = None
+ if bool_masked_pos is not None:
+ size = self.config.image_size // self.config.patch_size
+ bool_masked_pos = tf.reshape(bool_masked_pos, (-1, size, size))
+ mask = tf.repeat(bool_masked_pos, self.config.patch_size, 1)
+ mask = tf.repeat(mask, self.config.patch_size, 2)
+ mask = tf.expand_dims(mask, 1)
+ mask = tf.cast(mask, tf.float32)
+
+ reconstruction_loss = keras.losses.mean_absolute_error(
+ # Swap axes as metric calculation reduces over the final dimension
+ tf.transpose(pixel_values, (1, 2, 3, 0)),
+ tf.transpose(reconstructed_pixel_values, (1, 2, 3, 0)),
+ )
+ reconstruction_loss = tf.expand_dims(reconstruction_loss, 0)
+ total_loss = tf.reduce_sum(reconstruction_loss * mask)
+ num_masked_pixels = (tf.reduce_sum(mask) + 1e-5) * self.config.num_channels
+ masked_im_loss = total_loss / num_masked_pixels
+ masked_im_loss = tf.reshape(masked_im_loss, (1,))
+
+ if not return_dict:
+ output = (reconstructed_pixel_values,) + outputs[2:]
+ return ((masked_im_loss,) + output) if masked_im_loss is not None else output
+
+ return TFSwinMaskedImageModelingOutput(
+ loss=masked_im_loss,
+ reconstruction=reconstructed_pixel_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ reshaped_hidden_states=outputs.reshaped_hidden_states,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "swin", None) is not None:
+ with tf.name_scope(self.swin.name):
+ self.swin.build(None)
+ if getattr(self, "decoder", None) is not None:
+ with tf.name_scope(self.decoder.name):
+ self.decoder.build(None)
+
+
+@add_start_docstrings(
+ """
+ Swin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
+ the [CLS] token) e.g. for ImageNet.
+ """,
+ SWIN_START_DOCSTRING,
+)
+class TFSwinForImageClassification(TFSwinPreTrainedModel, TFSequenceClassificationLoss):
+ def __init__(self, config: SwinConfig):
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.swin = TFSwinMainLayer(config, name="swin")
+
+ # Classifier head
+ self.classifier = (
+ keras.layers.Dense(config.num_labels, name="classifier")
+ if config.num_labels > 0
+ else keras.layers.Activation("linear", name="classifier")
+ )
+
+ @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=TFSwinImageClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ @unpack_inputs
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ head_mask: tf.Tensor | None = None,
+ labels: tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[Tuple[tf.Tensor, ...], TFSwinImageClassifierOutput]:
+ r"""
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.swin(
+ pixel_values,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ pooled_output = outputs[1]
+
+ logits = self.classifier(pooled_output, training=training)
+
+ loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFSwinImageClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ reshaped_hidden_states=outputs.reshaped_hidden_states,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "swin", None) is not None:
+ with tf.name_scope(self.swin.name):
+ self.swin.build(None)
+ if getattr(self, "classifier", None) is not None:
+ if hasattr(self.classifier, "name"):
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, self.swin.num_features])
+
+
+__all__ = ["TFSwinForImageClassification", "TFSwinForMaskedImageModeling", "TFSwinModel", "TFSwinPreTrainedModel"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/tvp/__init__.py b/.venv/lib/python3.11/site-packages/transformers/models/tvp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7388075201e03c4ee182310085686cef1c6b0256
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/tvp/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_tvp import *
+ from .image_processing_tvp import *
+ from .modeling_tvp import *
+ from .processing_tvp import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/tvp/configuration_tvp.py b/.venv/lib/python3.11/site-packages/transformers/models/tvp/configuration_tvp.py
new file mode 100644
index 0000000000000000000000000000000000000000..803acb220d081fe3140dafd8843491c61d396004
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/tvp/configuration_tvp.py
@@ -0,0 +1,201 @@
+# coding=utf-8
+# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License=, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing=, software
+# distributed under the License is distributed on an "AS IS" BASIS=,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TVP model configuration"""
+
+import copy
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import verify_backbone_config_arguments
+from ..auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+
+class TvpConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`TvpModel`]. It is used to instantiate an Tvp
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the Tvp
+ [Intel/tvp-base](https://huggingface.co/Intel/tvp-base) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ backbone_config (`PretrainedConfig` or `dict`, *optional*):
+ The configuration of the backbone model.
+ backbone (`str`, *optional*):
+ Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+ will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+ is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+ use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+ Whether to use pretrained weights for the backbone.
+ use_timm_backbone (`bool`, *optional*, defaults to `False`):
+ Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
+ library.
+ backbone_kwargs (`dict`, *optional*):
+ Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+ e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+ distance_loss_weight (`float`, *optional*, defaults to 1.0):
+ The weight of distance loss.
+ duration_loss_weight (`float`, *optional*, defaults to 0.1):
+ The weight of duration loss.
+ visual_prompter_type (`str`, *optional*, defaults to `"framepad"`):
+ Visual prompt type. The type of padding. Framepad means padding on each frame. Should be one of "framepad"
+ or "framedownpad"
+ visual_prompter_apply (`str`, *optional*, defaults to `"replace"`):
+ The way of applying visual prompt. Replace means use the value of prompt to change the original value in
+ visual inputs. Should be one of "replace", or "add", or "remove".
+ visual_prompt_size (`int`, *optional*, defaults to 96):
+ The size of visual prompt.
+ max_img_size (`int`, *optional*, defaults to 448):
+ The maximum size of frame.
+ num_frames (`int`, *optional*, defaults to 48):
+ The number of frames extracted from a video.
+ vocab_size (`int`, *optional*, defaults to 30522):
+ Vocabulary size of the Tvp text model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`TvpModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ max_position_embeddings (`int`, *optional*, defaults to 512):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ max_grid_col_position_embeddings (`int`, *optional*, defaults to 100):
+ The largest number of horizontal patches from a video frame.
+ max_grid_row_position_embeddings (`int`, *optional*, defaults to 100):
+ The largest number of vertical patches from a video frame.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability of hidden layers.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability of attention layers.
+ """
+
+ model_type = "tvp"
+
+ def __init__(
+ self,
+ backbone_config=None,
+ backbone=None,
+ use_pretrained_backbone=False,
+ use_timm_backbone=False,
+ backbone_kwargs=None,
+ distance_loss_weight=1.0,
+ duration_loss_weight=0.1,
+ visual_prompter_type="framepad",
+ visual_prompter_apply="replace",
+ visual_prompt_size=96,
+ max_img_size=448,
+ num_frames=48,
+ vocab_size=30522,
+ hidden_size=768,
+ intermediate_size=3072,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ max_position_embeddings=512,
+ max_grid_col_position_embeddings=100,
+ max_grid_row_position_embeddings=100,
+ hidden_dropout_prob=0.1,
+ hidden_act="gelu",
+ layer_norm_eps=1e-12,
+ initializer_range=0.02,
+ attention_probs_dropout_prob=0.1,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ if backbone_config is None and backbone is None:
+ logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
+ backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
+ elif isinstance(backbone_config, dict):
+ backbone_model_type = backbone_config.get("model_type")
+ config_class = CONFIG_MAPPING[backbone_model_type]
+ backbone_config = config_class.from_dict(backbone_config)
+
+ verify_backbone_config_arguments(
+ use_timm_backbone=use_timm_backbone,
+ use_pretrained_backbone=use_pretrained_backbone,
+ backbone=backbone,
+ backbone_config=backbone_config,
+ backbone_kwargs=backbone_kwargs,
+ )
+
+ self.backbone_config = backbone_config
+ self.backbone = backbone
+ self.use_pretrained_backbone = use_pretrained_backbone
+ self.use_timm_backbone = use_timm_backbone
+ self.backbone_kwargs = backbone_kwargs
+ self.distance_loss_weight = distance_loss_weight
+ self.duration_loss_weight = duration_loss_weight
+ self.visual_prompter_type = visual_prompter_type
+ self.visual_prompter_apply = visual_prompter_apply
+ self.visual_prompt_size = visual_prompt_size
+ self.max_img_size = max_img_size
+ self.num_frames = num_frames
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.max_position_embeddings = max_position_embeddings
+ self.max_grid_col_position_embeddings = max_grid_col_position_embeddings
+ self.max_grid_row_position_embeddings = max_grid_row_position_embeddings
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+
+ @classmethod
+ def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs):
+ """Instantiate a [`TvpConfig`] (or a derived class) from a pre-trained backbone model configuration.
+
+ Args:
+ backbone_config ([`PretrainedConfig`]):
+ The backbone configuration.
+ Returns:
+ [`TvpConfig`]: An instance of a configuration object
+ """
+ return cls(backbone_config=backbone_config, **kwargs)
+
+ def to_dict(self):
+ """
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+ Returns:
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+ """
+ output = copy.deepcopy(self.__dict__)
+ if output["backbone_config"] is not None:
+ output["backbone_config"] = self.backbone_config.to_dict()
+ output["model_type"] = self.__class__.model_type
+ return output
+
+
+__all__ = ["TvpConfig"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/tvp/image_processing_tvp.py b/.venv/lib/python3.11/site-packages/transformers/models/tvp/image_processing_tvp.py
new file mode 100644
index 0000000000000000000000000000000000000000..115433c6c48b75b223af83ddb16a2d1764d5fa82
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/tvp/image_processing_tvp.py
@@ -0,0 +1,481 @@
+# coding=utf-8
+# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License=, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing=, software
+# distributed under the License is distributed on an "AS IS" BASIS=,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for TVP."""
+
+from typing import Dict, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+ PaddingMode,
+ flip_channel_order,
+ pad,
+ resize,
+ to_channel_dimension_format,
+)
+from ...image_utils import (
+ IMAGENET_STANDARD_MEAN,
+ IMAGENET_STANDARD_STD,
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ get_image_size,
+ is_valid_image,
+ to_numpy_array,
+ valid_images,
+ validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+
+
+if is_vision_available():
+ import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.vivit.image_processing_vivit.make_batched
+def make_batched(videos) -> List[List[ImageInput]]:
+ if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
+ return videos
+
+ elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
+ return [videos]
+
+ elif is_valid_image(videos):
+ return [[videos]]
+
+ raise ValueError(f"Could not make batched video from {videos}")
+
+
+def get_resize_output_image_size(
+ input_image: np.ndarray,
+ max_size: int = 448,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+ height, width = get_image_size(input_image, input_data_format)
+ if height >= width:
+ ratio = width * 1.0 / height
+ new_height = max_size
+ new_width = new_height * ratio
+ else:
+ ratio = height * 1.0 / width
+ new_width = max_size
+ new_height = new_width * ratio
+ size = (int(new_height), int(new_width))
+
+ return size
+
+
+class TvpImageProcessor(BaseImageProcessor):
+ r"""
+ Constructs a Tvp image processor.
+
+ Args:
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+ `do_resize` parameter in the `preprocess` method.
+ size (`Dict[str, int]` *optional*, defaults to `{"longest_edge": 448}`):
+ Size of the output image after resizing. The longest edge of the image will be resized to
+ `size["longest_edge"]` while maintaining the aspect ratio of the original image. Can be overriden by
+ `size` in the `preprocess` method.
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+ Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
+ `preprocess` method.
+ do_center_crop (`bool`, *optional*, defaults to `True`):
+ Whether to center crop the image to the specified `crop_size`. Can be overridden by the `do_center_crop`
+ parameter in the `preprocess` method.
+ crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 448, "width": 448}`):
+ Size of the image after applying the center crop. Can be overridden by the `crop_size` parameter in the
+ `preprocess` method.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+ parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter
+ in the `preprocess` method.
+ do_pad (`bool`, *optional*, defaults to `True`):
+ Whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` method.
+ pad_size (`Dict[str, int]`, *optional*, defaults to `{"height": 448, "width": 448}`):
+ Size of the image after applying the padding. Can be overridden by the `pad_size` parameter in the
+ `preprocess` method.
+ constant_values (`Union[float, Iterable[float]]`, *optional*, defaults to 0):
+ The fill value to use when padding the image.
+ pad_mode (`PaddingMode`, *optional*, defaults to `PaddingMode.CONSTANT`):
+ Use what kind of mode in padding.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+ method.
+ do_flip_channel_order (`bool`, *optional*, defaults to `True`):
+ Whether to flip the color channels from RGB to BGR. Can be overridden by the `do_flip_channel_order`
+ parameter in the `preprocess` method.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ """
+
+ model_input_names = ["pixel_values"]
+
+ def __init__(
+ self,
+ do_resize: bool = True,
+ size: Dict[str, int] = None,
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
+ do_center_crop: bool = True,
+ crop_size: Dict[str, int] = None,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_pad: bool = True,
+ pad_size: Dict[str, int] = None,
+ constant_values: Union[float, Iterable[float]] = 0,
+ pad_mode: PaddingMode = PaddingMode.CONSTANT,
+ do_normalize: bool = True,
+ do_flip_channel_order: bool = True,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+ size = size if size is not None else {"longest_edge": 448}
+ crop_size = crop_size if crop_size is not None else {"height": 448, "width": 448}
+ pad_size = pad_size if pad_size is not None else {"height": 448, "width": 448}
+
+ self.do_resize = do_resize
+ self.size = size
+ self.do_center_crop = do_center_crop
+ self.crop_size = crop_size
+ self.resample = resample
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_pad = do_pad
+ self.pad_size = pad_size
+ self.constant_values = constant_values
+ self.pad_mode = pad_mode
+ self.do_normalize = do_normalize
+ self.do_flip_channel_order = do_flip_channel_order
+ self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+ self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+
+ def resize(
+ self,
+ image: np.ndarray,
+ size: Dict[str, int],
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
+ data_format: Optional[Union[str, ChannelDimension]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Resize an image.
+
+ Args:
+ image (`np.ndarray`):
+ Image to resize.
+ size (`Dict[str, int]`):
+ Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
+ have the size `(h, w)`. If `size` is of the form `{"longest_edge": s}`, the output image will have its
+ longest edge of length `s` while keeping the aspect ratio of the original image.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+ Resampling filter to use when resiizing the image.
+ data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format of the image. If not provided, it will be the same as the input image.
+ input_data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred.
+ """
+ size = get_size_dict(size, default_to_square=False)
+ if "height" in size and "width" in size:
+ output_size = (size["height"], size["width"])
+ elif "longest_edge" in size:
+ output_size = get_resize_output_image_size(image, size["longest_edge"], input_data_format)
+ else:
+ raise ValueError(f"Size must have 'height' and 'width' or 'longest_edge' as keys. Got {size.keys()}")
+
+ return resize(
+ image,
+ size=output_size,
+ resample=resample,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ **kwargs,
+ )
+
+ def pad_image(
+ self,
+ image: np.ndarray,
+ pad_size: Dict[str, int] = None,
+ constant_values: Union[float, Iterable[float]] = 0,
+ pad_mode: PaddingMode = PaddingMode.CONSTANT,
+ data_format: Optional[Union[str, ChannelDimension]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ):
+ """
+ Pad an image with zeros to the given size.
+
+ Args:
+ image (`np.ndarray`):
+ Image to pad.
+ pad_size (`Dict[str, int]`)
+ Size of the output image with pad.
+ constant_values (`Union[float, Iterable[float]]`)
+ The fill value to use when padding the image.
+ pad_mode (`PaddingMode`)
+ The pad mode, default to PaddingMode.CONSTANT
+ data_format (`ChannelDimension` or `str`, *optional*)
+ The channel dimension format of the image. If not provided, it will be the same as the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred.
+ """
+ height, width = get_image_size(image, channel_dim=input_data_format)
+ max_height = pad_size.get("height", height)
+ max_width = pad_size.get("width", width)
+
+ pad_right, pad_bottom = max_width - width, max_height - height
+ if pad_right < 0 or pad_bottom < 0:
+ raise ValueError("The padding size must be greater than image size")
+
+ padding = ((0, pad_bottom), (0, pad_right))
+ padded_image = pad(
+ image,
+ padding,
+ mode=pad_mode,
+ constant_values=constant_values,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ )
+
+ return padded_image
+
+ def _preprocess_image(
+ self,
+ image: ImageInput,
+ do_resize: bool = None,
+ size: Dict[str, int] = None,
+ resample: PILImageResampling = None,
+ do_center_crop: bool = None,
+ crop_size: Dict[str, int] = None,
+ do_rescale: bool = None,
+ rescale_factor: float = None,
+ do_pad: bool = True,
+ pad_size: Dict[str, int] = None,
+ constant_values: Union[float, Iterable[float]] = None,
+ pad_mode: PaddingMode = None,
+ do_normalize: bool = None,
+ do_flip_channel_order: bool = None,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> np.ndarray:
+ """Preprocesses a single image."""
+
+ validate_preprocess_arguments(
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_pad=do_pad,
+ size_divisibility=pad_size, # here the pad() method simply requires the pad_size argument.
+ do_center_crop=do_center_crop,
+ crop_size=crop_size,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ )
+
+ # All transformations expect numpy arrays.
+ image = to_numpy_array(image)
+
+ if do_resize:
+ image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+
+ if do_center_crop:
+ image = self.center_crop(image, size=crop_size, input_data_format=input_data_format)
+
+ if do_rescale:
+ image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+ if do_normalize:
+ image = self.normalize(
+ image=image.astype(np.float32), mean=image_mean, std=image_std, input_data_format=input_data_format
+ )
+
+ if do_pad:
+ image = self.pad_image(
+ image=image,
+ pad_size=pad_size,
+ constant_values=constant_values,
+ pad_mode=pad_mode,
+ input_data_format=input_data_format,
+ )
+
+ # the pretrained checkpoints assume images are BGR, not RGB
+ if do_flip_channel_order:
+ image = flip_channel_order(image=image, input_data_format=input_data_format)
+
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+
+ return image
+
+ @filter_out_non_signature_kwargs()
+ def preprocess(
+ self,
+ videos: Union[ImageInput, List[ImageInput], List[List[ImageInput]]],
+ do_resize: bool = None,
+ size: Dict[str, int] = None,
+ resample: PILImageResampling = None,
+ do_center_crop: bool = None,
+ crop_size: Dict[str, int] = None,
+ do_rescale: bool = None,
+ rescale_factor: float = None,
+ do_pad: bool = None,
+ pad_size: Dict[str, int] = None,
+ constant_values: Union[float, Iterable[float]] = None,
+ pad_mode: PaddingMode = None,
+ do_normalize: bool = None,
+ do_flip_channel_order: bool = None,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ data_format: ChannelDimension = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> PIL.Image.Image:
+ """
+ Preprocess an image or batch of images.
+
+ Args:
+ videos (`ImageInput` or `List[ImageInput]` or `List[List[ImageInput]]`):
+ Frames to preprocess.
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+ Whether to resize the image.
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+ Size of the image after applying resize.
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
+ has an effect if `do_resize` is set to `True`.
+ do_center_crop (`bool`, *optional*, defaults to `self.do_centre_crop`):
+ Whether to centre crop the image.
+ crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
+ Size of the image after applying the centre crop.
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+ Whether to rescale the image values between [0 - 1].
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+ do_pad (`bool`, *optional*, defaults to `True`):
+ Whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` method.
+ pad_size (`Dict[str, int]`, *optional*, defaults to `{"height": 448, "width": 448}`):
+ Size of the image after applying the padding. Can be overridden by the `pad_size` parameter in the
+ `preprocess` method.
+ constant_values (`Union[float, Iterable[float]]`, *optional*, defaults to 0):
+ The fill value to use when padding the image.
+ pad_mode (`PaddingMode`, *optional*, defaults to "PaddingMode.CONSTANT"):
+ Use what kind of mode in padding.
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+ Whether to normalize the image.
+ do_flip_channel_order (`bool`, *optional*, defaults to `self.do_flip_channel_order`):
+ Whether to flip the channel order of the image.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+ Image mean.
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+ Image standard deviation.
+ return_tensors (`str` or `TensorType`, *optional*):
+ The type of tensors to return. Can be one of:
+ - Unset: Return a list of `np.ndarray`.
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the inferred channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ """
+ do_resize = do_resize if do_resize is not None else self.do_resize
+ resample = resample if resample is not None else self.resample
+ do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+ do_pad = do_pad if do_pad is not None else self.do_pad
+ pad_size = pad_size if pad_size is not None else self.pad_size
+ constant_values = constant_values if constant_values is not None else self.constant_values
+ pad_mode = pad_mode if pad_mode else self.pad_mode
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+ do_flip_channel_order = (
+ do_flip_channel_order if do_flip_channel_order is not None else self.do_flip_channel_order
+ )
+ image_mean = image_mean if image_mean is not None else self.image_mean
+ image_std = image_std if image_std is not None else self.image_std
+
+ size = size if size is not None else self.size
+ size = get_size_dict(size, default_to_square=False)
+ crop_size = crop_size if crop_size is not None else self.crop_size
+ crop_size = get_size_dict(crop_size, param_name="crop_size")
+
+ if not valid_images(videos):
+ raise ValueError(
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+ "torch.Tensor, tf.Tensor or jax.ndarray."
+ )
+
+ videos = make_batched(videos)
+
+ videos = [
+ np.array(
+ [
+ self._preprocess_image(
+ image=img,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ do_center_crop=do_center_crop,
+ crop_size=crop_size,
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_pad=do_pad,
+ pad_size=pad_size,
+ constant_values=constant_values,
+ pad_mode=pad_mode,
+ do_normalize=do_normalize,
+ do_flip_channel_order=do_flip_channel_order,
+ image_mean=image_mean,
+ image_std=image_std,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ )
+ for img in video
+ ]
+ )
+ for video in videos
+ ]
+
+ data = {"pixel_values": videos}
+ return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["TvpImageProcessor"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/tvp/modeling_tvp.py b/.venv/lib/python3.11/site-packages/transformers/models/tvp/modeling_tvp.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d9a0106d9d56958cc17758949d73e85e4b2e9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/tvp/modeling_tvp.py
@@ -0,0 +1,985 @@
+# coding=utf-8
+# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License=, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing=, software
+# distributed under the License is distributed on an "AS IS" BASIS=,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch TVP Model"""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import prune_linear_layer
+from ...utils import logging
+from ...utils.backbone_utils import load_backbone
+from .configuration_tvp import TvpConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class TvpVideoGroundingOutput(ModelOutput):
+ """
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
+ Temporal-Distance IoU loss for video grounding.
+ logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
+ Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
+ input texts.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
+ the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+
+
+class TvpLoss(nn.Module):
+ """
+ This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
+ hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
+ ground-truth / prediction (supervise class and box).
+
+ Args:
+ losses (`List[str]`):
+ List of all the losses to be applied.
+ """
+
+ def __init__(self, losses):
+ super().__init__()
+ self.loss_map = {
+ "iou": self.loss_iou,
+ "distance": self.loss_distance,
+ "duration": self.loss_duration,
+ }
+ for loss in losses:
+ if loss not in self.loss_map:
+ raise ValueError(f"Loss {loss} not supported")
+
+ self.losses = losses
+
+ def loss_iou(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
+ """
+ Measure the intersection over union.
+ """
+ inter = torch.min(candidates_end_time, end_time) - torch.max(candidates_start_time, start_time)
+ union = torch.max(candidates_end_time, end_time) - torch.min(candidates_start_time, start_time)
+ iou = 1 - inter.clamp(min=0) / union
+
+ return iou
+
+ def loss_distance(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
+ """
+ Measure the distance of mid points.
+ """
+ mid_candidates = torch.div(torch.add(candidates_start_time, candidates_end_time), 2.0)
+ mid_groundtruth = torch.div(torch.add(start_time, end_time), 2.0)
+ distance_diff = torch.div(
+ torch.max(mid_candidates, mid_groundtruth) - torch.min(mid_candidates, mid_groundtruth), duration
+ ).clamp(min=0.2)
+
+ return distance_diff
+
+ def loss_duration(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
+ """
+ Measure the difference of duration.
+ """
+ duration_candidates = torch.sub(candidates_end_time, candidates_start_time)
+ duration_groundtruth = torch.sub(end_time, start_time)
+ duration_diff = torch.square(torch.div(torch.sub(duration_candidates, duration_groundtruth), duration))
+ duration_diff = duration_diff.clamp(min=0.4)
+
+ return duration_diff
+
+ def forward(self, logits, labels):
+ """
+ This performs the loss computation.
+
+ Args:
+ logits (`torch.FloatTensor`):
+ The output logits of head module.
+ labels (`List[torch.FloatTensor]`):
+ List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
+ """
+ duration, start_time, end_time = labels
+ candidates = torch.mul(logits, duration)
+ candidates_start_time, candidates_end_time = candidates[:, 0].float(), candidates[:, 1].float()
+
+ losses_dict = {}
+ for loss in self.losses:
+ losses_dict.update(
+ {loss: self.loss_map[loss](start_time, end_time, candidates_start_time, candidates_end_time, duration)}
+ )
+
+ return losses_dict
+
+
+class TvpVisionModel(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.backbone = load_backbone(config)
+
+ if config.backbone_config is not None:
+ in_channels = config.backbone_config.hidden_sizes[-1]
+ elif hasattr(self.backbone, "config") and hasattr(self.backbone.config, "hidden_sizes"):
+ in_channels = self.backbone.config.hidden_sizes[-1]
+ elif hasattr(self.backbone, "config") and hasattr(self.backbone.config, "hidden_size"):
+ in_channels = self.backbone.config.hidden_size
+ else:
+ raise ValueError("Backbone config not found")
+
+ self.grid_encoder_conv = nn.Conv2d(
+ in_channels,
+ config.hidden_size,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ groups=1,
+ bias=False,
+ )
+
+ def forward(self, pixel_values):
+ batch_size, num_frames, num_channels, height, width = pixel_values.shape
+ # (batch_size * num_frames, num_channels, height, width)
+ pixel_values = pixel_values.view(batch_size * num_frames, num_channels, height, width)
+ grid_feat_outputs = self.backbone(pixel_values)["feature_maps"][0]
+ grid = self.grid_encoder_conv(grid_feat_outputs)
+ grid = nn.functional.max_pool2d(grid, kernel_size=2, stride=2)
+ grid = nn.functional.relu(grid, inplace=True)
+ new_channel, new_height, new_width = grid.shape[-3:]
+ # (batch_size, num_frames, num_channels, height, width)
+ grid = grid.view(batch_size, num_frames, new_channel, new_height, new_width)
+ # (batch_size, num_frames, height, width, num_channels)
+ grid = grid.permute(0, 1, 3, 4, 2)
+ return grid
+
+
+class TvpVisualInputEmbedding(nn.Module):
+ """
+ Takes input of both image and video (multi-frame)
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ # sequence embedding
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+ self.row_position_embeddings = nn.Embedding(config.max_grid_row_position_embeddings, config.hidden_size)
+ self.col_position_embeddings = nn.Embedding(config.max_grid_col_position_embeddings, config.hidden_size)
+ self.token_type_embeddings = nn.Embedding(1, config.hidden_size)
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.max_grid_row_position_embeddings = config.max_grid_row_position_embeddings
+ self.max_grid_col_position_embeddings = config.max_grid_col_position_embeddings
+
+ def interpolate_pos_encoding(self, embedding: torch.Tensor, height: int, width: int) -> torch.Tensor:
+ """
+ This method allows to interpolate the pre-trained pad weights , to be able to use the model on collection of high
+ resolution images (high resolution videos).
+
+ """
+ h0 = w0 = 1
+ # if height dimension is to be interpolated
+ if height > self.max_grid_row_position_embeddings:
+ h0 = height / self.max_grid_row_position_embeddings
+ # if width dimension is to be interpolated
+ if width > self.max_grid_col_position_embeddings:
+ w0 = width / self.max_grid_col_position_embeddings
+ embedding = embedding.permute(0, 3, 1, 2) # (batch_size, hidden_dim, height, width)
+ embedding = nn.functional.interpolate(
+ embedding,
+ scale_factor=(h0, w0),
+ mode="bicubic",
+ align_corners=False,
+ )
+ embedding = embedding.permute(0, 2, 3, 1) # (batch_size, height, width, hidden_dim)
+ return embedding
+
+ def add_2d_positional_embeddings(self, grid, interpolate_pos_encoding: bool = False):
+ """
+ Args:
+ grid: (batch_size, height, width, hidden_dim)
+ interpolate_pos_encoding: (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
+ Returns:
+ grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
+ """
+ batch_size, height, width, hidden_dim = grid.shape
+
+ # add row-wise position embeddings
+ # (height, )
+ row_height = min(self.max_grid_row_position_embeddings, height)
+ row_position_ids = torch.arange(row_height, dtype=torch.long, device=grid.device)
+ # (height, hidden_dim)
+ row_position_embeddings = self.row_position_embeddings(row_position_ids)
+ row_shape = (1,) * (len(grid.shape) - 3) + (row_height, 1, hidden_dim)
+ # (batch_size, height, 1, hidden_dim)
+ row_position_embeddings = row_position_embeddings.view(*row_shape)
+
+ # add column-wise position embeddings
+ row_width = min(self.max_grid_col_position_embeddings, width)
+ col_position_ids = torch.arange(row_width, dtype=torch.long, device=grid.device)
+ # (width, hidden_dim)
+ col_position_embeddings = self.col_position_embeddings(col_position_ids)
+ col_shape = (batch_size, 1, row_width, hidden_dim)
+ # (batch_size, 1, width, hidden_dim)
+ col_position_embeddings = col_position_embeddings.view(*col_shape)
+ # (batch_size, height, width, hidden_dim)
+ positional_embeddings = row_position_embeddings + col_position_embeddings
+
+ # This interpolation gets triggered ONLY when the input image dim is larger in any dimenstion than the original position embeddings
+ if interpolate_pos_encoding and (
+ height > self.max_grid_row_position_embeddings or width > self.max_grid_col_position_embeddings
+ ):
+ grid = grid + self.interpolate_pos_encoding(positional_embeddings, height, width)
+ else:
+ grid = grid + positional_embeddings
+ return grid
+
+ def forward(self, grid, interpolate_pos_encoding: bool = False):
+ """
+ Args:
+ grid: Array of shape (batch_size, num_frames, height, width, num_channels).
+ It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
+ num_frames can be 1
+ interpolate_pos_encoding: (bool, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
+
+ Returns:
+ embeddings: The embedding of grid with size (batch_size, height*width, num_channels)
+
+ """
+ batch_size, num_frames, height, width, num_channels = grid.shape
+ # temporal mean pooling, (batch_size, height, width, hidden_size)
+ grid = grid.mean(1)
+ grid = self.add_2d_positional_embeddings(grid, interpolate_pos_encoding=interpolate_pos_encoding)
+ # image token sequence, (batch_size, height*width, num_channels)
+ visual_tokens = grid.view(batch_size, -1, num_channels)
+ visual_tokens_shape = visual_tokens.shape[:-1]
+ device = visual_tokens.device
+
+ # image token type embeddings.
+ token_type_ids = torch.zeros(visual_tokens_shape, dtype=torch.long, device=device)
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+ embeddings = visual_tokens + token_type_embeddings
+ embeddings = self.layer_norm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+class TvpTextInputEmbeddings(nn.Module):
+ """Construct the embeddings from word, position and token_type embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
+ if input_ids is not None:
+ input_shape = input_ids.size()
+ else:
+ input_shape = inputs_embeds.size()[:-1]
+
+ seq_length = input_shape[1]
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ if position_ids is None:
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
+ position_ids = position_ids.unsqueeze(0).expand(input_shape)
+ if token_type_ids is None:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+ position_embeddings = self.position_embeddings(position_ids)
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+ embeddings = inputs_embeds + position_embeddings + token_type_embeddings
+ embeddings = self.layer_norm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+class TvpAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads {config.num_attention_heads}"
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
+ self.attn_dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ mask = torch.ones(self.num_attention_heads, self.attention_head_size)
+ heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads
+ for head in heads:
+ # Compute how many pruned heads are before the head and move the index accordingly
+ head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
+ mask[head] = 0
+ mask = mask.view(-1).contiguous().eq(1)
+ index = torch.arange(len(mask))[mask].long()
+
+ # Prune linear layers
+ self.query = prune_linear_layer(self.query, index)
+ self.key = prune_linear_layer(self.key, index)
+ self.value = prune_linear_layer(self.value, index)
+ self.dense = prune_linear_layer(self.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.num_attention_heads = self.num_attention_heads - len(heads)
+ self.all_head_size = self.attention_head_size * self.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def _reshape(self, tensor: torch.Tensor, sequence_length: int, batch_size: int):
+ return (
+ tensor.view(batch_size, sequence_length, self.num_attention_heads, self.attention_head_size)
+ .transpose(1, 2)
+ .contiguous()
+ )
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ output_attentions: Optional[bool] = None,
+ ):
+ batch_size, sequence_length = hidden_states.shape[:2]
+ mixed_query_layer = self.query(hidden_states)
+
+ mixed_key_layer = self.key(hidden_states)
+ mixed_value_layer = self.value(hidden_states)
+
+ query_layer = self._reshape(mixed_query_layer, sequence_length, batch_size)
+ key_layer = self._reshape(mixed_key_layer, sequence_length, batch_size)
+ value_layer = self._reshape(mixed_value_layer, sequence_length, batch_size)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ if attention_mask is not None:
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.attn_dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ attn_output = torch.matmul(attention_probs, value_layer)
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(batch_size, sequence_length, self.all_head_size)
+
+ attn_output = self.dense(attn_output)
+ attn_output = self.dropout(attn_output)
+ attn_output = self.layer_norm(attn_output + hidden_states)
+ # add attentions if we output them
+ outputs = (attn_output, attention_probs) if output_attentions else (attn_output,)
+ return outputs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Tvp
+class TvpIntermediate(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class TvpOutputLayer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.layer_norm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class TvpEncodeLayer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.attention = TvpAttention(config)
+ self.intermediate = TvpIntermediate(config)
+ self.output = TvpOutputLayer(config)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ output_attentions: Optional[bool] = None,
+ ):
+ self_attention_outputs = self.attention(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ output_attentions=output_attentions,
+ )
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ outputs = (layer_output,) + outputs
+ return outputs
+
+
+class TvpEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList([TvpEncodeLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ):
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ all_hidden_states = ()
+ all_attentions = ()
+
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ layer_module.__call__,
+ hidden_states,
+ attention_mask,
+ (head_mask[i] if head_mask is not None else None),
+ output_attentions,
+ )
+ else:
+ layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], output_attentions)
+
+ hidden_states = layer_outputs[0]
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ # Add last layer
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ outputs = (hidden_states,)
+ if output_hidden_states:
+ outputs = outputs + (all_hidden_states,)
+ if output_attentions:
+ outputs = outputs + (all_attentions,)
+ return outputs # last-layer hidden state, (all hidden states), (all attentions)
+
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states if output_hidden_states else None,
+ attentions=all_attentions if output_attentions else None,
+ )
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Tvp
+class TvpPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+class TvpPreTrainedModel(PreTrainedModel):
+ """An abstract class to handle weights initialization and
+ a simple interface for downloading and loading pretrained models.
+ """
+
+ config_class = TvpConfig
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Embedding)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ if isinstance(module, nn.Conv2d):
+ nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+ if module.bias is not None:
+ nn.init.constant_(module.bias, 0)
+
+
+TVP_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`TvpConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TVP_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+ IDs?](../glossary#input-ids)
+
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`TvpImageProcessor`]. See [`TvpImageProcessor.__call__`]
+ for details.
+
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ [What are attention masks?](../glossary#attention-mask)
+
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+ interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained image pad prompter encodings and positional encodings.
+"""
+
+
+class TvpFrameDownPadPrompter(nn.Module):
+ """
+ Pad frames extracted from videos only at the bottom.
+ """
+
+ def __init__(self, config):
+ if config.visual_prompter_apply not in ("add", "replace", "remove"):
+ raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")
+
+ super().__init__()
+ self.visual_prompt_size = config.visual_prompt_size
+ self.frame_num = config.frame_num
+ self.max_img_size = config.max_img_size
+ self.visual_prompter_apply = config.visual_prompter_apply
+
+ self.pad_down = nn.Parameter(
+ torch.randn([1, config.frame_num, 3, config.visual_prompt_size, config.max_img_size])
+ )
+
+ def forward(self, pixel_values):
+ if self.visual_prompter_apply != "add":
+ visual_prompt_mask = torch.ones(
+ [self.max_img_size, self.max_img_size], dtype=pixel_values.dtype, device=pixel_values.device
+ )
+ visual_prompt_mask[self.max_img_size - self.visual_prompt_size : self.max_img_size, :] = 0.0
+ pixel_values *= visual_prompt_mask
+ if self.visual_prompter_apply != "remove":
+ prompt = torch.zeros(
+ [pixel_values.shape[0], pixel_values.shape[1], 3, self.max_img_size, self.max_img_size],
+ device=pixel_values.device,
+ )
+ start_point = self.max_img_size - self.visual_prompt_size
+ prompt[:, :, :, start_point : self.max_img_size, :] = self.pad_down
+ pixel_values += prompt.to(pixel_values.dtype)
+ return pixel_values
+
+
+class TvpFramePadPrompter(nn.Module):
+ """
+ Pad frames extracted from videos in the surroundings.
+ """
+
+ def __init__(self, config):
+ if config.visual_prompter_apply not in ("add", "replace", "remove"):
+ raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")
+
+ super().__init__()
+ self.num_frames = config.num_frames
+ self.max_img_size = config.max_img_size
+ self.visual_prompter_apply = config.visual_prompter_apply
+ self.base_size = config.max_img_size - config.visual_prompt_size * 2
+ self.pad_up = nn.Parameter(
+ torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
+ )
+ self.pad_down = nn.Parameter(
+ torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
+ )
+ self.pad_left = nn.Parameter(
+ torch.randn(
+ [
+ 1,
+ config.num_frames,
+ 3,
+ config.max_img_size - config.visual_prompt_size * 2,
+ config.visual_prompt_size,
+ ]
+ )
+ )
+ self.pad_right = nn.Parameter(
+ torch.randn(
+ [
+ 1,
+ config.num_frames,
+ 3,
+ config.max_img_size - config.visual_prompt_size * 2,
+ config.visual_prompt_size,
+ ]
+ )
+ )
+
+ def interpolate_pad_encoding(self, prompt: torch.Tensor, height: int, width: int) -> torch.Tensor:
+ """
+ This method allows to interpolate the pre-trained pad weights, to be able to use the model on collection of high
+ resolution images (high resolution videos).
+
+ """
+
+ # creates scale factor from height and width of original image wrt to the config.max_img_size
+ h0, w0 = height / self.max_img_size, width / self.max_img_size
+
+ batch, num_frames, channels, prompt_height, prompt_width = prompt.shape
+
+ # reshaping the batch and num_frames dimension into a single one (i.e (b,frames,c,h,w)-->(b*frames,c,h,w)), to apply bicubic interpolation
+ prompt = prompt.reshape(batch * num_frames, channels, prompt_height, prompt_width)
+ prompt = nn.functional.interpolate(
+ prompt,
+ scale_factor=(h0, w0),
+ mode="bicubic",
+ align_corners=False,
+ )
+ # reversing back to (batch,frames,channels,height,width), where height and width is the new interpolated height and width
+ prompt = prompt.reshape(batch, num_frames, channels, height, width)
+ return prompt
+
+ def forward(self, pixel_values, interpolate_pad_encoding: bool = False):
+ height, width = (
+ (pixel_values.shape[-2], pixel_values.shape[-1])
+ if interpolate_pad_encoding
+ else (self.max_img_size, self.max_img_size)
+ )
+ if self.visual_prompter_apply not in ("add", "remove", "replace"):
+ raise ValueError(f"Invalid visual_prompter_apply value {self.visual_prompter_apply}")
+ if self.visual_prompter_apply in ("replace", "remove"):
+ visual_prompt_mask = torch.ones([height, width], dtype=pixel_values.dtype, device=pixel_values.device)
+ pixel_values *= visual_prompt_mask
+ if self.visual_prompter_apply in ("replace", "add"):
+ base = torch.zeros(1, self.num_frames, 3, self.base_size, self.base_size, device=pixel_values.device)
+
+ prompt = torch.cat([self.pad_left, base, self.pad_right], dim=4)
+ prompt = torch.cat([self.pad_up, prompt, self.pad_down], dim=3)
+ prompt = torch.cat(pixel_values.size(0) * [prompt])
+ if interpolate_pad_encoding:
+ prompt = self.interpolate_pad_encoding(prompt, height, width)
+ pixel_values = pixel_values + prompt.to(pixel_values.dtype)
+ return pixel_values
+
+
+TVP_PROMPTER_CLASSES_MAPPING = {
+ "framedownpad": TvpFrameDownPadPrompter,
+ "framepad": TvpFramePadPrompter,
+}
+
+
+@add_start_docstrings(
+ "The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on" " top.",
+ TVP_START_DOCSTRING,
+)
+class TvpModel(TvpPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+ self.vision_model = TvpVisionModel(config)
+ self.embeddings = TvpTextInputEmbeddings(config)
+ self.visual_embeddings = TvpVisualInputEmbedding(config)
+ self.encoder = TvpEncoder(config)
+ self.pooler = TvpPooler(config)
+ self.text_prompt = nn.Parameter(torch.randn([1, 10, config.hidden_size]))
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ if config.visual_prompter_type not in TVP_PROMPTER_CLASSES_MAPPING:
+ raise ValueError("`visual_prompter_type` must be in (framedownpad, framepad)")
+ self.visual_prompter = TVP_PROMPTER_CLASSES_MAPPING[config.visual_prompter_type](config)
+
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embeddings.word_embeddings = value
+
+ def _prune_heads(self, heads_to_prune):
+ """Prunes heads of the model.
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(TVP_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=TvpConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ ):
+ r"""
+ Returns:
+
+ Examples:
+ ```python
+ >>> import torch
+ >>> from transformers import AutoConfig, AutoTokenizer, TvpModel
+
+ >>> model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp")
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")
+
+ >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
+ >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
+ >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+ # Add visual prompt, it compensates for the spatiotemporal information loss in 2D visual features.
+ pixel_values = self.vision_model(
+ self.visual_prompter(pixel_values, interpolate_pad_encoding=interpolate_pos_encoding)
+ )
+ # (batch_size, sequence_length, hidden_size)
+ text_embedding_output = self.embeddings(input_ids=input_ids)
+ # (batch_size, visual_sequence_length, hidden_size)
+ visual_embedding_output = self.visual_embeddings(
+ pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
+ )
+
+ if attention_mask is not None:
+ # (batch_size, visual_sequence_length)
+ visual_attention_mask = attention_mask.new_ones(visual_embedding_output.shape[:2])
+ pt_mask = torch.ones(attention_mask.shape[0], 10).to(
+ device=attention_mask.device, dtype=attention_mask.dtype
+ )
+ attention_mask = torch.cat([pt_mask, attention_mask, visual_attention_mask], dim=-1)
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ attention_mask = self.get_extended_attention_mask(attention_mask, input_ids.size()).to(input_ids.device)
+ text_prompt = self.text_prompt.expand(text_embedding_output.shape[0], -1, -1)
+ # (batch_size, sequence_length + visual_sequence_length, hidden_size)
+ embedding_output = torch.cat([text_prompt, text_embedding_output, visual_embedding_output], dim=1)
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask=attention_mask,
+ head_mask=self.get_head_mask(head_mask, self.config.num_hidden_layers),
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ last_hidden_state = encoder_outputs.last_hidden_state if return_dict else encoder_outputs[0]
+ pooled_output = self.pooler(last_hidden_state)
+ last_hidden_state = self.dropout(last_hidden_state)
+ pooled_output = self.dropout(pooled_output)
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+class TvpVideoGroundingHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.layer_0 = nn.Linear(config.hidden_size, config.hidden_size * 2)
+ self.layer_1 = nn.Linear(config.hidden_size * 2, 2)
+ self.activation_0 = nn.ReLU()
+ self.activation_1 = nn.Sigmoid()
+
+ def forward(self, pooler_output):
+ logits = self.activation_0(self.layer_0(pooler_output))
+ logits = self.activation_1(self.layer_1(logits))
+ return logits
+
+
+@add_start_docstrings(
+ """
+ Tvp Model with a video grounding head on top computing IoU, distance, and duration loss.
+ """,
+ TVP_START_DOCSTRING,
+)
+class TvpForVideoGrounding(TvpPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+ self.model = TvpModel(config)
+ self.video_grounding_head = TvpVideoGroundingHead(config)
+
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(TVP_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TvpVideoGroundingOutput, config_class=TvpConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.LongTensor] = None,
+ labels: Tuple[torch.Tensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ ):
+ r"""
+ labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
+ The labels contains duration, start time, and end time of the video corresponding to the text.
+ Returns:
+
+ Examples:
+ ```python
+ >>> import torch
+ >>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding
+
+ >>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")
+
+ >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
+ >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
+ >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+ outputs = self.model(
+ input_ids,
+ pixel_values,
+ attention_mask,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ )
+ pooler_output = outputs[1]
+ logits = self.video_grounding_head(pooler_output)
+
+ loss = None
+ if labels is not None:
+ criterion = TvpLoss(["iou", "distance", "duration"])
+ criterion.to(self.device)
+ loss_dict = criterion(logits, labels)
+ loss = (
+ loss_dict["iou"]
+ + self.config.distance_loss_weight * loss_dict["distance"]
+ + self.config.duration_loss_weight * loss_dict["duration"]
+ )
+ if not return_dict:
+ outputs = (logits,) + outputs[2:]
+ if loss is not None:
+ outputs = (loss,) + outputs
+ return outputs
+
+ return TvpVideoGroundingOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = ["TvpModel", "TvpPreTrainedModel", "TvpForVideoGrounding"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/tvp/processing_tvp.py b/.venv/lib/python3.11/site-packages/transformers/models/tvp/processing_tvp.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4ed81e54aaddbc0b19dab605ad526a6a41d07ce
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/tvp/processing_tvp.py
@@ -0,0 +1,156 @@
+# coding=utf-8
+# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License=, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing=, software
+# distributed under the License is distributed on an "AS IS" BASIS=,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for TVP.
+"""
+
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding
+
+
+class TvpProcessor(ProcessorMixin):
+ r"""
+ Constructs an TVP processor which wraps a TVP image processor and a Bert tokenizer into a single processor.
+
+ [`TvpProcessor`] offers all the functionalities of [`TvpImageProcessor`] and [`BertTokenizerFast`]. See the
+ [`~TvpProcessor.__call__`] and [`~TvpProcessor.decode`] for more information.
+
+ Args:
+ image_processor ([`TvpImageProcessor`], *optional*):
+ The image processor is a required input.
+ tokenizer ([`BertTokenizerFast`], *optional*):
+ The tokenizer is a required input.
+ """
+
+ attributes = ["image_processor", "tokenizer"]
+ image_processor_class = "TvpImageProcessor"
+ tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
+
+ def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+ if image_processor is None:
+ raise ValueError("You need to specify an `image_processor`.")
+ if tokenizer is None:
+ raise ValueError("You need to specify a `tokenizer`.")
+
+ super().__init__(image_processor, tokenizer)
+
+ def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
+ """
+ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+ and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
+ the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
+ TvpImageProcessor's [`~TvpImageProcessor.__call__`] if `videos` is not `None`. Please refer to the doctsring of
+ the above two methods for more information.
+
+ Args:
+ text (`str`, `List[str]`, `List[List[str]]`):
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+ videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarrray]]`,:
+ `List[List[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
+ of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
+ each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
+ channels.
+
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
+ If set, will return tensors of a particular framework. Acceptable values are:
+
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+ Returns:
+ [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+ `None`).
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `videos` is not `None`.
+ """
+
+ max_text_length = kwargs.pop("max_text_length", None)
+
+ if text is None and videos is None:
+ raise ValueError("You have to specify either text or videos. Both cannot be none.")
+
+ encoding = {}
+ if text is not None:
+ textual_input = self.tokenizer.batch_encode_plus(
+ text,
+ truncation=True,
+ padding="max_length",
+ max_length=max_text_length,
+ pad_to_max_length=True,
+ return_tensors=return_tensors,
+ return_token_type_ids=False,
+ **kwargs,
+ )
+ encoding.update(textual_input)
+
+ if videos is not None:
+ image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs)
+ encoding.update(image_features)
+
+ return BatchEncoding(data=encoding, tensor_type=return_tensors)
+
+ def batch_decode(self, *args, **kwargs):
+ """
+ This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+ refer to the docstring of this method for more information.
+ """
+ return self.tokenizer.batch_decode(*args, **kwargs)
+
+ def decode(self, *args, **kwargs):
+ """
+ This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+ the docstring of this method for more information.
+ """
+ return self.tokenizer.decode(*args, **kwargs)
+
+ def post_process_video_grounding(self, logits, video_durations):
+ """
+ Compute the time of the video.
+
+ Args:
+ logits (`torch.Tensor`):
+ The logits output of TvpForVideoGrounding.
+ video_durations (`float`):
+ The video's duration.
+
+ Returns:
+ start (`float`):
+ The start time of the video.
+ end (`float`):
+ The end time of the video.
+ """
+ start, end = (
+ round(logits.tolist()[0][0] * video_durations, 1),
+ round(logits.tolist()[0][1] * video_durations, 1),
+ )
+
+ return start, end
+
+ @property
+ # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
+ def model_input_names(self):
+ tokenizer_input_names = self.tokenizer.model_input_names
+ image_processor_input_names = self.image_processor.model_input_names
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+
+__all__ = ["TvpProcessor"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/videomae/__init__.py b/.venv/lib/python3.11/site-packages/transformers/models/videomae/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..13b7f6e1bf1cb09f446df5cd8aada2e0ee942cbc
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/videomae/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_videomae import *
+ from .feature_extraction_videomae import *
+ from .image_processing_videomae import *
+ from .modeling_videomae import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c6b107e8de9a1788b8455b3670a2208e0a0a860
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/configuration_videomae.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/configuration_videomae.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f631fc0802551111bf040a7acef745505944f90
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/configuration_videomae.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/feature_extraction_videomae.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/feature_extraction_videomae.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..18aa27d41114ec6d9281a1c93f445545f483da2a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/feature_extraction_videomae.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/image_processing_videomae.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/image_processing_videomae.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41396d2270b94d920cd24fbef725562163283075
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/image_processing_videomae.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/modeling_videomae.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/modeling_videomae.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0333b84ca48b6181f3e360f614eddbd6f26c44b2
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/videomae/__pycache__/modeling_videomae.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/videomae/configuration_videomae.py b/.venv/lib/python3.11/site-packages/transformers/models/videomae/configuration_videomae.py
new file mode 100644
index 0000000000000000000000000000000000000000..3940b6f010036cb3b3957f2fad94fed50737c85d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/videomae/configuration_videomae.py
@@ -0,0 +1,148 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VideoMAE model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class VideoMAEConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`VideoMAEModel`]. It is used to instantiate a
+ VideoMAE model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the VideoMAE
+ [MCG-NJU/videomae-base](https://huggingface.co/MCG-NJU/videomae-base) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 16):
+ The size (resolution) of each patch.
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ num_frames (`int`, *optional*, defaults to 16):
+ The number of frames in each video.
+ tubelet_size (`int`, *optional*, defaults to 2):
+ The number of tubelets.
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ qkv_bias (`bool`, *optional*, defaults to `True`):
+ Whether to add a bias to the queries, keys and values.
+ use_mean_pooling (`bool`, *optional*, defaults to `True`):
+ Whether to mean pool the final hidden states instead of using the final hidden state of the [CLS] token.
+ decoder_num_attention_heads (`int`, *optional*, defaults to 6):
+ Number of attention heads for each attention layer in the decoder.
+ decoder_hidden_size (`int`, *optional*, defaults to 384):
+ Dimensionality of the decoder.
+ decoder_num_hidden_layers (`int`, *optional*, defaults to 4):
+ Number of hidden layers in the decoder.
+ decoder_intermediate_size (`int`, *optional*, defaults to 1536):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder.
+ norm_pix_loss (`bool`, *optional*, defaults to `True`):
+ Whether to normalize the target patch pixels.
+
+ Example:
+
+ ```python
+ >>> from transformers import VideoMAEConfig, VideoMAEModel
+
+ >>> # Initializing a VideoMAE videomae-base style configuration
+ >>> configuration = VideoMAEConfig()
+
+ >>> # Randomly initializing a model from the configuration
+ >>> model = VideoMAEModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "videomae"
+
+ def __init__(
+ self,
+ image_size=224,
+ patch_size=16,
+ num_channels=3,
+ num_frames=16,
+ tubelet_size=2,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.0,
+ attention_probs_dropout_prob=0.0,
+ initializer_range=0.02,
+ layer_norm_eps=1e-12,
+ qkv_bias=True,
+ use_mean_pooling=True,
+ decoder_num_attention_heads=6,
+ decoder_hidden_size=384,
+ decoder_num_hidden_layers=4,
+ decoder_intermediate_size=1536,
+ norm_pix_loss=True,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.num_frames = num_frames
+ self.tubelet_size = tubelet_size
+
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.qkv_bias = qkv_bias
+ self.use_mean_pooling = use_mean_pooling
+
+ self.decoder_num_attention_heads = decoder_num_attention_heads
+ self.decoder_hidden_size = decoder_hidden_size
+ self.decoder_num_hidden_layers = decoder_num_hidden_layers
+ self.decoder_intermediate_size = decoder_intermediate_size
+ self.norm_pix_loss = norm_pix_loss
+
+
+__all__ = ["VideoMAEConfig"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/videomae/feature_extraction_videomae.py b/.venv/lib/python3.11/site-packages/transformers/models/videomae/feature_extraction_videomae.py
new file mode 100644
index 0000000000000000000000000000000000000000..469cbcf523bd46a2f7a6d1dd8a70d5e05fe566e6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/videomae/feature_extraction_videomae.py
@@ -0,0 +1,36 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for VideoMAE."""
+
+import warnings
+
+from ...utils import logging
+from .image_processing_videomae import VideoMAEImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+class VideoMAEFeatureExtractor(VideoMAEImageProcessor):
+ def __init__(self, *args, **kwargs) -> None:
+ warnings.warn(
+ "The class VideoMAEFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
+ " Please use VideoMAEImageProcessor instead.",
+ FutureWarning,
+ )
+ super().__init__(*args, **kwargs)
+
+
+__all__ = ["VideoMAEFeatureExtractor"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/videomae/image_processing_videomae.py b/.venv/lib/python3.11/site-packages/transformers/models/videomae/image_processing_videomae.py
new file mode 100644
index 0000000000000000000000000000000000000000..afba947bbdbfcedd39ea0ee3bb10205356dbffd5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/videomae/image_processing_videomae.py
@@ -0,0 +1,348 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for VideoMAE."""
+
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+ get_resize_output_image_size,
+ resize,
+ to_channel_dimension_format,
+)
+from ...image_utils import (
+ IMAGENET_STANDARD_MEAN,
+ IMAGENET_STANDARD_STD,
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ infer_channel_dimension_format,
+ is_scaled_image,
+ is_valid_image,
+ to_numpy_array,
+ valid_images,
+ validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+
+
+if is_vision_available():
+ import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+def make_batched(videos) -> List[List[ImageInput]]:
+ if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
+ return videos
+
+ elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
+ return [videos]
+
+ elif is_valid_image(videos):
+ return [[videos]]
+
+ raise ValueError(f"Could not make batched video from {videos}")
+
+
+class VideoMAEImageProcessor(BaseImageProcessor):
+ r"""
+ Constructs a VideoMAE image processor.
+
+ Args:
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+ `do_resize` parameter in the `preprocess` method.
+ size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
+ Size of the output image after resizing. The shortest edge of the image will be resized to
+ `size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overriden by
+ `size` in the `preprocess` method.
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+ Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
+ `preprocess` method.
+ do_center_crop (`bool`, *optional*, defaults to `True`):
+ Whether to center crop the image to the specified `crop_size`. Can be overridden by the `do_center_crop`
+ parameter in the `preprocess` method.
+ crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+ Size of the image after applying the center crop. Can be overridden by the `crop_size` parameter in the
+ `preprocess` method.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+ parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter
+ in the `preprocess` method.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+ method.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ """
+
+ model_input_names = ["pixel_values"]
+
+ def __init__(
+ self,
+ do_resize: bool = True,
+ size: Dict[str, int] = None,
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
+ do_center_crop: bool = True,
+ crop_size: Dict[str, int] = None,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = True,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+ size = size if size is not None else {"shortest_edge": 224}
+ size = get_size_dict(size, default_to_square=False)
+ crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+ crop_size = get_size_dict(crop_size, param_name="crop_size")
+
+ self.do_resize = do_resize
+ self.size = size
+ self.do_center_crop = do_center_crop
+ self.crop_size = crop_size
+ self.resample = resample
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_normalize = do_normalize
+ self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+ self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+
+ def resize(
+ self,
+ image: np.ndarray,
+ size: Dict[str, int],
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
+ data_format: Optional[Union[str, ChannelDimension]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Resize an image.
+
+ Args:
+ image (`np.ndarray`):
+ Image to resize.
+ size (`Dict[str, int]`):
+ Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
+ have the size `(h, w)`. If `size` is of the form `{"shortest_edge": s}`, the output image will have its
+ shortest edge of length `s` while keeping the aspect ratio of the original image.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+ Resampling filter to use when resiizing the image.
+ data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format of the image. If not provided, it will be the same as the input image.
+ input_data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred.
+ """
+ size = get_size_dict(size, default_to_square=False)
+ if "shortest_edge" in size:
+ output_size = get_resize_output_image_size(
+ image, size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
+ )
+ elif "height" in size and "width" in size:
+ output_size = (size["height"], size["width"])
+ else:
+ raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")
+ return resize(
+ image,
+ size=output_size,
+ resample=resample,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ **kwargs,
+ )
+
+ def _preprocess_image(
+ self,
+ image: ImageInput,
+ do_resize: bool = None,
+ size: Dict[str, int] = None,
+ resample: PILImageResampling = None,
+ do_center_crop: bool = None,
+ crop_size: Dict[str, int] = None,
+ do_rescale: bool = None,
+ rescale_factor: float = None,
+ do_normalize: bool = None,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> np.ndarray:
+ """Preprocesses a single image."""
+ validate_preprocess_arguments(
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_center_crop=do_center_crop,
+ crop_size=crop_size,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ )
+
+ # All transformations expect numpy arrays.
+ image = to_numpy_array(image)
+
+ if do_rescale and is_scaled_image(image):
+ logger.warning_once(
+ "It looks like you are trying to rescale already rescaled images. If the input"
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+ )
+
+ if input_data_format is None:
+ input_data_format = infer_channel_dimension_format(image)
+
+ if do_resize:
+ image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+
+ if do_center_crop:
+ image = self.center_crop(image, size=crop_size, input_data_format=input_data_format)
+
+ if do_rescale:
+ image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+ if do_normalize:
+ image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+ return image
+
+ @filter_out_non_signature_kwargs()
+ def preprocess(
+ self,
+ videos: ImageInput,
+ do_resize: bool = None,
+ size: Dict[str, int] = None,
+ resample: PILImageResampling = None,
+ do_center_crop: bool = None,
+ crop_size: Dict[str, int] = None,
+ do_rescale: bool = None,
+ rescale_factor: float = None,
+ do_normalize: bool = None,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ data_format: ChannelDimension = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> PIL.Image.Image:
+ """
+ Preprocess an image or batch of images.
+
+ Args:
+ images (`ImageInput`):
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+ Whether to resize the image.
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+ Size of the image after applying resize.
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
+ has an effect if `do_resize` is set to `True`.
+ do_center_crop (`bool`, *optional*, defaults to `self.do_centre_crop`):
+ Whether to centre crop the image.
+ crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
+ Size of the image after applying the centre crop.
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+ Whether to rescale the image values between [0 - 1].
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+ Whether to normalize the image.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+ Image mean.
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+ Image standard deviation.
+ return_tensors (`str` or `TensorType`, *optional*):
+ The type of tensors to return. Can be one of:
+ - Unset: Return a list of `np.ndarray`.
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the inferred channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ """
+ do_resize = do_resize if do_resize is not None else self.do_resize
+ resample = resample if resample is not None else self.resample
+ do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+ image_mean = image_mean if image_mean is not None else self.image_mean
+ image_std = image_std if image_std is not None else self.image_std
+
+ size = size if size is not None else self.size
+ size = get_size_dict(size, default_to_square=False)
+ crop_size = crop_size if crop_size is not None else self.crop_size
+ crop_size = get_size_dict(crop_size, param_name="crop_size")
+
+ if not valid_images(videos):
+ raise ValueError(
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+ "torch.Tensor, tf.Tensor or jax.ndarray."
+ )
+
+ videos = make_batched(videos)
+
+ videos = [
+ [
+ self._preprocess_image(
+ image=img,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ do_center_crop=do_center_crop,
+ crop_size=crop_size,
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ )
+ for img in video
+ ]
+ for video in videos
+ ]
+
+ data = {"pixel_values": videos}
+ return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["VideoMAEImageProcessor"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/videomae/modeling_videomae.py b/.venv/lib/python3.11/site-packages/transformers/models/videomae/modeling_videomae.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e65ebf06d9c5051d6ae93544799cf8d80256277
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/videomae/modeling_videomae.py
@@ -0,0 +1,1138 @@
+# coding=utf-8
+# Copyright 2022 Multimedia Computing Group, Nanjing University and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch VideoMAE (masked autoencoder) model."""
+
+import collections.abc
+import math
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Optional, Set, Tuple, Union
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from ...utils.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .configuration_videomae import VideoMAEConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "VideoMAEConfig"
+_CHECKPOINT_FOR_DOC = "MCG-NJU/videomae-base"
+
+
+@dataclass
+class VideoMAEDecoderOutput(ModelOutput):
+ """
+ Class for VideoMAEDecoder's outputs, with potential hidden states and attentions.
+
+ Args:
+ logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
+ Pixel reconstruction logits.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ the self-attention heads.
+ """
+
+ logits: torch.FloatTensor = None
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class VideoMAEForPreTrainingOutput(ModelOutput):
+ """
+ Class for VideoMAEForPreTraining's outputs, with potential hidden states and attentions.
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`):
+ Pixel reconstruction loss.
+ logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
+ Pixel reconstruction logits.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ the self-attention heads.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+# sin-cos position encoding
+# https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31
+def get_sinusoid_encoding_table(n_position, d_hid):
+ """Sinusoid position encoding table"""
+
+ # TODO: make it with torch instead of numpy
+ def get_position_angle_vec(position):
+ return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
+
+ sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
+
+ return torch.FloatTensor(sinusoid_table).unsqueeze(0)
+
+
+class VideoMAEEmbeddings(nn.Module):
+ """
+ Construct the patch and position embeddings.
+
+ """
+
+ def __init__(self, config):
+ super().__init__()
+
+ self.patch_embeddings = VideoMAEPatchEmbeddings(config)
+ self.num_patches = self.patch_embeddings.num_patches
+ # fixed sin-cos embedding
+ self.position_embeddings = get_sinusoid_encoding_table(self.num_patches, config.hidden_size)
+ self.config = config
+
+ def forward(self, pixel_values, bool_masked_pos):
+ # create patch embeddings
+ embeddings = self.patch_embeddings(pixel_values)
+
+ # add position embeddings
+ embeddings = embeddings + self.position_embeddings.type_as(embeddings).to(embeddings.device).clone().detach()
+ # only keep visible patches
+ # ~bool_masked_pos means visible
+ if bool_masked_pos is not None:
+ batch_size, _, num_channels = embeddings.shape
+ embeddings = embeddings[~bool_masked_pos]
+ embeddings = embeddings.reshape(batch_size, -1, num_channels)
+
+ return embeddings
+
+
+class VideoMAEPatchEmbeddings(nn.Module):
+ """
+ Video to Patch Embedding. This module turns a batch of videos of shape (batch_size, num_frames, num_channels,
+ height, width) into a tensor of shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.
+
+ The seq_len (the number of patches) equals (number of frames // tubelet_size) * (height // patch_size) * (width //
+ patch_size).
+
+ """
+
+ def __init__(self, config):
+ super().__init__()
+
+ image_size = config.image_size
+ patch_size = config.patch_size
+ num_channels = config.num_channels
+ hidden_size = config.hidden_size
+ num_frames = config.num_frames
+ tubelet_size = config.tubelet_size
+
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.tubelet_size = int(tubelet_size)
+ num_patches = (
+ (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) * (num_frames // self.tubelet_size)
+ )
+ self.num_channels = num_channels
+ self.num_patches = num_patches
+ self.projection = nn.Conv3d(
+ in_channels=num_channels,
+ out_channels=hidden_size,
+ kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]),
+ stride=(self.tubelet_size, patch_size[0], patch_size[1]),
+ )
+
+ def forward(self, pixel_values):
+ batch_size, num_frames, num_channels, height, width = pixel_values.shape
+ if num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+ )
+ if height != self.image_size[0] or width != self.image_size[1]:
+ raise ValueError(
+ f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
+ )
+ # permute to (batch_size, num_channels, num_frames, height, width)
+ pixel_values = pixel_values.permute(0, 2, 1, 3, 4)
+ embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
+ return embeddings
+
+
+class VideoMAESelfAttention(nn.Module):
+ def __init__(self, config: VideoMAEConfig) -> None:
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+ f"heads {config.num_attention_heads}."
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
+
+ if config.qkv_bias:
+ self.q_bias = nn.Parameter(torch.zeros(self.all_head_size))
+ self.v_bias = nn.Parameter(torch.zeros(self.all_head_size))
+ else:
+ self.q_bias = None
+ self.v_bias = None
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
+ k_bias = torch.zeros_like(self.v_bias, requires_grad=False) if self.q_bias is not None else None
+ keys = nn.functional.linear(input=hidden_states, weight=self.key.weight, bias=k_bias)
+ values = nn.functional.linear(input=hidden_states, weight=self.value.weight, bias=self.v_bias)
+ queries = nn.functional.linear(input=hidden_states, weight=self.query.weight, bias=self.q_bias)
+
+ key_layer = self.transpose_for_scores(keys)
+ value_layer = self.transpose_for_scores(values)
+ query_layer = self.transpose_for_scores(queries)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ return outputs
+
+
+class VideoMAESdpaSelfAttention(VideoMAESelfAttention):
+ def __init__(self, config: VideoMAEConfig) -> None:
+ super().__init__(config)
+ self.attention_probs_dropout_prob = config.attention_probs_dropout_prob
+
+ def forward(
+ self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
+ k_bias = torch.zeros_like(self.v_bias, requires_grad=False) if self.q_bias is not None else None
+ keys = nn.functional.linear(input=hidden_states, weight=self.key.weight, bias=k_bias)
+ values = nn.functional.linear(input=hidden_states, weight=self.value.weight, bias=self.v_bias)
+ queries = nn.functional.linear(input=hidden_states, weight=self.query.weight, bias=self.q_bias)
+
+ key_layer = self.transpose_for_scores(keys)
+ value_layer = self.transpose_for_scores(values)
+ query_layer = self.transpose_for_scores(queries)
+
+ context_layer = torch.nn.functional.scaled_dot_product_attention(
+ query_layer,
+ key_layer,
+ value_layer,
+ head_mask,
+ self.attention_probs_dropout_prob if self.training else 0.0,
+ is_causal=False,
+ scale=None,
+ )
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(new_context_layer_shape)
+
+ return context_layer, None
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->VideoMAE
+class VideoMAESelfOutput(nn.Module):
+ """
+ The residual connection is defined in VideoMAELayer instead of here (as is the case with other models), due to the
+ layernorm applied before each block.
+ """
+
+ def __init__(self, config: VideoMAEConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ return hidden_states
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->VideoMAE
+class VideoMAEAttention(nn.Module):
+ def __init__(self, config: VideoMAEConfig) -> None:
+ super().__init__()
+ self.attention = VideoMAESelfAttention(config)
+ self.output = VideoMAESelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads: Set[int]) -> None:
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.attention.query = prune_linear_layer(self.attention.query, index)
+ self.attention.key = prune_linear_layer(self.attention.key, index)
+ self.attention.value = prune_linear_layer(self.attention.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+ self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
+ self_outputs = self.attention(hidden_states, head_mask, output_attentions)
+
+ attention_output = self.output(self_outputs[0], hidden_states)
+
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->VideoMAE
+class VideoMAESdpaAttention(VideoMAEAttention):
+ def __init__(self, config: VideoMAEConfig) -> None:
+ super().__init__(config)
+ self.attention = VideoMAESdpaSelfAttention(config)
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTIntermediate ViT->VideoMAE
+class VideoMAEIntermediate(nn.Module):
+ def __init__(self, config: VideoMAEConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+
+ return hidden_states
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTOutput ViT->VideoMAE
+class VideoMAEOutput(nn.Module):
+ def __init__(self, config: VideoMAEConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ hidden_states = hidden_states + input_tensor
+
+ return hidden_states
+
+
+VIDEOMAE_ATTENTION_CLASSES = {"eager": VideoMAEAttention, "sdpa": VideoMAESdpaAttention}
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->VideoMAE,VIT->VIDEOMAE
+class VideoMAELayer(nn.Module):
+ """This corresponds to the Block class in the timm implementation."""
+
+ def __init__(self, config: VideoMAEConfig) -> None:
+ super().__init__()
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.seq_len_dim = 1
+ self.attention = VIDEOMAE_ATTENTION_CLASSES[config._attn_implementation](config)
+ self.intermediate = VideoMAEIntermediate(config)
+ self.output = VideoMAEOutput(config)
+ self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
+ self_attention_outputs = self.attention(
+ self.layernorm_before(hidden_states), # in VideoMAE, layernorm is applied before self-attention
+ head_mask,
+ output_attentions=output_attentions,
+ )
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ # first residual connection
+ hidden_states = attention_output + hidden_states
+
+ # in VideoMAE, layernorm is also applied after self-attention
+ layer_output = self.layernorm_after(hidden_states)
+ layer_output = self.intermediate(layer_output)
+
+ # second residual connection is done here
+ layer_output = self.output(layer_output, hidden_states)
+
+ outputs = (layer_output,) + outputs
+
+ return outputs
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->VideoMAE
+class VideoMAEEncoder(nn.Module):
+ def __init__(self, config: VideoMAEConfig) -> None:
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList([VideoMAELayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ output_hidden_states: bool = False,
+ return_dict: bool = True,
+ ) -> Union[tuple, BaseModelOutput]:
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ layer_module.__call__,
+ hidden_states,
+ layer_head_mask,
+ output_attentions,
+ )
+ else:
+ layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+class VideoMAEPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = VideoMAEConfig
+ base_model_prefix = "videomae"
+ main_input_name = "pixel_values"
+ supports_gradient_checkpointing = True
+ _supports_sdpa = True
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Conv3d)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+VIDEOMAE_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`VideoMAEConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+VIDEOMAE_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`VideoMAEImageProcessor.__call__`] for details.
+
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare VideoMAE Model transformer outputting raw hidden-states without any specific head on top.",
+ VIDEOMAE_START_DOCSTRING,
+)
+class VideoMAEModel(VideoMAEPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = VideoMAEEmbeddings(config)
+ self.encoder = VideoMAEEncoder(config)
+
+ if config.use_mean_pooling:
+ self.layernorm = None
+ else:
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.patch_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
+ batch must have the same number of masked patches. If `None`, then all patches are considered. Sequence
+ length is `(num_frames // tubelet_size) * (image_size // patch_size) ** 2`.
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> import av
+ >>> import numpy as np
+
+ >>> from transformers import AutoImageProcessor, VideoMAEModel
+ >>> from huggingface_hub import hf_hub_download
+
+ >>> np.random.seed(0)
+
+
+ >>> def read_video_pyav(container, indices):
+ ... '''
+ ... Decode the video with PyAV decoder.
+ ... Args:
+ ... container (`av.container.input.InputContainer`): PyAV container.
+ ... indices (`List[int]`): List of frame indices to decode.
+ ... Returns:
+ ... result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+ ... '''
+ ... frames = []
+ ... container.seek(0)
+ ... start_index = indices[0]
+ ... end_index = indices[-1]
+ ... for i, frame in enumerate(container.decode(video=0)):
+ ... if i > end_index:
+ ... break
+ ... if i >= start_index and i in indices:
+ ... frames.append(frame)
+ ... return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
+ >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+ ... '''
+ ... Sample a given number of frame indices from the video.
+ ... Args:
+ ... clip_len (`int`): Total number of frames to sample.
+ ... frame_sample_rate (`int`): Sample every n-th frame.
+ ... seg_len (`int`): Maximum allowed index of sample's last frame.
+ ... Returns:
+ ... indices (`List[int]`): List of sampled frame indices
+ ... '''
+ ... converted_len = int(clip_len * frame_sample_rate)
+ ... end_idx = np.random.randint(converted_len, seg_len)
+ ... start_idx = end_idx - converted_len
+ ... indices = np.linspace(start_idx, end_idx, num=clip_len)
+ ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+ ... return indices
+
+
+ >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
+ >>> file_path = hf_hub_download(
+ ... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+ ... )
+ >>> container = av.open(file_path)
+
+ >>> # sample 16 frames
+ >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
+ >>> video = read_video_pyav(container, indices)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
+ >>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")
+
+ >>> # prepare video for the model
+ >>> inputs = image_processor(list(video), return_tensors="pt")
+
+ >>> # forward pass
+ >>> outputs = model(**inputs)
+ >>> last_hidden_states = outputs.last_hidden_state
+ >>> list(last_hidden_states.shape)
+ [1, 1568, 768]
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ embedding_output = self.embeddings(pixel_values, bool_masked_pos)
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = encoder_outputs[0]
+ if self.layernorm is not None:
+ sequence_output = self.layernorm(sequence_output)
+
+ if not return_dict:
+ return (sequence_output,) + encoder_outputs[1:]
+
+ return BaseModelOutput(
+ last_hidden_state=sequence_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+class VideoMAEDecoder(nn.Module):
+ def __init__(self, config, num_patches):
+ super().__init__()
+
+ decoder_num_labels = config.num_channels * config.tubelet_size * config.patch_size**2
+
+ decoder_config = deepcopy(config)
+ decoder_config.hidden_size = config.decoder_hidden_size
+ decoder_config.num_hidden_layers = config.decoder_num_hidden_layers
+ decoder_config.num_attention_heads = config.decoder_num_attention_heads
+ decoder_config.intermediate_size = config.decoder_intermediate_size
+ self.decoder_layers = nn.ModuleList(
+ [VideoMAELayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
+ )
+
+ self.norm = nn.LayerNorm(config.decoder_hidden_size)
+ self.head = (
+ nn.Linear(config.decoder_hidden_size, decoder_num_labels) if decoder_num_labels > 0 else nn.Identity()
+ )
+
+ self.gradient_checkpointing = False
+ self.config = config
+
+ def forward(
+ self,
+ hidden_states,
+ return_token_num,
+ output_attentions=False,
+ output_hidden_states=False,
+ return_dict=True,
+ ):
+ # apply Transformer layers (blocks)
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+ for i, layer_module in enumerate(self.decoder_layers):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ layer_module.__call__,
+ hidden_states,
+ None,
+ output_attentions,
+ )
+ else:
+ layer_outputs = layer_module(hidden_states, head_mask=None, output_attentions=output_attentions)
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if return_token_num > 0:
+ hidden_states = hidden_states[:, -return_token_num:]
+
+ # predictor projection
+ hidden_states = self.norm(hidden_states)
+ logits = self.head(hidden_states)
+
+ if not return_dict:
+ return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
+ return VideoMAEDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions)
+
+
+@add_start_docstrings(
+ "The VideoMAE Model transformer with the decoder on top for self-supervised pre-training.",
+ VIDEOMAE_START_DOCSTRING,
+)
+class VideoMAEForPreTraining(VideoMAEPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+
+ self.videomae = VideoMAEModel(config)
+
+ self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=False)
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
+ self.position_embeddings = get_sinusoid_encoding_table(
+ self.videomae.embeddings.num_patches, config.decoder_hidden_size
+ )
+
+ self.decoder = VideoMAEDecoder(config, num_patches=self.videomae.embeddings.num_patches)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=VideoMAEForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ bool_masked_pos: torch.BoolTensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, VideoMAEForPreTrainingOutput]:
+ r"""
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
+ batch must have the same number of masked patches. Sequence length is `(num_frames // tubelet_size) *
+ (image_size // patch_size) ** 2`.
+
+ Returns:
+
+ Examples:
+ ```python
+ >>> from transformers import AutoImageProcessor, VideoMAEForPreTraining
+ >>> import numpy as np
+ >>> import torch
+
+ >>> num_frames = 16
+ >>> video = list(np.random.randint(0, 256, (num_frames, 3, 224, 224)))
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
+ >>> model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base")
+
+ >>> pixel_values = image_processor(video, return_tensors="pt").pixel_values
+
+ >>> num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
+ >>> seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame
+ >>> bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()
+
+ >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
+ >>> loss = outputs.loss
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.videomae(
+ pixel_values,
+ bool_masked_pos=bool_masked_pos,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+ sequence_output = self.encoder_to_decoder(
+ sequence_output
+ ) # [batch_size, num_visible_patches, decoder_hidden_size]
+ batch_size, seq_len, num_channels = sequence_output.shape
+
+ # we don't unshuffle the correct visible token order, but shuffle the position embeddings accordingly.
+ if bool_masked_pos is None:
+ raise ValueError("One must provided a boolean mask ")
+ expanded_position_embeddings = self.position_embeddings.expand(batch_size, -1, -1).type_as(pixel_values)
+ expanded_position_embeddings = expanded_position_embeddings.to(pixel_values.device).clone().detach()
+ pos_emb_visible = expanded_position_embeddings[~bool_masked_pos].reshape(batch_size, -1, num_channels)
+ pos_emb_mask = expanded_position_embeddings[bool_masked_pos].reshape(batch_size, -1, num_channels)
+
+ # [batch_size, num_patches, decoder_hidden_size]
+ x_full = torch.cat([sequence_output + pos_emb_visible, self.mask_token + pos_emb_mask], dim=1)
+
+ # [batch_size, num_masked_patches, num_channels * patch_size * patch_size]
+ decoder_outputs = self.decoder(x_full, pos_emb_mask.shape[1])
+ logits = decoder_outputs.logits
+
+ loss = None
+ with torch.no_grad():
+ # calculate the labels to be predicted
+ if self.config.num_channels != 3:
+ # Can't unnormalize with default means/stds
+ frames = pixel_values
+ else:
+ # first, unnormalize the frames
+ device = pixel_values.device
+ dtype = pixel_values.dtype
+ mean = torch.as_tensor(IMAGENET_DEFAULT_MEAN).to(device=device, dtype=dtype)[None, None, :, None, None]
+ std = torch.as_tensor(IMAGENET_DEFAULT_STD).to(device=device, dtype=dtype)[None, None, :, None, None]
+ frames = pixel_values * std + mean # in [0, 1]
+
+ batch_size, time, num_channels, height, width = frames.shape
+ tubelet_size, patch_size = self.config.tubelet_size, self.config.patch_size
+ if self.config.norm_pix_loss:
+ # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size)
+ frames = frames.view(
+ batch_size,
+ time // tubelet_size,
+ tubelet_size,
+ num_channels,
+ height // patch_size,
+ patch_size,
+ width // patch_size,
+ patch_size,
+ )
+ # step 2: move dimensions to concatenate:
+ frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous()
+ # step 3: concatenate:
+ frames = frames.view(
+ batch_size,
+ time // tubelet_size * height // patch_size * width // patch_size,
+ tubelet_size * patch_size * patch_size,
+ num_channels,
+ )
+ # step 4: normalize. The authors find that the mean is about 0.48 and standard deviation is about 0.08.
+ frames_norm = (frames - frames.mean(dim=-2, keepdim=True)) / (
+ frames.var(dim=-2, unbiased=True, keepdim=True).sqrt() + 1e-6
+ )
+ # step 5: reshape to (batch_size, T//ts * H//ps * W//ps, ts * ps * ps * C)
+ videos_patch = frames_norm.view(
+ batch_size,
+ time // tubelet_size * height // patch_size * width // patch_size,
+ tubelet_size * patch_size * patch_size * num_channels,
+ )
+ else:
+ if self.config.num_channels != 3:
+ raise ValueError(
+ "Can't unnormalize non-RGB images. Consider setting config.norm_pix_loss to False."
+ )
+ # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size)
+ frames = frames.view(
+ batch_size,
+ time // tubelet_size,
+ tubelet_size,
+ num_channels,
+ height // patch_size,
+ patch_size,
+ width // patch_size,
+ patch_size,
+ )
+ # step 2: move dimensions to concatenate: (batch_size, T//ts, H//ps, W//ps, ts, ps, ps, C)
+ frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous()
+ # step 3: concatenate
+ videos_patch = frames.view(
+ batch_size,
+ time // tubelet_size * height // patch_size * width // patch_size,
+ tubelet_size * patch_size * patch_size * num_channels,
+ )
+
+ batch_size, _, num_channels = videos_patch.shape
+ labels = videos_patch[bool_masked_pos].reshape(batch_size, -1, num_channels)
+
+ loss_fct = MSELoss()
+ loss = loss_fct(logits, labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return VideoMAEForPreTrainingOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """VideoMAE Model transformer with a video classification head on top (a linear layer on top of the average pooled hidden
+ states of all tokens) e.g. for ImageNet.""",
+ VIDEOMAE_START_DOCSTRING,
+)
+class VideoMAEForVideoClassification(VideoMAEPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.videomae = VideoMAEModel(config)
+
+ # Classifier head
+ self.fc_norm = nn.LayerNorm(config.hidden_size) if config.use_mean_pooling else None
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, ImageClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> import av
+ >>> import torch
+ >>> import numpy as np
+
+ >>> from transformers import AutoImageProcessor, VideoMAEForVideoClassification
+ >>> from huggingface_hub import hf_hub_download
+
+ >>> np.random.seed(0)
+
+
+ >>> def read_video_pyav(container, indices):
+ ... '''
+ ... Decode the video with PyAV decoder.
+ ... Args:
+ ... container (`av.container.input.InputContainer`): PyAV container.
+ ... indices (`List[int]`): List of frame indices to decode.
+ ... Returns:
+ ... result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+ ... '''
+ ... frames = []
+ ... container.seek(0)
+ ... start_index = indices[0]
+ ... end_index = indices[-1]
+ ... for i, frame in enumerate(container.decode(video=0)):
+ ... if i > end_index:
+ ... break
+ ... if i >= start_index and i in indices:
+ ... frames.append(frame)
+ ... return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
+ >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+ ... '''
+ ... Sample a given number of frame indices from the video.
+ ... Args:
+ ... clip_len (`int`): Total number of frames to sample.
+ ... frame_sample_rate (`int`): Sample every n-th frame.
+ ... seg_len (`int`): Maximum allowed index of sample's last frame.
+ ... Returns:
+ ... indices (`List[int]`): List of sampled frame indices
+ ... '''
+ ... converted_len = int(clip_len * frame_sample_rate)
+ ... end_idx = np.random.randint(converted_len, seg_len)
+ ... start_idx = end_idx - converted_len
+ ... indices = np.linspace(start_idx, end_idx, num=clip_len)
+ ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+ ... return indices
+
+
+ >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
+ >>> file_path = hf_hub_download(
+ ... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+ ... )
+ >>> container = av.open(file_path)
+
+ >>> # sample 16 frames
+ >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
+ >>> video = read_video_pyav(container, indices)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
+ >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
+
+ >>> inputs = image_processor(list(video), return_tensors="pt")
+
+ >>> with torch.no_grad():
+ ... outputs = model(**inputs)
+ ... logits = outputs.logits
+
+ >>> # model predicts one of the 400 Kinetics-400 classes
+ >>> predicted_label = logits.argmax(-1).item()
+ >>> print(model.config.id2label[predicted_label])
+ eating spaghetti
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.videomae(
+ pixel_values,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ if self.fc_norm is not None:
+ sequence_output = self.fc_norm(sequence_output.mean(1))
+ else:
+ sequence_output = sequence_output[:, 0]
+
+ logits = self.classifier(sequence_output)
+
+ loss = None
+ if labels is not None:
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return ImageClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = ["VideoMAEForPreTraining", "VideoMAEModel", "VideoMAEPreTrainedModel", "VideoMAEForVideoClassification"]
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/wav2vec2/__pycache__/modeling_tf_wav2vec2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/wav2vec2/__pycache__/modeling_tf_wav2vec2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6bf1553c4a5a784afd3ca980ba2d1f34243a92bd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/wav2vec2/__pycache__/modeling_tf_wav2vec2.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3c065ec4bb7ffffa003bf7e91e2250641bd9566c2d426b281fba5e8af3e307c
+size 103914
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/wav2vec2_conformer/__pycache__/modeling_wav2vec2_conformer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/wav2vec2_conformer/__pycache__/modeling_wav2vec2_conformer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e67461d397733b558418466f2d4daa7691a94f40
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/wav2vec2_conformer/__pycache__/modeling_wav2vec2_conformer.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:124cfe6ae5b8c035d195fc081742b551fcd17a5145bbbb4630cd7b040deae38f
+size 109498
diff --git a/.venv/lib/python3.11/site-packages/transformers/models/whisper/__pycache__/modeling_whisper.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/whisper/__pycache__/modeling_whisper.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..980e76b302eb0a816d4887e897b5188a718f919b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/models/whisper/__pycache__/modeling_whisper.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be4b8ac77dd330db2b15982706580b44340a5bfd52509977f2e8a8a5106c8cf2
+size 108643