diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3e428d29048a9b9f4c120964b713b193c1590ebd
--- /dev/null
+++ b/.dev_scripts/dockerci.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+MODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache
+CODE_DIR=$PWD
+CODE_DIR_IN_CONTAINER=/ms-swift
+echo "$USER"
+gpus='0,1 2,3'
+cpu_sets='0-15 16-31'
+cpu_sets_arr=($cpu_sets)
+is_get_file_lock=false
+CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml}
+echo "ci command: $CI_COMMAND"
+PR_CHANGED_FILES="${PR_CHANGED_FILES:-}"
+echo "PR modified files: $PR_CHANGED_FILES"
+PR_CHANGED_FILES=${PR_CHANGED_FILES//[ ]/#}
+echo "PR_CHANGED_FILES: $PR_CHANGED_FILES"
+idx=0
+for gpu in $gpus
+do
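+ # take an exclusive lock for this GPU set so concurrent CI runs don't share GPUs; skip it if it is busy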
+ exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
+ flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; idx=$((idx+1)); continue; }
+ echo "get gpu lock $gpu"
+
+ CONTAINER_NAME="swift-ci-$idx"
+ is_get_file_lock=true
+
+ # pull the image in case there are updates
+ docker pull ${IMAGE_NAME}:${IMAGE_VERSION}
+ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
+ echo 'debugging'
+ docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
+ --cpuset-cpus=${cpu_sets_arr[$idx]} \
+ --gpus='"'"device=$gpu"'"' \
+ -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \
+ -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
+ -v $MODELSCOPE_HOME_CACHE/$idx:/root \
+ -v /home/admin/pre-commit:/home/admin/pre-commit \
+ -e CI_TEST=True \
+ -e TEST_LEVEL=$TEST_LEVEL \
+ -e MODELSCOPE_CACHE=$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
+ -e MODELSCOPE_DOMAIN=$MODELSCOPE_DOMAIN \
+ -e MODELSCOPE_SDK_DEBUG=True \
+ -e HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT \
+ -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \
+ -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \
+ -e MODELSCOPE_ENVIRONMENT='ci' \
+ -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
+ -e MODEL_TAG_URL=$MODEL_TAG_URL \
+ -e MODELSCOPE_API_TOKEN=$MODELSCOPE_API_TOKEN \
+ -e PR_CHANGED_FILES=$PR_CHANGED_FILES \
+ --workdir=$CODE_DIR_IN_CONTAINER \
+ ${IMAGE_NAME}:${IMAGE_VERSION} \
+ $CI_COMMAND
+ else
+ docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
+ --cpuset-cpus=${cpu_sets_arr[$idx]} \
+ --gpus='"'"device=$gpu"'"' \
+ -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \
+ -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
+ -v $MODELSCOPE_HOME_CACHE/$idx:/root \
+ -v /home/admin/pre-commit:/home/admin/pre-commit \
+ -e CI_TEST=True \
+ -e TEST_LEVEL=$TEST_LEVEL \
+ -e MODELSCOPE_CACHE=$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
+ -e MODELSCOPE_DOMAIN=$MODELSCOPE_DOMAIN \
+ -e HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT \
+ -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \
+ -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \
+ -e MODELSCOPE_ENVIRONMENT='ci' \
+ -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
+ -e MODEL_TAG_URL=$MODEL_TAG_URL \
+ -e MODELSCOPE_API_TOKEN=$MODELSCOPE_API_TOKEN \
+ -e PR_CHANGED_FILES=$PR_CHANGED_FILES \
+ --workdir=$CODE_DIR_IN_CONTAINER \
+ ${IMAGE_NAME}:${IMAGE_VERSION} \
+ $CI_COMMAND
+ fi
+ if [ $? -ne 0 ]; then
+ echo "Running test case failed, please check the log!"
+ exit -1
+ fi
+ break
+done
+if [ "$is_get_file_lock" = false ] ; then
+ echo 'No free GPU!'
+ exit 1
+fi
diff --git a/.gitattributes b/.gitattributes
index 375c7e662659a9b787545e4aeb0ec894bba5e3ed..122c8ed7d7d0eb7ba2a1a320541a373d5f4a8475 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -55,3 +55,4 @@ docs/resources/web-ui-en.jpg filter=lfs diff=lfs merge=lfs -text
docs/resources/kto_data.png filter=lfs diff=lfs merge=lfs -text
docs/resources/grpo_countdown_1.png filter=lfs diff=lfs merge=lfs -text
docs/resources/grpo_clevr_count.png filter=lfs diff=lfs merge=lfs -text
+docs/resources/grpo_code.png filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/citest.yaml b/.github/workflows/citest.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1590ec4cf053e79bb11c6094341c712ca1288a02
--- /dev/null
+++ b/.github/workflows/citest.yaml
@@ -0,0 +1,75 @@
+name: citest
+
+on:
+ push:
+ branches:
+ - master
+ - "release/**"
+ paths-ignore:
+ - "setup.*"
+ - "requirements.txt"
+ - "requirements/**"
+ - "docs/**"
+ - "tools/**"
+ - ".dev_scripts/**"
+ - "README.md"
+ - "README_*.md"
+ - "NOTICE"
+ - ".github/workflows/lint.yaml"
+ - ".github/workflows/publish.yaml"
+
+ pull_request:
+ paths-ignore:
+ - "setup.*"
+ - "requirements.txt"
+ - "requirements/**"
+ - "docs/**"
+ - "tools/**"
+ - ".dev_scripts/**"
+ - "README.md"
+ - "README_*.md"
+ - "NOTICE"
+ - ".github/workflows/lint.yaml"
+ - ".github/workflows/publish.yaml"
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ unittest:
+ # The type of runner that the job will run on
+ runs-on: [self-hosted]
+ timeout-minutes: 240
+ steps:
+ - name: ResetFileMode
+ shell: bash
+ run: |
+ # reset filemode to allow action runner to delete files
+ # generated by root in docker
+ set -e
+ source ~/.bashrc
+ sudo chown -R $USER:$USER $ACTION_RUNNER_DIR
+
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ lfs: 'true'
+ submodules: 'true'
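+ # PRs only need depth 2 so 'git diff HEAD^1 HEAD' below can reach the merge parent; pushes fetch full history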
+ fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }}
+ - name: Get changed files
+ id: changed-files
+ run: |
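+ # PRs: diff against the merge parent (HEAD^1); pushes: diff the event's before/after SHAs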
+ if ${{ github.event_name == 'pull_request' }}; then
+ echo "PR_CHANGED_FILES=$(git diff --name-only -r HEAD^1 HEAD | xargs)" >> $GITHUB_ENV
+ else
+ echo "PR_CHANGED_FILES=$(git diff --name-only ${{ github.event.before }} ${{ github.event.after }} | xargs)" >> $GITHUB_ENV
+ fi
+ - name: Checkout LFS objects
+ run: git lfs checkout
+ - name: Run unittest
+ shell: bash
+ run: |
+ set -e
+ source /mnt/modelscope/ci_env.sh
+ bash .dev_scripts/dockerci.sh
diff --git a/docs/resources/grpo_code.png b/docs/resources/grpo_code.png
new file mode 100644
index 0000000000000000000000000000000000000000..021f8d996b8b376a75e38dae8dc27226098c90a3
--- /dev/null
+++ b/docs/resources/grpo_code.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f396d9ce5ce9de323d7a6ffa8d53f783d938a242088b191f46a293268193b64
+size 294339
diff --git a/docs/transformers/build/lib/transformers/models/cpm/tokenization_cpm.py b/docs/transformers/build/lib/transformers/models/cpm/tokenization_cpm.py
new file mode 100644
index 0000000000000000000000000000000000000000..496c0b1ccb37de71e7ff7cb5366254be248e81e2
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/cpm/tokenization_cpm.py
@@ -0,0 +1,350 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+import os
+import unicodedata
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import SPIECE_UNDERLINE, logging
+from ...utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
+
+
+@requires(backends=("sentencepiece",))
+class CpmTokenizer(PreTrainedTokenizer):
+ """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""
+
+ vocab_files_names = VOCAB_FILES_NAMES
+
+ def __init__(
+ self,
+ vocab_file,
+ do_lower_case=False,
+ remove_space=True,
+ keep_accents=False,
+ bos_token="",
+ eos_token="",
+ unk_token="",
+ sep_token="",
+ pad_token="",
+ cls_token="",
+ mask_token="",
+ additional_special_tokens=["", ""],
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ **kwargs,
+ ) -> None:
+ """
+ Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and
+ [SentencePiece](https://github.com/google/sentencepiece).
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should
+ refer to this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
+ contains the vocabulary necessary to instantiate a tokenizer.
+ do_lower_case (`bool`, *optional*, defaults to `False`):
+ Whether to lowercase the input when tokenizing.
+ remove_space (`bool`, *optional*, defaults to `True`):
+ Whether to strip the text when tokenizing (removing excess spaces before and after the string).
+ keep_accents (`bool`, *optional*, defaults to `False`):
+ Whether to keep accents when tokenizing.
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
+ token.
+
+ <Tip>
+
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
+ The end of sequence token.
+
+ <Tip>
+
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
+ this token instead.
+ sep_token (`str`, *optional*, defaults to `"<sep>"`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+ for sequence classification or for a text and a question for question answering. It is also used as the
+ last token of a sequence built with special tokens.
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
+ The token used for padding, for example when batching sequences of different lengths.
+ cls_token (`str`, *optional*, defaults to `"<cls>"`):
+ The classifier token which is used when doing sequence classification (classification of the whole
+ sequence instead of per-token classification). It is the first token of the sequence when built with
+ special tokens.
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
+ Additional special tokens used by the tokenizer.
+
+ Attributes:
+ sp_model (`SentencePieceProcessor`):
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+ """
+ # Mask token behave like a normal word, i.e. include the space before it
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+ self.do_lower_case = do_lower_case
+ self.remove_space = remove_space
+ self.keep_accents = keep_accents
+ self.vocab_file = vocab_file
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+
+ try:
+ import jieba
+ except ModuleNotFoundError as error:
+ raise error.__class__(
+ "You need to install jieba to use CpmTokenizer or CpmTokenizerFast. "
+ "See https://pypi.org/project/jieba/ for installation."
+ )
+ self.jieba = jieba
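+ # map spaces and newlines to U+2582/U+2583 before SentencePiece so they survive tokenization;
+ # `_decode` below reverses this mapping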
+ self.translator = str.maketrans(" \n", "\u2582\u2583")
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ remove_space=remove_space,
+ keep_accents=keep_accents,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ additional_special_tokens=additional_special_tokens,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+
+ self._pad_token_type_id = 3
+
+ @property
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.vocab_size
+ def vocab_size(self):
+ return len(self.sp_model)
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.get_vocab
+ def get_vocab(self):
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.__getstate__
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ state["sp_model"] = None
+ return state
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.__setstate__
+ def __setstate__(self, d):
+ self.__dict__ = d
+
+ # for backward compatibility
+ if not hasattr(self, "sp_model_kwargs"):
+ self.sp_model_kwargs = {}
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(self.vocab_file)
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.preprocess_text
+ def preprocess_text(self, inputs):
+ if self.remove_space:
+ outputs = " ".join(inputs.strip().split())
+ else:
+ outputs = inputs
+ outputs = outputs.replace("``", '"').replace("''", '"')
+
+ if not self.keep_accents:
+ outputs = unicodedata.normalize("NFKD", outputs)
+ outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
+ if self.do_lower_case:
+ outputs = outputs.lower()
+
+ return outputs
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer._tokenize
+ def _tokenize(self, text: str) -> List[str]:
+ """Tokenize a string."""
+ text = self.preprocess_text(text)
+ pieces = self.sp_model.encode(text, out_type=str)
+ new_pieces = []
+ for piece in pieces:
+ if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
+ cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
+ if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
+ if len(cur_pieces[0]) == 1:
+ cur_pieces = cur_pieces[1:]
+ else:
+ cur_pieces[0] = cur_pieces[0][1:]
+ cur_pieces.append(piece[-1])
+ new_pieces.extend(cur_pieces)
+ else:
+ new_pieces.append(piece)
+
+ return new_pieces
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer._convert_token_to_id
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.sp_model.PieceToId(token)
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer._convert_id_to_token
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.sp_model.IdToPiece(index)
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.convert_tokens_to_string
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (strings for sub-words) in a single string."""
+ out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+ return out_string
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.build_inputs_with_special_tokens
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+ adding special tokens. An XLNet sequence has the following format:
+
+ - single sequence: `X <sep> <cls>`
+ - pair of sequences: `A <sep> B <sep> <cls>`
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+ sep = [self.sep_token_id]
+ cls = [self.cls_token_id]
+ if token_ids_1 is None:
+ return token_ids_0 + sep + cls
+ return token_ids_0 + sep + token_ids_1 + sep + cls
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.get_special_tokens_mask
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ """
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` method.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ if token_ids_1 is not None:
+ return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
+ return ([0] * len(token_ids_0)) + [1, 1]
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.create_token_type_ids_from_sequences
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
+ sequence pair mask has the following format:
+
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
+
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+ """
+ sep = [self.sep_token_id]
+ cls_segment_id = [2]
+
+ if token_ids_1 is None:
+ return len(token_ids_0 + sep) * [0] + cls_segment_id
+ return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
+
+ # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.save_vocabulary
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
+
+ def _decode(self, *args, **kwargs):
+ text = super()._decode(*args, **kwargs)
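+ # undo the pre-tokenization mapping: drop SentencePiece separators, then restore real spaces and newlines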
+ text = text.replace(" ", "").replace("\u2582", " ").replace("\u2583", "\n")
+ return text
+
+
+__all__ = ["CpmTokenizer"]
diff --git a/docs/transformers/build/lib/transformers/models/cpmant/modeling_cpmant.py b/docs/transformers/build/lib/transformers/models/cpmant/modeling_cpmant.py
new file mode 100644
index 0000000000000000000000000000000000000000..df0aebe3cbf850d5ce022352c1f52e9b2d1f8d0e
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/cpmant/modeling_cpmant.py
@@ -0,0 +1,860 @@
+# coding=utf-8
+# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch CPMAnt"""
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...generation import GenerationMixin
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_cpmant import CpmAntConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "openbmb/cpm-ant-10b"
+_CONFIG_FOR_DOC = "CpmAntConfig"
+
+
+class CpmAntLayerNorm(nn.Module):
+ """
+ We use Root Mean Square (RMS) Layer Normalization, please see https://arxiv.org/abs/1910.07467 for details.
+ """
+
+ def __init__(self, config: CpmAntConfig):
+ super().__init__()
+
+ self.eps = config.eps
+ self.dim_norm = config.hidden_size
+ self.weight = nn.Parameter(torch.empty(config.hidden_size))
+
+ def forward(self, hidden_states: torch.Tensor):
+ """
+ Args:
+ hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
+ """
+ if hidden_states.size(-1) != self.dim_norm:
+ raise AssertionError("hidden_states.size(-1) != self.dim_norm")
+ old_dtype = hidden_states.dtype
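+ # RMS normalization: x * rsqrt(mean(x^2) + eps) * weight, with the statistics computed in fp32 for stability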
+ variance = hidden_states.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
+ hidden_states = (hidden_states * torch.rsqrt(variance + self.eps)).to(old_dtype) * self.weight
+ return hidden_states
+
+
+class CpmAntAttention(nn.Module):
+ def __init__(self, config: CpmAntConfig):
+ super().__init__()
+ self.dim_model = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.dim_head = config.dim_head
+
+ self.project_q = nn.Linear(self.dim_model, self.num_heads * self.dim_head, bias=False)
+ self.project_k = nn.Linear(self.dim_model, self.num_heads * self.dim_head, bias=False)
+ self.project_v = nn.Linear(self.dim_model, self.num_heads * self.dim_head, bias=False)
+
+ self.attention_out = nn.Linear(self.num_heads * self.dim_head, self.dim_model, bias=False)
+
+ self.softmax = torch.nn.Softmax(dim=-1)
+
+ if config.dropout_p is not None:
+ self.dropout = torch.nn.Dropout(p=config.dropout_p)
+ else:
+ self.dropout = None
+
+ def forward(
+ self,
+ hidden_q: torch.Tensor,
+ hidden_kv: torch.Tensor,
+ attention_mask: torch.BoolTensor,
+ position_bias: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ use_cache: Optional[bool] = None,
+ ):
+ """
+ Args:
+ hidden_q (`torch.Tensor`):
+ Input of the transformer block (self-attention block). It can be the raw embedding of a batch of sequences.
+ hidden_kv (`torch.Tensor` of shape `(batch, len_k, dim_model)`):
+ Tensor from which the *key* and *value* projections are computed.
+ attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
+ Mask that prevents invalid positions from taking part in the self-attention calculation.
+ position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
+ Positional information provided to the self-attention block.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers.
+ past_key_values (`Tuple[torch.Tensor, torch.Tensor]`, *optional*):
+ Cached past key and value projection states.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ """
+ batch_size = hidden_q.size(0)
+ len_q = hidden_q.size(1)
+ len_k = hidden_kv.size(1)
+
+ query = self.project_q(hidden_q)
+ key = self.project_k(hidden_kv)
+ value = self.project_v(hidden_kv)
+
+ query = query.view(batch_size, len_q, self.num_heads, self.dim_head).permute(0, 2, 1, 3)
+ key = key.view(batch_size, len_k, self.num_heads, self.dim_head).permute(0, 2, 1, 3)
+ value = value.view(batch_size, len_k, self.num_heads, self.dim_head).permute(0, 2, 1, 3)
+
+ if past_key_values is not None:
+ key = torch.cat([past_key_values[0], key], dim=-2)
+ value = torch.cat([past_key_values[1], value], dim=-2)
+ len_k = key.size(-2)
+
+ # (batch_size, num_heads, len_q, dim_head) @ (batch_size, num_heads, dim_head, len_k) -> (batch_size, num_heads, len_q, len_k)
+ score = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(self.dim_head)
+ score = score + position_bias
+
+ score = torch.masked_fill(
+ score,
+ attention_mask.view(batch_size, 1, len_q, len_k) == torch.tensor(False),
+ torch.scalar_tensor(float("-inf"), device=score.device, dtype=score.dtype),
+ )
+ score = self.softmax(score)
+
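+ # masked positions are re-zeroed after the softmax; a fully masked row would otherwise come out as NaN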
+ score = torch.masked_fill(
+ score,
+ attention_mask.view(batch_size, 1, len_q, len_k) == torch.tensor(False),
+ torch.scalar_tensor(0, device=score.device, dtype=score.dtype),
+ )
+ if output_attentions:
+ attn_weights = score
+ else:
+ attn_weights = None
+
+ if self.dropout is not None:
+ score = self.dropout(score)
+
+ # (batch_size, num_heads, len_q, len_k) @ (batch_size, num_heads, len_k, dim_head) -> (batch_size, num_heads, len_q, dim_head)
+ score = torch.matmul(score, value)
+
+ score = score.view(batch_size, self.num_heads, len_q, self.dim_head).permute(0, 2, 1, 3)
+ score = score.contiguous().view(batch_size, len_q, self.num_heads * self.dim_head)
+
+ score = self.attention_out(score)
+
+ past_key_values = None
+ if use_cache:
+ past_key_values = (key, value)
+
+ return score, attn_weights, past_key_values
+
+
+class CpmAntSelfAttentionBlock(nn.Module):
+ def __init__(self, config: CpmAntConfig):
+ super().__init__()
+ self.layernorm_before_attention = CpmAntLayerNorm(config)
+ self.self_attention = CpmAntAttention(config)
+ if config.dropout_p:
+ self.dropout = torch.nn.Dropout(config.dropout_p)
+ else:
+ self.dropout = None
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ position_bias: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = False,
+ past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ use_cache: Optional[bool] = None,
+ ):
+ """
+ Args:
+ hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
+ Input of the transformer block (self-attention block). It can be the raw embedding of a batch of sequences.
+ attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
+ Mask that prevents invalid positions from taking part in the self-attention calculation.
+ position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
+ Positional information provided to the self-attention block.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers.
+ past_key_values (`Tuple(torch.FloatTensor)`, *optional*):
+ Cached past key and value projection states.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ """
+ outputs = self.layernorm_before_attention(hidden_states)
+ outputs = self.self_attention(
+ outputs, outputs, attention_mask, position_bias, output_attentions, past_key_values, use_cache
+ )
+
+ outputs, attn_weights, current_key_value = outputs
+
+ if self.dropout is not None:
+ outputs = self.dropout(outputs)
+ hidden_states = hidden_states + outputs
+
+ return hidden_states, attn_weights, current_key_value
+
+
+class CpmAntDenseGatedACT(nn.Module):
+ def __init__(self, config: CpmAntConfig):
+ super().__init__()
+ self.w_0 = nn.Linear(config.hidden_size, config.dim_ff, bias=False)
+ self.w_1 = nn.Linear(config.hidden_size, config.dim_ff, bias=False)
+ self.act = torch.nn.GELU()
+
+ def forward(self, hidden_states: torch.Tensor):
+ """Transform an input tensor from one feature space to another via a nonlinear operation
+
+ Args:
+ hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
+ """
+ gate_score = self.act(self.w_0(hidden_states))
+ hidden_states = self.w_1(hidden_states)
+
+ hidden_states = gate_score * hidden_states
+ return hidden_states
+
+
+class CpmAntFeedForward(nn.Module):
+ def __init__(self, config: CpmAntConfig):
+ super().__init__()
+ self.w_in = CpmAntDenseGatedACT(config)
+ if config.dropout_p is not None:
+ self.dropout = torch.nn.Dropout(config.dropout_p)
+ else:
+ self.dropout = None
+
+ self.w_out = nn.Linear(config.dim_ff, config.hidden_size, bias=False)
+
+ def forward(self, hidden_states: torch.Tensor):
+ """
+ Args:
+ hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
+ """
+ hidden_states = self.w_in(hidden_states)
+
+ if self.dropout is not None:
+ hidden_states = self.dropout(hidden_states)
+
+ hidden_states = self.w_out(hidden_states)
+
+ return hidden_states
+
+
+class CpmAntFFNBlock(nn.Module):
+ def __init__(self, config: CpmAntConfig):
+ super().__init__()
+ self.layernorm_before_ffn = CpmAntLayerNorm(config)
+ self.ffn = CpmAntFeedForward(config)
+ if config.dropout_p:
+ self.dropout = torch.nn.Dropout(config.dropout_p)
+ else:
+ self.dropout = None
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ ):
+ """
+ Args:
+ hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
+ Hidden states before feed forward layer.
+ """
+ ln_outputs = self.layernorm_before_ffn(hidden_states)
+ outputs = self.ffn(ln_outputs)
+ if self.dropout is not None:
+ outputs = self.dropout(outputs)
+ hidden_states = hidden_states + outputs
+ return hidden_states
+
+
+class CpmAntTransformerBlock(nn.Module):
+ def __init__(self, config: CpmAntConfig):
+ super().__init__()
+ self.self_att = CpmAntSelfAttentionBlock(config)
+ self.ffn = CpmAntFFNBlock(config)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ position_bias: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = False,
+ past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ use_cache: Optional[bool] = None,
+ ):
+ """
+ Args:
+ hidden_states (`torch.Tensor`):
+ Input to the layer of shape `(batch, seq_len, dim_model)`
+ attention_mask (`torch.Tensor` of shape `(batch, seq_len, seq_len)`):
+ Mask that prevents invalid positions from taking part in the attention calculation.
+ position_bias (`torch.Tensor` of shape `(num_heads, seq_len, seq_len)`):
+ Positional information provided to the attention mechanism.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers.
+ past_key_values (`Tuple[torch.Tensor, torch.Tensor]`, *optional*):
+ Cached past key and value projection states.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ """
+ hidden_states = self.self_att(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_bias=position_bias,
+ output_attentions=output_attentions,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ )
+
+ hidden_states, attn_weights, current_key_value = hidden_states
+
+ hidden_states = self.ffn(hidden_states)
+
+ return hidden_states, attn_weights, current_key_value
+
+
+class CpmAntEncoder(nn.Module):
+ def __init__(self, config: CpmAntConfig):
+ super().__init__()
+ self.num_layers = config.num_hidden_layers
+ self.layers = nn.ModuleList([CpmAntTransformerBlock(config) for _ in range(self.num_layers)])
+
+ self.output_layernorm = CpmAntLayerNorm(config)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ position_bias: torch.Tensor,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ use_cache: Optional[bool] = None,
+ ):
+ """
+ Args:
+ hidden_states (`torch.Tensor`):
+ Input to the layer of shape `(batch, seq_len, dim_model)`
+ attention_mask (`torch.Tensor` of shape `(batch, seq_len, seq_len)`):
+ Mask that prevents invalid positions from taking part in the attention calculation.
+ position_bias (`torch.Tensor` of shape `(num_heads, seq_len, seq_len)`):
+ Positional information provided to the attention mechanism.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers.
+ past_key_values (`Tuple[torch.Tensor, torch.Tensor]`, *optional*):
+ Cached past key and value projection states.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ """
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ current_key_values = () if use_cache else None
+
+ for i, layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+ layer_outputs = layer(
+ hidden_states,
+ attention_mask,
+ position_bias,
+ output_attentions=output_attentions,
+ past_key_values=past_key_values[i] if past_key_values else None,
+ use_cache=use_cache,
+ )
+ hidden_states, attn_weights, current_key_value = layer_outputs
+ if output_attentions:
+ all_self_attns += (attn_weights,)
+ if current_key_value is not None:
+ current_key_values = current_key_values + (current_key_value,)
+
+ hidden_states = self.output_layernorm(hidden_states)
+
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ return hidden_states, current_key_values, all_hidden_states, all_self_attns
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->CPMAnt
+class CpmAntIntermediate(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class CpmAntSegmentPositionEmbedding(nn.Module):
+ def __init__(self, config: CpmAntConfig):
+ super().__init__()
+
+ self.num_heads = config.num_attention_heads
+ self.num_buckets = config.position_bias_num_buckets
+ self.max_distance = config.position_bias_max_distance
+ self.num_segments = config.segment_types
+
+ self.relative_attention_bias = nn.Parameter(
+ torch.empty(
+ config.segment_types * config.segment_types + config.position_bias_num_buckets,
+ config.num_attention_heads,
+ )
+ )
+
+ def forward(
+ self,
+ key_pos: torch.Tensor,
+ query_pos: torch.Tensor,
+ key_segment: torch.Tensor,
+ query_segment: torch.Tensor,
+ ):
+ with torch.no_grad():
+ batch = key_pos.size(0)
+ keylen = key_pos.size(1)
+ querylen = query_pos.size(1)
+
+ if key_pos.size(0) != query_pos.size(0):
+ raise AssertionError(
+ f"key_pos.size(0) should be equal to query_pos.size(0), but got {key_pos.size(0)} and {query_pos.size(0)}!"
+ )
+ if keylen != key_segment.size(1):
+ raise AssertionError(
+ f"keylen should be equal to key_segment.size(1), but got {keylen} and {key_segment.size(1)}!"
+ )
+ if querylen != query_segment.size(1):
+ raise AssertionError(
+ f"querylen should be equal to query_segment.size(1), but got {querylen} and {query_segment.szie(1)}!"
+ )
+
+ key_pos = key_pos.view(batch, -1, keylen)
+ query_pos = query_pos.view(batch, querylen, -1)
+ key_segment = key_segment.view(batch, -1, keylen)
+ query_segment = query_segment.view(batch, querylen, -1)
+
+ relative_position_bucket = self._segment_relative_position_bucket(query_segment, key_segment)
+ relative_position_bucket = relative_position_bucket + self.num_buckets
+
+ # (batch, len_q, len_k)
+ absolute_position_bucket = self._position_bucket(
+ torch.arange(keylen, dtype=torch.int32, device=relative_position_bucket.device)[None, :]
+ - torch.arange(querylen, dtype=torch.int32, device=relative_position_bucket.device)[:, None],
+ num_buckets=self.num_buckets,
+ max_distance=self.max_distance,
+ )
+ relative_position_bucket = torch.where(
+ (key_segment == query_segment),
+ absolute_position_bucket[None, :, :],
+ relative_position_bucket,
+ )
+
+ # (batch, len_q, len_k, num_heads)
+ embeds = F.embedding(relative_position_bucket, self.relative_attention_bias)
+ # (batch, num_heads, len_q, len_k)
+ embeds = embeds.permute(0, 3, 1, 2).contiguous()
+ return embeds
+
+ def _segment_relative_position_bucket(self, query_segment, key_segment):
+ return query_segment * self.num_segments + key_segment
+
+ def _position_bucket(self, relative_position, num_buckets=32, max_distance=128):
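+ # T5-style bucketing: half the buckets encode direction, small offsets get exact buckets,
+ # larger offsets are binned logarithmically up to max_distance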
+ relative_buckets = 0
+ # always bidirectional in CPMAnt
+ num_buckets //= 2
+ relative_buckets = (relative_position > 0).to(torch.int32) * num_buckets
+ relative_position = torch.abs(relative_position)
+ max_exact = num_buckets // 2
+ is_small = relative_position < max_exact
+ relative_postion_if_large = max_exact + (
+ torch.log(relative_position.float() / max_exact)
+ / math.log(max_distance / max_exact)
+ * (num_buckets - max_exact)
+ ).to(torch.int32)
+ relative_postion_if_large = torch.min(
+ relative_postion_if_large,
+ torch.full_like(relative_postion_if_large, num_buckets - 1),
+ )
+ relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_postion_if_large)
+ return relative_buckets
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->CPMAnt
+class CpmAntOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class CpmAntPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = CpmAntConfig
+ base_model_prefix = "cpmant"
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=self.config.init_std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.init_std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, CpmAntLayerNorm):
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, CpmAntSegmentPositionEmbedding):
+ module.relative_attention_bias.data.normal_(mean=0.0, std=self.config.init_std)
+
+
+CPMANT_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`~CpmAntConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CPMANT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare CPMAnt Model outputting raw hidden-states without any specific head on top.",
+ CPMANT_START_DOCSTRING,
+)
+class CpmAntModel(CpmAntPreTrainedModel):
+ def __init__(self, config: CpmAntConfig):
+ super().__init__(config)
+ self.encoder = CpmAntEncoder(config)
+ self.segment_embedding = nn.Embedding(config.segment_types, config.hidden_size)
+ self.input_embedding = nn.Embedding(
+ config.vocab_size + config.prompt_types * config.prompt_length, config.hidden_size
+ )
+ self.position_bias = CpmAntSegmentPositionEmbedding(config)
+ self.prompt_length = config.prompt_length
+ self.vocab_size = config.vocab_size
+
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.input_embedding
+
+ def set_input_embeddings(self, embeddings, **kwargs):
+ self.input_embedding = embeddings
+
+ def _prepare_attention_mask(self, input_ids, span, context, length):
+ batch = input_ids.size(0)
+ seqlen = input_ids.size(1)
+ device = input_ids.device
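+ # a query may attend to any context position; non-context positions are attended causally,
+ # and attention is further restricted to tokens within the same span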
+ directional_mask_2d = torch.arange(seqlen, device=device) <= torch.arange(seqlen, device=device).view(-1, 1)
+ attention_mask = context[:, None, :] | (
+ context[:, :, None].logical_not() & directional_mask_2d.view(1, seqlen, seqlen)
+ )
+ attention_mask = attention_mask & (span[:, None, :] == span[:, :, None])
+ # mask for left padding
+ mask_1d = (
+ torch.tensor(list(range(seqlen - self.prompt_length))[::-1], device=device)[None, :].repeat(batch, 1)
+ < length[:, None]
+ )
+ mask_1d = torch.cat((torch.ones(batch, self.prompt_length, device=device).bool(), mask_1d), dim=1)
+ attention_mask = mask_1d.view(batch, seqlen, 1) & mask_1d.view(batch, 1, seqlen) & attention_mask
+ return attention_mask
+
+ @add_start_docstrings_to_model_forward(CPMANT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPast,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+ use_cache: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ # add prompts ahead
+ if input_ids.dtype != torch.int32:
+ input_ids = input_ids.to(torch.int32)
+ dtype, device = input_ids.dtype, input_ids.device
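+ # segment id 2 marks real text tokens and 0 marks padding; the prompt positions prepended
+ # below also use segment 0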
+ segment = torch.where(input_ids != 0, 2, 0).to(dtype=dtype, device=device)
+ length = (segment != 0).sum(-1).to(dtype=dtype, device=device)
+ input_ids = torch.cat(
+ (
+ torch.arange(
+ self.prompt_length * 2 + self.vocab_size,
+ self.prompt_length * 3 + self.vocab_size,
+ dtype=dtype,
+ device=device,
+ ).repeat(input_ids.size(0), 1),
+ input_ids,
+ ),
+ dim=1,
+ )
+ batch, seq_length = input_ids.size()
+ segment = torch.cat((torch.zeros(batch, self.prompt_length, dtype=dtype, device=device), segment), dim=1)
+ context = torch.full((batch, seq_length), 1, dtype=dtype, device=device)
+ position = torch.arange(seq_length, dtype=dtype, device=device).repeat(batch, 1)
+ span = torch.full((batch, seq_length), 0, dtype=dtype, device=device)
+
+ if past_key_values is None:
+ past_length = 0
+ past_key_values = tuple([None] * self.encoder.num_layers)
+ input_ids = input_ids.contiguous()
+ hidden_states = self.input_embedding(input_ids)
+ segment_states = self.segment_embedding(segment)
+ hidden_states = hidden_states + segment_states
+ else:
+ past_length = past_key_values[0][0].size(-2)
+ segment_states = self.segment_embedding(segment)
+ hidden_states = self.input_embedding(input_ids) + segment_states[:, -1:, :]
+
+ attention_mask = self._prepare_attention_mask(input_ids, span, context, length)
+ position_bias = self.position_bias(position, position, segment, segment)
+
+ attention_mask = attention_mask[:, past_length:, :]
+ position_bias = position_bias[:, :, past_length:, :]
+ hidden_states = hidden_states[:, past_length:, :]
+
+ hidden_states, present_key_values, all_hidden_states, all_attentions = self.encoder(
+ hidden_states,
+ attention_mask,
+ position_bias,
+ output_attentions,
+ output_hidden_states,
+ past_key_values,
+ use_cache,
+ )
+
+ if past_length == 0:
+ hidden_states = hidden_states[:, self.prompt_length :, :]
+ # drop the prompt
+ if all_attentions is not None:
+ new_attentions = ()
+ for attention in all_attentions:
+ new_attentions += (attention[:, :, self.prompt_length :, self.prompt_length :],)
+ all_attentions = new_attentions
+ if all_hidden_states is not None:
+ new_hidden_states = ()
+ for hidden_state in all_hidden_states:
+ new_hidden_states += (hidden_state[:, self.prompt_length :, :],)
+ all_hidden_states = new_hidden_states
+
+ if not return_dict:
+ return tuple(
+ v for v in [hidden_states, present_key_values, all_hidden_states, all_attentions] if v is not None
+ )
+
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=present_key_values,
+ hidden_states=all_hidden_states,
+ attentions=all_attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ The CPMAnt Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
+ """,
+ CPMANT_START_DOCSTRING,
+)
+class CpmAntForCausalLM(CpmAntPreTrainedModel, GenerationMixin):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config: CpmAntConfig):
+ super().__init__(config)
+ self.cpmant = CpmAntModel(config)
+
+ # lm_head.weight is tied to cpmant.input_embedding.weight
+ self.lm_head = nn.Linear(
+ config.hidden_size, config.vocab_size + config.prompt_types * config.prompt_length, bias=False
+ )
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(CPMANT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=CausalLMOutputWithPast,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ labels: Optional[torch.Tensor] = None,
+ return_dict: Optional[bool] = None,
+ attention_mask: Optional[torch.Tensor] = None, # dummy parameter for text-generation pipeline
+ **kwargs,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ r"""
+ Args:
+ input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers.
+ labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the language modeling loss.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ CPMAnt processes the attention mask automatically; this dummy parameter exists only for
+ compatibility with the text-generation pipeline.
+
+ Example:
+
+ Text Generation with CpmAntForCausalLM.
+ ```python
+ >>> from transformers import CPMAntTokenizer, CpmAntForCausalLM
+
+ >>> texts = "今天天气不错,"
+ >>> model = CpmAntForCausalLM.from_pretrained("openbmb/cpm-ant-10b")
+ >>> tokenizer = CPMAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
+ >>> input_ids = tokenizer(texts, return_tensors="pt")
+ >>> outputs = model.generate(**input_ids)
+ >>> output_texts = tokenizer.batch_decode(outputs)
+ >>> print(output_texts)
+ ['今天天气不错,阳光明媚,我和妈妈一起去超市买东西。\n在超市里,我看到了一个很好玩的玩具,它的名字叫“机器人”。它有一个圆圆的脑袋,两只圆圆的眼睛,还有一个圆圆的']
+ ```
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ model_output = self.cpmant(
+ input_ids, output_attentions, output_hidden_states, past_key_values, use_cache, return_dict
+ )
+ hidden_states = model_output.last_hidden_state if return_dict else model_output[0]
+
+ logits = self.lm_head(hidden_states)
+
+ loss = None
+ if labels is not None:
+ loss_func = CrossEntropyLoss()
+ loss = loss_func(logits.view(-1, logits.size(-1)), labels.view(-1))
+
+ if not return_dict:
+ output = (logits,) + model_output[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=model_output.past_key_values,
+ hidden_states=model_output.hidden_states,
+ attentions=model_output.attentions,
+ )
+
+ def get_input_embeddings(self):
+ return self.cpmant.input_embedding
+
+ def set_input_embeddings(self, embeddings):
+ self.cpmant.input_embedding = embeddings
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def _reorder_cache(self, past_key_values, beam_idx):
+ past_key_values = [list(each) if each is not None else each for each in past_key_values]
+ for key_value_layer in past_key_values:
+ key_value_layer[0] = key_value_layer[0][beam_idx]
+ key_value_layer[1] = key_value_layer[1][beam_idx]
+ return past_key_values
+
+
+__all__ = ["CpmAntForCausalLM", "CpmAntModel", "CpmAntPreTrainedModel"]
diff --git a/docs/transformers/build/lib/transformers/models/cpmant/tokenization_cpmant.py b/docs/transformers/build/lib/transformers/models/cpmant/tokenization_cpmant.py
new file mode 100644
index 0000000000000000000000000000000000000000..2da1d6286c5e8b3a39750ae705cc1486ddd63416
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/cpmant/tokenization_cpmant.py
@@ -0,0 +1,270 @@
+# coding=utf-8
+# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for CPMAnt."""
+
+import collections
+import os
+from typing import List, Optional, Tuple
+
+from transformers.utils import is_jieba_available, requires_backends
+
+
+if is_jieba_available():
+ import jieba
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+
+def load_vocab(vocab_file):
+ """Loads a vocabulary file into a dictionary."""
+ vocab = collections.OrderedDict()
+ with open(vocab_file, "r", encoding="utf-8") as reader:
+ tokens = reader.readlines()
+ for index, token in enumerate(tokens):
+ token = token.rstrip("\n")
+ vocab[token] = index
+ return vocab
+
+
+class WordpieceTokenizer:
+ def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=200):
+ self.vocab = vocab
+ self.unk_token = unk_token
+ self.max_input_chars_per_word = max_input_chars_per_word
+
+ def tokenize(self, token):
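+ # greedy longest-match-first over the vocab; a character with no match becomes unk_token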
+ chars = list(token)
+ if len(chars) > self.max_input_chars_per_word:
+ return [self.unk_token]
+
+ start = 0
+ sub_tokens = []
+ while start < len(chars):
+ end = len(chars)
+ cur_substr = None
+ while start < end:
+ substr = "".join(chars[start:end])
+ if substr in self.vocab:
+ cur_substr = substr
+ break
+ end -= 1
+ if cur_substr is None:
+ sub_tokens.append(self.unk_token)
+ start += 1
+ else:
+ sub_tokens.append(cur_substr)
+ start = end
+
+ return sub_tokens
+
+
+class CpmAntTokenizer(PreTrainedTokenizer):
+ """
+ Construct a CPMAnt tokenizer. It segments text with Jieba and then applies WordPiece over the vocabulary.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ bod_token (`str`, *optional*, defaults to `"<d>"`):
+ The beginning of document token.
+ eod_token (`str`, *optional*, defaults to `"</d>"`):
+ The end of document token.
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
+ The end of sequence token.
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
+ The token used for padding.
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
+ The unknown token.
+ line_token (`str`, *optional*, defaults to `"</n>"`):
+ The line token.
+ space_token (`str`, *optional*, defaults to `"</_>"`):
+ The space token.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ model_input_names = ["input_ids", "attention_mask"]
+ add_prefix_space = False
+
+ def __init__(
+ self,
+ vocab_file,
+ bod_token="<d>",
+ eod_token="</d>",
+ bos_token="<s>",
+ eos_token="</s>",
+ pad_token="<pad>",
+ unk_token="<unk>",
+ line_token="</n>",
+ space_token="</_>",
+ padding_side="left",
+ **kwargs,
+ ):
+ requires_backends(self, ["jieba"])
+ self.bod_token = bod_token
+ self.eod_token = eod_token
+ self.encoder = load_vocab(vocab_file)
+ self.encoder[" "] = self.encoder[space_token]
+ self.encoder["\n"] = self.encoder[line_token]
+
+ del self.encoder[space_token]
+ del self.encoder[line_token]
+
+ self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
+ self.decoder = {v: k for k, v in self.encoder.items()}
+
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)
+
+ super().__init__(
+ bod_token=bod_token,
+ eod_token=eod_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ unk_token=unk_token,
+ line_token=line_token,
+ space_token=space_token,
+ padding_side=padding_side,
+ **kwargs,
+ )
+
+ @property
+ def bod_token_id(self):
+ return self.encoder[self.bod_token]
+
+ @property
+ def eod_token_id(self):
+ return self.encoder[self.eod_token]
+
+ @property
+ def newline_id(self):
+ return self.encoder["\n"]
+
+ @property
+ def vocab_size(self) -> int:
+ return len(self.encoder)
+
+ def get_vocab(self):
+ return dict(self.encoder, **self.added_tokens_encoder)
+
+ def _tokenize(self, text):
+ """Tokenize a string."""
+ output_tokens = []
+ for x in jieba.cut(text, cut_all=False):
+ output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
+ return output_tokens
+
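+ # The two-stage pipeline above first segments raw text with jieba at the word level and then
+ # re-splits every segment with the greedy wordpiece matcher, so any piece missing from the
+ # loaded CPM-Ant vocabulary falls back to `unk_token`; the exact pieces produced therefore
+ # depend entirely on the vocab file.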
+ def _decode(self, token_ids, **kwargs):
+ """Decode ids into a string."""
+ token_ids = [i for i in token_ids if i >= 0]
+ token_ids = [
+ x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id
+ ]
+ return super()._decode(token_ids, **kwargs)
+
+ def check(self, token):
+ return token in self.encoder
+
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
+ return "".join(tokens)
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.decoder.get(index, self.unk_token)
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if os.path.isdir(save_directory):
+ vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+ else:
+ vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+ index = 0
+ if " " in self.encoder:
+ self.encoder["</_>"] = self.encoder[" "]
+ del self.encoder[" "]
+ if "\n" in self.encoder:
+ self.encoder["</n>"] = self.encoder["\n"]
+ del self.encoder["\n"]
+ self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
+ with open(vocab_file, "w", encoding="utf-8") as writer:
+ for token, token_index in self.encoder.items():
+ if index != token_index:
+ logger.warning(
+ f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+ " Please check that the vocabulary is not corrupted!"
+ )
+ index = token_index
+ writer.write(token + "\n")
+ index += 1
+ return (vocab_file,)
+
+ def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None) -> List[int]:
+ """
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+ adding special tokens. A CPMAnt sequence has the following format:
+
+ - single sequence: `[BOS] Sequence`.
+
+ Args:
+ token_ids_0 (`List[int]`): The first tokenized sequence that special tokens will be added.
+ token_ids_1 (`List[int]`): The optional second tokenized sequence that special tokens will be added.
+
+ Returns:
+ `List[int]`: The model input with special tokens.
+ """
+ if token_ids_1 is None:
+ return [self.bos_token_id] + token_ids_0
+ return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1
+
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ """
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` method.
+
+ Args:
+ token_ids_0 (`List[int]`): List of IDs.
+ token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ if token_ids_1 is not None:
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
+ return [1] + ([0] * len(token_ids_0))
+
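+# A small sketch of how the two helpers above cooperate for a single sequence; the id 6 used
+# for `bos_token_id` is purely illustrative, the real value comes from the loaded vocab:
+#
+#     >>> tokenizer.build_inputs_with_special_tokens([11, 12, 13])
+#     [6, 11, 12, 13]
+#     >>> tokenizer.get_special_tokens_mask([11, 12, 13])
+#     [1, 0, 0, 0]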
+
+__all__ = ["CpmAntTokenizer"]
diff --git a/docs/transformers/build/lib/transformers/models/ctrl/configuration_ctrl.py b/docs/transformers/build/lib/transformers/models/ctrl/configuration_ctrl.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a812f0b55650fab8ab9a4d1b4c6bad4cd9446f4
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/ctrl/configuration_ctrl.py
@@ -0,0 +1,116 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Salesforce CTRL configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class CTRLConfig(PretrainedConfig):
+ """
+ This is the configuration class to store the configuration of a [`CTRLModel`] or a [`TFCTRLModel`]. It is used to
+ instantiate a CTRL model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the
+ [Salesforce/ctrl](https://huggingface.co/Salesforce/ctrl) architecture from SalesForce.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 246534):
+ Vocabulary size of the CTRL model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`CTRLModel`] or [`TFCTRLModel`].
+ n_positions (`int`, *optional*, defaults to 256):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ n_embd (`int`, *optional*, defaults to 1280):
+ Dimensionality of the embeddings and hidden states.
+ dff (`int`, *optional*, defaults to 8192):
+ Dimensionality of the inner dimension of the feed forward networks (FFN).
+ n_layer (`int`, *optional*, defaults to 48):
+ Number of hidden layers in the Transformer encoder.
+ n_head (`int`, *optional*, defaults to 16):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ resid_pdrop (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ embd_pdrop (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the embeddings.
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-06):
+ The epsilon to use in the layer normalization layers
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models).
+
+
+ Examples:
+
+ ```python
+ >>> from transformers import CTRLConfig, CTRLModel
+
+ >>> # Initializing a CTRL configuration
+ >>> configuration = CTRLConfig()
+
+ >>> # Initializing a model (with random weights) from the configuration
+ >>> model = CTRLModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "ctrl"
+ keys_to_ignore_at_inference = ["past_key_values"]
+ attribute_map = {
+ "max_position_embeddings": "n_positions",
+ "hidden_size": "n_embd",
+ "num_attention_heads": "n_head",
+ "num_hidden_layers": "n_layer",
+ }
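+ # Via `attribute_map`, the generic names resolve to the CTRL-specific attributes, e.g.
+ # `config.hidden_size` reads `config.n_embd` and `config.num_hidden_layers` reads
+ # `config.n_layer` (behaviour inherited from `PretrainedConfig`, noted here for clarity).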
+
+ def __init__(
+ self,
+ vocab_size=246534,
+ n_positions=256,
+ n_embd=1280,
+ dff=8192,
+ n_layer=48,
+ n_head=16,
+ resid_pdrop=0.1,
+ embd_pdrop=0.1,
+ layer_norm_epsilon=1e-6,
+ initializer_range=0.02,
+ use_cache=True,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.n_positions = n_positions
+ self.n_embd = n_embd
+ self.n_layer = n_layer
+ self.n_head = n_head
+ self.dff = dff
+ self.resid_pdrop = resid_pdrop
+ self.embd_pdrop = embd_pdrop
+ self.layer_norm_epsilon = layer_norm_epsilon
+ self.initializer_range = initializer_range
+
+ self.use_cache = use_cache
+
+ super().__init__(**kwargs)
+
+
+__all__ = ["CTRLConfig"]
diff --git a/docs/transformers/build/lib/transformers/models/ctrl/modeling_ctrl.py b/docs/transformers/build/lib/transformers/models/ctrl/modeling_ctrl.py
new file mode 100644
index 0000000000000000000000000000000000000000..f97adac913a13acc2ef4aee5f60d31571c66fa5a
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/ctrl/modeling_ctrl.py
@@ -0,0 +1,844 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch CTRL model."""
+
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...generation import GenerationMixin
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from .configuration_ctrl import CTRLConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "CTRLConfig"
+
+
+def angle_defn(pos, i, d_model_size):
+ angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
+ return pos * angle_rates
+
+
+def positional_encoding(position, d_model_size, dtype):
+ # create the sinusoidal pattern for the positional encoding
+ angle_rads = angle_defn(
+ torch.arange(position, dtype=torch.int64).to(dtype).unsqueeze(1),
+ torch.arange(d_model_size, dtype=torch.int64).to(dtype).unsqueeze(0),
+ d_model_size,
+ )
+
+ sines = torch.sin(angle_rads[:, 0::2])
+ cosines = torch.cos(angle_rads[:, 1::2])
+
+ pos_encoding = torch.cat([sines, cosines], dim=-1)
+ return pos_encoding
+
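+# Shape sketch for the helper above: `positional_encoding(n, d, dtype)` returns an `(n, d)`
+# tensor whose first d/2 columns are sines and whose last d/2 columns are cosines of the
+# position/frequency products, e.g. `positional_encoding(256, 1280, torch.float).shape` is
+# `torch.Size([256, 1280])`.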
+
+def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
+ # calculate attention
+ matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2))
+
+ dk = k.shape[-1]
+ scaled_attention_logits = matmul_qk / np.sqrt(dk)
+
+ if mask is not None:
+ nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1)
+ scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4
+
+ if attention_mask is not None:
+ # Apply the attention mask
+ scaled_attention_logits = scaled_attention_logits + attention_mask
+
+ attention_weights = torch.softmax(scaled_attention_logits, dim=-1)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_weights = attention_weights * head_mask
+
+ output = torch.matmul(attention_weights, v)
+
+ return output, attention_weights
+
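+# Note on the arguments above: `mask` is the `(seq + past, seq + past)` upper-triangular causal
+# matrix built in `CTRLModel.forward`; the slice `mask[ns - nd : ns, :ns]` keeps only the rows
+# for the current query positions, so cached decoding (nd == 1) still attends over the full key
+# length ns. The mask entries (1 for future positions) are scaled by -1e4 and added to the
+# logits, pushing masked scores far below the useful softmax range.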
+
+class MultiHeadAttention(nn.Module):
+ def __init__(self, d_model_size, num_heads):
+ super().__init__()
+ self.num_heads = num_heads
+ self.d_model_size = d_model_size
+
+ self.depth = int(d_model_size / self.num_heads)
+
+ self.Wq = nn.Linear(d_model_size, d_model_size)
+ self.Wk = nn.Linear(d_model_size, d_model_size)
+ self.Wv = nn.Linear(d_model_size, d_model_size)
+
+ self.dense = nn.Linear(d_model_size, d_model_size)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ attention_head_size = self.d_model_size // self.num_heads
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, attention_head_size, self.pruned_heads)
+
+ # Prune linear layers
+ self.Wq = prune_linear_layer(self.Wq, index)
+ self.Wk = prune_linear_layer(self.Wk, index)
+ self.Wv = prune_linear_layer(self.Wv, index)
+ self.dense = prune_linear_layer(self.dense, index, dim=1)
+
+ # Update hyper params
+ self.num_heads = self.num_heads - len(heads)
+ self.d_model_size = attention_head_size * self.num_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def split_into_heads(self, x, batch_size):
+ x = x.reshape(batch_size, -1, self.num_heads, self.depth)
+ return x.permute([0, 2, 1, 3])
+
+ def forward(
+ self,
+ v,
+ k,
+ q,
+ mask,
+ layer_past=None,
+ attention_mask=None,
+ head_mask=None,
+ use_cache=False,
+ output_attentions=False,
+ ):
+ batch_size = q.shape[0]
+
+ q = self.Wq(q)
+ k = self.Wk(k)
+ v = self.Wv(v)
+
+ q = self.split_into_heads(q, batch_size)
+ k = self.split_into_heads(k, batch_size)
+ v = self.split_into_heads(v, batch_size)
+ if layer_past is not None:
+ past_key, past_value = layer_past[0], layer_past[1]
+ k = torch.cat((past_key, k), dim=-2)
+ v = torch.cat((past_value, v), dim=-2)
+
+ if use_cache is True:
+ present = torch.stack((k, v))
+ else:
+ present = (None,)
+
+ output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
+ scaled_attention = output[0].permute([0, 2, 1, 3])
+ attn = output[1]
+ original_size_attention = scaled_attention.reshape(batch_size, -1, self.d_model_size)
+ output = self.dense(original_size_attention)
+
+ outputs = (output, present)
+ if output_attentions:
+ outputs = outputs + (attn,)
+ return outputs
+
+
+def point_wise_feed_forward_network(d_model_size, dff):
+ return nn.Sequential(nn.Linear(d_model_size, dff), nn.ReLU(), nn.Linear(dff, d_model_size))
+
+
+class EncoderLayer(nn.Module):
+ def __init__(self, d_model_size, num_heads, dff, rate=0.1):
+ super().__init__()
+
+ self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads)
+ self.ffn = point_wise_feed_forward_network(d_model_size, dff)
+
+ self.layernorm1 = nn.LayerNorm(d_model_size, eps=1e-6)
+ self.layernorm2 = nn.LayerNorm(d_model_size, eps=1e-6)
+
+ self.dropout1 = nn.Dropout(rate)
+ self.dropout2 = nn.Dropout(rate)
+
+ def forward(
+ self, x, mask, layer_past=None, attention_mask=None, head_mask=None, use_cache=False, output_attentions=False
+ ):
+ normed = self.layernorm1(x)
+ attn_outputs = self.multi_head_attention(
+ normed,
+ normed,
+ normed,
+ mask,
+ layer_past=layer_past,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ )
+ attn_output = attn_outputs[0]
+ attn_output = self.dropout1(attn_output)
+ out1 = x + attn_output
+
+ out2 = self.layernorm2(out1)
+ ffn_output = self.ffn(out2)
+ ffn_output = self.dropout2(ffn_output)
+ out2 = out1 + ffn_output
+
+ outputs = (out2,) + attn_outputs[1:]
+ return outputs
+
+
+class CTRLPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = CTRLConfig
+ base_model_prefix = "transformer"
+
+ def _init_weights(self, module):
+ """Initialize the weights."""
+ if isinstance(module, (nn.Linear, Conv1D)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+CTRL_START_DOCSTRING = r"""
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`CTRLConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CTRL_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
+ (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
+
+ If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
+ `input_ids`.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+ [`PreTrainedTokenizer.encode`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ past_key_values (`Tuple[Tuple[torch.FloatTensor]]` of length `config.n_layers`):
+ Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
+ `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
+ their past given to this model should not be passed as input ids as they have already been computed.
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
+ CTRL_START_DOCSTRING,
+)
+class CTRLModel(CTRLPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.d_model_size = config.n_embd
+ self.num_layers = config.n_layer
+
+ self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size, torch.float)
+
+ self.w = nn.Embedding(config.vocab_size, config.n_embd)
+
+ self.dropout = nn.Dropout(config.embd_pdrop)
+ self.h = nn.ModuleList(
+ [EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop) for _ in range(config.n_layer)]
+ )
+ self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.w
+
+ def set_input_embeddings(self, new_embeddings):
+ self.w = new_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+ """
+ for layer, heads in heads_to_prune.items():
+ self.h[layer].multi_head_attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs, # NOOP kwargs, for now
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPast]:
+ r"""
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CTRLModel
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
+ >>> model = CTRLModel.from_pretrained("Salesforce/ctrl")
+
+ >>> # CTRL was trained with control codes as the first token
+ >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
+ >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()
+
+ >>> outputs = model(**inputs)
+
+ >>> last_hidden_states = outputs.last_hidden_state
+ >>> list(last_hidden_states.shape)
+ [1, 5, 1280]
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+ batch_size = input_ids.shape[0]
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ batch_size = inputs_embeds.shape[0]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+ if past_key_values is None:
+ past_length = 0
+ past_key_values = tuple([None] * len(self.h))
+ else:
+ past_length = past_key_values[0][0].size(-2)
+ if position_ids is None:
+ position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
+ position_ids = position_ids.unsqueeze(0)
+
+ # Attention mask.
+ if attention_mask is not None:
+ if batch_size <= 0:
+ raise ValueError("batch_size has to be defined and > 0")
+ attention_mask = attention_mask.view(batch_size, -1)
+ # We create a 3D attention mask from a 2D tensor mask.
+ # Sizes are [batch_size, 1, 1, to_seq_length]
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+ # this attention mask is more simple than the triangular masking of causal attention
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+ attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and the dtype's smallest value for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility
+ attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
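+ # After the two lines above, positions to attend contribute 0.0 to the logits while masked
+ # positions contribute the dtype minimum (roughly -3.4e38 for fp32, -65504 for fp16), so the
+ # subsequent softmax assigns them effectively zero probability.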
+
+ # Prepare head mask if needed
+ head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+
+ if token_type_ids is not None:
+ token_type_ids = token_type_ids.view(-1, input_shape[-1])
+ token_type_embeds = self.w(token_type_ids)
+ token_type_embeds *= np.sqrt(self.d_model_size)
+ else:
+ token_type_embeds = 0
+
+ if inputs_embeds is None:
+ inputs_embeds = self.w(input_ids)
+ # inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded
+ seq_len = input_shape[-1]
+ mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(device)
+
+ inputs_embeds *= np.sqrt(self.d_model_size)
+
+ # `self.pos_encoding` won't be sent to the correct device along the model, so we do it manually.
+ self.pos_encoding = self.pos_encoding.to(device)
+ pos_embeds = self.pos_encoding[position_ids, :]
+
+ hidden_states = inputs_embeds + pos_embeds + token_type_embeds
+
+ hidden_states = self.dropout(hidden_states)
+
+ presents = () if use_cache else None
+ all_hidden_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+ for i, (h, layer_past) in enumerate(zip(self.h, past_key_values)):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+ outputs = h(
+ hidden_states,
+ mask,
+ layer_past=layer_past,
+ attention_mask=attention_mask,
+ head_mask=head_mask[i],
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ )
+ hidden_states, present = outputs[:2]
+ if use_cache is True:
+ presents = presents + (present,)
+
+ if output_attentions:
+ all_attentions += (outputs[2],)
+
+ hidden_states = self.layernorm(hidden_states)
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)
+
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=presents,
+ hidden_states=all_hidden_states,
+ attentions=all_attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
+ embeddings).
+ """,
+ CTRL_START_DOCSTRING,
+)
+class CTRLLMHeadModel(CTRLPreTrainedModel, GenerationMixin):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.transformer = CTRLModel(config)
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+ `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
+ are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> import torch
+ >>> from transformers import AutoTokenizer, CTRLLMHeadModel
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
+ >>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")
+
+ >>> # CTRL was trained with control codes as the first token
+ >>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt")
+ >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()
+
+ >>> sequence_ids = model.generate(inputs["input_ids"])
+ >>> sequences = tokenizer.batch_decode(sequence_ids)
+ >>> sequences
+ ['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,']
+
+ >>> outputs = model(**inputs, labels=inputs["input_ids"])
+ >>> round(outputs.loss.item(), 2)
+ 9.21
+
+ >>> list(outputs.logits.shape)
+ [1, 5, 246534]
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.transformer(
+ input_ids,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = transformer_outputs[0]
+
+ lm_logits = self.lm_head(hidden_states)
+
+ loss = None
+ if labels is not None:
+ loss = self.loss_function(
+ lm_logits,
+ labels,
+ vocab_size=self.config.vocab_size,
+ **kwargs,
+ )
+
+ if not return_dict:
+ output = (lm_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=lm_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_cache=None, **kwargs):
+ # Overwritten -- inputs_embeds not working properly
+
+ # only last tokens for inputs_ids if past is defined in kwargs
+ if past_key_values is not None:
+ past_length = past_key_values[0][0].shape[2]
+
+ # Some generation methods already pass only the last input ID
+ if input_ids.shape[1] > past_length:
+ remove_prefix_length = past_length
+ else:
+ # Default to old behavior: keep only final ID
+ remove_prefix_length = input_ids.shape[1] - 1
+
+ input_ids = input_ids[:, remove_prefix_length:]
+
+ return {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache}
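+ # Sketch of the trimming above: with a cached past of length 4 and input_ids that have grown
+ # to [[7, 8, 9, 10, 11]], only the single new id [[11]] is fed to the model on the next step;
+ # without a cache the prompt is passed through unchanged.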
+
+ @staticmethod
+ def _reorder_cache(
+ past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
+ ) -> Tuple[Tuple[torch.Tensor]]:
+ """
+ This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
+ [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
+ beam_idx at every generation step.
+ """
+ return tuple(
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
+ for layer_past in past_key_values
+ )
+
+
+@add_start_docstrings(
+ """
+ The CTRL Model transformer with a sequence classification head on top (linear layer).
+ [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+ (e.g. GPT-2) do. Since it does classification on the last token, it needs to know the position of the last
+ token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
+ each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
+ guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
+ value in each row of the batch).
+ """,
+ CTRL_START_DOCSTRING,
+)
+class CTRLForSequenceClassification(CTRLPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.transformer = CTRLModel(config)
+ self.classifier = nn.Linear(config.n_embd, self.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+ Returns:
+
+ Example of single-label classification:
+
+ ```python
+ >>> import torch
+ >>> from transformers import AutoTokenizer, CTRLForSequenceClassification
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
+ >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl")
+
+ >>> # CTRL was trained with control codes as the first token
+ >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
+ >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()
+
+ >>> with torch.no_grad():
+ ... logits = model(**inputs).logits
+
+ >>> predicted_class_id = logits.argmax().item()
+ >>> model.config.id2label[predicted_class_id]
+ 'LABEL_0'
+ ```
+
+ ```python
+ >>> import torch
+
+ >>> torch.manual_seed(42) # doctest: +IGNORE_RESULT
+ >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
+ >>> num_labels = len(model.config.id2label)
+ >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)
+
+ >>> labels = torch.tensor(1)
+ >>> loss = model(**inputs, labels=labels).loss
+ >>> round(loss.item(), 2)
+ 0.93
+ ```
+
+ Example of multi-label classification:
+
+ ```python
+ >>> import torch
+ >>> from transformers import AutoTokenizer, CTRLForSequenceClassification
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
+ >>> model = CTRLForSequenceClassification.from_pretrained(
+ ... "Salesforce/ctrl", problem_type="multi_label_classification"
+ ... )
+
+ >>> # CTRL was trained with control codes as the first token
+ >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
+ >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()
+
+ >>> with torch.no_grad():
+ ... logits = model(**inputs).logits
+
+ >>> predicted_class_id = logits.argmax().item()
+ >>> model.config.id2label[predicted_class_id]
+ 'LABEL_0'
+ ```
+
+ ```python
+ >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
+ >>> num_labels = len(model.config.id2label)
+ >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)
+
+ >>> num_labels = len(model.config.id2label)
+ >>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
+ ... torch.float
+ ... )
+ >>> loss = model(**inputs, labels=labels).loss
+ >>> loss.backward() # doctest: +IGNORE_RESULT
+ ```"""
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.transformer(
+ input_ids,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = transformer_outputs[0]
+ logits = self.classifier(hidden_states)
+
+ if input_ids is not None:
+ batch_size, sequence_length = input_ids.shape[:2]
+ else:
+ batch_size, sequence_length = inputs_embeds.shape[:2]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ last_non_pad_token = -1
+ elif input_ids is not None:
+ # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+ non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
+ token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+ last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
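+ # Example of the rightmost-non-pad lookup above: for input_ids [5, 5, 5, pad, pad] the mask
+ # is [1, 1, 1, 0, 0], token_indices * mask is [0, 1, 2, 0, 0] and argmax(-1) = 2; with left
+ # padding [pad, pad, 5, 5, 5] the same expression yields 4, so both padding sides work.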
+ else:
+ last_non_pad_token = -1
+ logger.warning_once(
+ f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+ "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+ )
+
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
+
+ loss = None
+ if labels is not None:
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss,
+ logits=pooled_logits,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+
+__all__ = ["CTRLForSequenceClassification", "CTRLLMHeadModel", "CTRLModel", "CTRLPreTrainedModel"]
diff --git a/docs/transformers/build/lib/transformers/models/ctrl/modeling_tf_ctrl.py b/docs/transformers/build/lib/transformers/models/ctrl/modeling_tf_ctrl.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f35076ff9496c5d96f7265a72f2db0c729fb82e
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/ctrl/modeling_tf_ctrl.py
@@ -0,0 +1,922 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 CTRL model."""
+
+from __future__ import annotations
+
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import tensorflow as tf
+
+from ...modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast, TFSequenceClassifierOutput
+from ...modeling_tf_utils import (
+ TFCausalLanguageModelingLoss,
+ TFModelInputType,
+ TFPreTrainedModel,
+ TFSequenceClassificationLoss,
+ get_initializer,
+ keras,
+ keras_serializable,
+ unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_ctrl import CTRLConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "Salesforce/ctrl"
+_CONFIG_FOR_DOC = "CTRLConfig"
+
+
+def angle_defn(pos, i, d_model_size):
+ angle_rates = 1 / np.power(10000, (2 * (i // 2)) / d_model_size)
+ return pos * angle_rates
+
+
+def positional_encoding(position, d_model_size):
+ # create the sinusoidal pattern for the positional encoding
+ angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size)
+
+ sines = np.sin(angle_rads[:, 0::2])
+ cosines = np.cos(angle_rads[:, 1::2])
+ pos_encoding = tf.convert_to_tensor(np.concatenate([sines, cosines], axis=-1))
+
+ return pos_encoding
+
+
+def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
+ # calculate attention
+ matmul_qk = tf.matmul(q, k, transpose_b=True)
+
+ dk = tf.cast(shape_list(k)[-1], dtype=matmul_qk.dtype)
+ scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
+
+ if mask is not None:
+ scaled_attention_logits += tf.cast(mask * -1e4, dtype=scaled_attention_logits.dtype)
+
+ if attention_mask is not None:
+ # Apply the attention mask
+ attention_mask = tf.cast(attention_mask, dtype=scaled_attention_logits.dtype)
+ scaled_attention_logits = scaled_attention_logits + attention_mask
+
+ attention_weights = stable_softmax(scaled_attention_logits, axis=-1)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_weights = attention_weights * head_mask
+
+ output = tf.matmul(attention_weights, v)
+
+ return output, attention_weights
+
+
+class TFMultiHeadAttention(keras.layers.Layer):
+ def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
+ super().__init__(**kwargs)
+ self.num_heads = num_heads
+ self.d_model_size = d_model_size
+ self.output_attentions = output_attentions
+
+ self.depth = int(d_model_size / self.num_heads)
+
+ self.Wq = keras.layers.Dense(d_model_size, name="Wq")
+ self.Wk = keras.layers.Dense(d_model_size, name="Wk")
+ self.Wv = keras.layers.Dense(d_model_size, name="Wv")
+
+ self.dense = keras.layers.Dense(d_model_size, name="dense")
+
+ def split_into_heads(self, x, batch_size):
+ x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
+ return tf.transpose(x, perm=[0, 2, 1, 3])
+
+ def call(self, v, k, q, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
+ batch_size = shape_list(q)[0]
+
+ q = self.Wq(q)
+ k = self.Wk(k)
+ v = self.Wv(v)
+
+ q = self.split_into_heads(q, batch_size)
+ k = self.split_into_heads(k, batch_size)
+ v = self.split_into_heads(v, batch_size)
+
+ if layer_past is not None:
+ past_key, past_value = tf.unstack(layer_past, axis=0)
+ k = tf.concat((past_key, k), axis=-2)
+ v = tf.concat((past_value, v), axis=-2)
+
+ if use_cache:
+ present = tf.stack((k, v), axis=0)
+ else:
+ present = (None,)
+
+ output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
+ scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3])
+ attn = output[1]
+ original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size))
+ output = self.dense(original_size_attention)
+ outputs = (output, present)
+
+ if output_attentions:
+ outputs = outputs + (attn,)
+
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "Wq", None) is not None:
+ with tf.name_scope(self.Wq.name):
+ self.Wq.build([None, None, self.d_model_size])
+ if getattr(self, "Wk", None) is not None:
+ with tf.name_scope(self.Wk.name):
+ self.Wk.build([None, None, self.d_model_size])
+ if getattr(self, "Wv", None) is not None:
+ with tf.name_scope(self.Wv.name):
+ self.Wv.build([None, None, self.d_model_size])
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.d_model_size])
+
+
+class TFPointWiseFeedForwardLayer(keras.layers.Layer):
+ def __init__(self, d_model_size, dff, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense_0 = keras.layers.Dense(dff, activation="relu", name="0")
+ self.dense_2 = keras.layers.Dense(d_model_size, name="2")
+ self.d_model_size = d_model_size
+ self.dff = dff
+
+ def call(self, inputs, trainable=False):
+ dense_0_output = self.dense_0(inputs)
+ dense_2_output = self.dense_2(dense_0_output)
+
+ return dense_2_output
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense_0", None) is not None:
+ with tf.name_scope(self.dense_0.name):
+ self.dense_0.build([None, None, self.d_model_size])
+ if getattr(self, "dense_2", None) is not None:
+ with tf.name_scope(self.dense_2.name):
+ self.dense_2.build([None, None, self.dff])
+
+
+class TFEncoderLayer(keras.layers.Layer):
+ def __init__(
+ self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
+ ):
+ super().__init__(**kwargs)
+
+ self.output_attentions = output_attentions
+
+ self.multi_head_attention = TFMultiHeadAttention(
+ d_model_size, num_heads, output_attentions=self.output_attentions, name="multi_head_attention"
+ )
+ self.ffn = TFPointWiseFeedForwardLayer(d_model_size, dff, name="ffn")
+
+ self.layernorm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
+ self.layernorm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2")
+
+ self.dropout1 = keras.layers.Dropout(rate)
+ self.dropout2 = keras.layers.Dropout(rate)
+ self.d_model_size = d_model_size
+
+ def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
+ normed = self.layernorm1(x)
+ attn_outputs = self.multi_head_attention(
+ normed,
+ normed,
+ normed,
+ mask,
+ layer_past,
+ attention_mask,
+ head_mask,
+ use_cache,
+ output_attentions,
+ training=training,
+ )
+ attn_output = attn_outputs[0]
+ attn_output = self.dropout1(attn_output, training=training)
+ out1 = x + attn_output
+
+ out2 = self.layernorm2(out1)
+ ffn_output = self.ffn(out2)
+ ffn_output = self.dropout2(ffn_output, training=training)
+ out2 = out1 + ffn_output
+
+ outputs = (out2,) + attn_outputs[1:]
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "multi_head_attention", None) is not None:
+ with tf.name_scope(self.multi_head_attention.name):
+ self.multi_head_attention.build(None)
+ if getattr(self, "ffn", None) is not None:
+ with tf.name_scope(self.ffn.name):
+ self.ffn.build(None)
+ if getattr(self, "layernorm1", None) is not None:
+ with tf.name_scope(self.layernorm1.name):
+ self.layernorm1.build([None, None, self.d_model_size])
+ if getattr(self, "layernorm2", None) is not None:
+ with tf.name_scope(self.layernorm2.name):
+ self.layernorm2.build([None, None, self.d_model_size])
+
+
+@keras_serializable
+class TFCTRLMainLayer(keras.layers.Layer):
+ config_class = CTRLConfig
+
+ def __init__(self, config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.config = config
+ self.output_hidden_states = config.output_hidden_states
+ self.output_attentions = config.output_attentions
+ self.use_cache = config.use_cache
+ self.return_dict = config.use_return_dict
+
+ self.d_model_size = config.n_embd
+ self.num_layers = config.n_layer
+
+ self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size)
+
+ self.w = keras.layers.Embedding(
+ input_dim=config.vocab_size,
+ output_dim=config.n_embd,
+ embeddings_initializer=get_initializer(config.initializer_range),
+ name="w",
+ )
+
+ self.dropout = keras.layers.Dropout(config.embd_pdrop)
+ self.h = [
+ TFEncoderLayer(
+ config.n_embd,
+ config.n_head,
+ config.dff,
+ config.resid_pdrop,
+ config.layer_norm_epsilon,
+ self.output_attentions,
+ name=f"h_._{i}",
+ )
+ for i in range(config.n_layer)
+ ]
+ self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")
+
+ def get_input_embeddings(self):
+ return self.w
+
+ def set_input_embeddings(self, new_embeddings):
+ self.w = new_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+ """
+ raise NotImplementedError
+
+ @unpack_inputs
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ head_mask: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: Optional[bool] = False,
+ ) -> Union[Tuple, TFBaseModelOutputWithPast]:
+ # If using past key value states, only the last tokens
+ # should be given as an input
+ if past_key_values is not None:
+ if input_ids is not None:
+ input_ids = input_ids[:, -1:]
+ if inputs_embeds is not None:
+ inputs_embeds = inputs_embeds[:, -1:]
+ if token_type_ids is not None:
+ token_type_ids = token_type_ids[:, -1:]
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ input_shape = shape_list(input_ids)
+ input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
+ elif inputs_embeds is not None:
+ input_shape = shape_list(inputs_embeds)[:-1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if past_key_values is None:
+ past_length = 0
+ past_key_values = [None] * len(self.h)
+ else:
+ past_length = shape_list(past_key_values[0][0])[-2]
+ if position_ids is None:
+ position_ids = tf.expand_dims(tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32), axis=0)
+ position_ids = tf.tile(position_ids, [input_shape[0], 1])
+
+ # Attention mask.
+ if attention_mask is not None:
+ # We create a 3D attention mask from a 2D tensor mask.
+ # Sizes are [batch_size, 1, 1, to_seq_length]
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+ # this attention mask is more simple than the triangular masking of causal attention
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+ attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1] + past_length))
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and -10000.0 for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
+
+ one_cst = tf.constant(1.0)
+ ten_thousand_cst = tf.constant(-10000.0)
+ attention_mask = tf.cast(attention_mask, dtype=one_cst.dtype)
+ attention_mask = tf.multiply(tf.subtract(one_cst, attention_mask), ten_thousand_cst)
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # head_mask has shape n_layer x batch x n_heads x N x N
+ if head_mask is not None:
+ raise NotImplementedError
+ else:
+ head_mask = [None] * self.num_layers
+
+ if token_type_ids is not None:
+ token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
+ token_type_embeds = self.w(token_type_ids)
+ token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, dtype=token_type_embeds.dtype))
+ else:
+ token_type_embeds = tf.constant(0.0)
+ position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
+
+ if inputs_embeds is None:
+ check_embeddings_within_bounds(input_ids, self.w.input_dim)
+ inputs_embeds = self.w(input_ids)
+ seq_len = input_shape[-1]
+ mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
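+ # `tf.linalg.band_part(ones, -1, 0)` keeps the lower triangle including the diagonal, so
+ # `1 - band_part(...)` is 1 strictly above the diagonal: those future positions are the ones
+ # scaled by -1e4 inside scaled_dot_product_attention.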
+
+ inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, inputs_embeds.dtype))
+
+ pos_embeds = tf.gather(self.pos_encoding, position_ids)
+ pos_embeds = tf.cast(pos_embeds, dtype=token_type_embeds.dtype)
+ hidden_states = inputs_embeds + pos_embeds + token_type_embeds
+
+ hidden_states = self.dropout(hidden_states, training=training)
+
+ output_shape = input_shape + [shape_list(hidden_states)[-1]]
+ presents = () if use_cache else None
+ all_hidden_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+ for i, (h, layer_past) in enumerate(zip(self.h, past_key_values)):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
+ outputs = h(
+ hidden_states,
+ mask,
+ layer_past,
+ attention_mask,
+ head_mask[i],
+ use_cache,
+ output_attentions,
+ training=training,
+ )
+ hidden_states, present = outputs[:2]
+
+ if use_cache:
+ presents = presents + (present,)
+
+ if output_attentions:
+ all_attentions = all_attentions + (outputs[2],)
+
+ hidden_states = self.layernorm(hidden_states)
+ hidden_states = tf.reshape(hidden_states, output_shape)
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if output_attentions:
+ # let the number of heads free (-1) so we can extract attention even after head pruning
+ attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
+ all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)
+
+ return TFBaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=presents,
+ hidden_states=all_hidden_states,
+ attentions=all_attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "w", None) is not None:
+ with tf.name_scope(self.w.name):
+ self.w.build(None)
+ if getattr(self, "layernorm", None) is not None:
+ with tf.name_scope(self.layernorm.name):
+ self.layernorm.build([None, None, self.config.n_embd])
+ if getattr(self, "h", None) is not None:
+ for layer in self.h:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+
+class TFCTRLPreTrainedModel(TFPreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = CTRLConfig
+ base_model_prefix = "transformer"
+
+
+CTRL_START_DOCSTRING = r"""
+
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+ as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+ behavior.
+
+ <Tip>
+
+ TensorFlow models and layers in `transformers` accept two formats as input:
+
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional argument.
+
+ The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+ and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+ pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+ format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+ the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+ positional argument:
+
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+ Note that when creating models and layers with
+ [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+ about any of this, as you can just pass inputs like you would to any other Python function!
+
+ </Tip>
+
+ Parameters:
+ config ([`CTRLConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CTRL_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, input_ids_length)`):
+ `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
+ (`sequence_length` of
+ input past key value states).
+
+ Indices of input sequence tokens in the vocabulary.
+
+ If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
+ `input_ids`.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+ [`PreTrainedTokenizer.encode`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ past_key_values (`List[tf.Tensor]` of length `config.n_layers`):
+ Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
+ `past_key_values` output below). Can be used to speed up sequential decoding. The token ids which have their past
+ given to this model should not be passed as input ids as they have already been computed.
+ attention_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`tf.Tensor` or `Numpy array` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+ config will be used instead.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+ used instead.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+ eager mode, in graph mode the value will always be set to True.
+ training (`bool`, *optional*, defaults to `False`):
+ Whether or not to use the model in training mode (some modules like dropout modules have different
+ behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+ "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
+ CTRL_START_DOCSTRING,
+)
+class TFCTRLModel(TFCTRLPreTrainedModel):
+ def __init__(self, config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+ self.transformer = TFCTRLMainLayer(config, name="transformer")
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFBaseModelOutputWithPast,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ head_mask: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: Optional[bool] = False,
+ ) -> Union[Tuple, TFBaseModelOutputWithPast]:
+ outputs = self.transformer(
+ input_ids=input_ids,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "transformer", None) is not None:
+ with tf.name_scope(self.transformer.name):
+ self.transformer.build(None)
+
+
+class TFCTRLBiasLayer(keras.layers.Layer):
+ """
+ Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
+ so all weights have to be registered in a layer.
+ """
+
+ def __init__(self, shape, initializer, trainable, name, **kwargs):
+ super().__init__(name=name, **kwargs)
+ self.shape = shape
+ self.initializer = initializer
+ self.trainable = trainable
+
+ def build(self, input_shape):
+ self.bias = self.add_weight(
+ name="bias", shape=self.shape, initializer=self.initializer, trainable=self.trainable
+ )
+ super().build(input_shape)
+
+ def call(self, x):
+ return x + self.bias
+
+
+@add_start_docstrings(
+ """
+ The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
+ embeddings).
+ """,
+ CTRL_START_DOCSTRING,
+)
+class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
+ def __init__(self, config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+ self.transformer = TFCTRLMainLayer(config, name="transformer")
+ self.bias_layer = TFCTRLBiasLayer(
+ name="lm_head", shape=[1, config.vocab_size], initializer="zeros", trainable=True
+ )
+
+ def get_output_embeddings(self):
+ return self.get_input_embeddings()
+
+ def set_output_embeddings(self, value):
+ self.set_input_embeddings(value)
+
+ def get_bias(self):
+ return {"lm_head.bias": self.bias_layer.bias}
+
+ def set_bias(self, value):
+ # Replaces the existing layers containing bias for correct (de)serialization.
+ vocab_size = value["lm_head.bias"].shape[-1]
+ self.bias_layer = TFCTRLBiasLayer(
+ name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=True
+ )
+ self.bias_layer.build(None)
+ self.bias_layer.bias.assign(value["lm_head.bias"])
+
+ # Copied from transformers.models.gpt2.modeling_tf_gpt2.TFGPT2LMHeadModel.prepare_inputs_for_generation
+ def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
+ token_type_ids = kwargs.get("token_type_ids", None)
+ # only last token for inputs_ids if past is defined in kwargs
+ if past_key_values:
+ inputs = tf.expand_dims(inputs[:, -1], -1)
+ if token_type_ids is not None:
+ token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1)
+
+ position_ids = kwargs.get("position_ids", None)
+ attention_mask = kwargs.get("attention_mask", None)
+
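+ # Illustrative note: tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
+ # turns a mask row [1, 1, 1, 0] into position ids [0, 1, 2, 3]; when a past is
+ # present, only the last column is kept for the newly fed token.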
+ if attention_mask is not None and position_ids is None:
+ position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
+ if past_key_values:
+ position_ids = tf.expand_dims(position_ids[:, -1], -1)
+
+ return {
+ "input_ids": inputs,
+ "attention_mask": attention_mask,
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": use_cache,
+ "token_type_ids": token_type_ids,
+ }
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFCausalLMOutputWithPast,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ head_mask: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: Optional[bool] = False,
+ ) -> Union[Tuple, TFCausalLMOutputWithPast]:
+ r"""
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
+ config.vocab_size - 1]`.
+ """
+ transformer_outputs = self.transformer(
+ input_ids=input_ids,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = tf.matmul(hidden_states, self.transformer.w.weights, transpose_b=True)
+ logits = self.bias_layer(logits)
+
+ loss = None
+ if labels is not None:
+ # shift labels to the left and cut last logit token
+ shifted_logits = logits[:, :-1]
+ labels = labels[:, 1:]
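+ # e.g. for labels [a, b, c, d], the logits at positions [0, 1, 2] are scored
+ # against the next-token targets [b, c, d]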
+ loss = self.hf_compute_loss(labels, shifted_logits)
+
+ if not return_dict:
+ output = (logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFCausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "transformer", None) is not None:
+ with tf.name_scope(self.transformer.name):
+ self.transformer.build(None)
+ if getattr(self, "bias_layer", None) is not None:
+ with tf.name_scope(self.bias_layer.name):
+ self.bias_layer.build(None)
+
+
+@add_start_docstrings(
+ """
+ The CTRL Model transformer with a sequence classification head on top (linear layer).
+
+ [`TFCTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+ (e.g. GPT-1, GPT-2) do.
+
+ Since it does classification on the last token, it requires to know the position of the last token. If a
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+ each row of the batch).
+ """,
+ CTRL_START_DOCSTRING,
+)
+class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassificationLoss):
+ def __init__(self, config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+ self.num_labels = config.num_labels
+ self.classifier = keras.layers.Dense(
+ config.num_labels,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="classifier",
+ use_bias=False,
+ )
+ self.transformer = TFCTRLMainLayer(config, name="transformer")
+ self.config = config
+
+ def get_output_embeddings(self):
+ # Remove after transformers v4.32. Fix this model's `test_model_common_attributes` test too.
+ logger.warning(
+ "Sequence classification models do not have output embeddings. `.get_output_embeddings` will be removed "
+ "in transformers v4.32."
+ )
+ return self.transformer.w
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFSequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ head_mask: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: Optional[bool] = False,
+ ) -> Union[Tuple, TFSequenceClassifierOutput]:
+ r"""
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`.
+ """
+
+ transformer_outputs = self.transformer(
+ input_ids=input_ids,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = self.classifier(hidden_states)
+ logits_shape = shape_list(logits)
+ batch_size = logits_shape[0]
+
+ if self.config.pad_token_id is None:
+ last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1)
+ else:
+ if input_ids is not None:
+ token_indices = tf.range(shape_list(input_ids)[-1])
+ non_pad_mask = tf.cast(input_ids != self.config.pad_token_id, token_indices.dtype)
+ last_non_pad_token = tf.reduce_max(token_indices * non_pad_mask, axis=-1)
+ else:
+ last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1)
+ logger.warning_once(
+ f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+ "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+ )
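+ # Illustrative example: with pad_token_id = 9 and an input_ids row [12, 40, 9, 9],
+ # non_pad_mask is [1, 1, 0, 0], so last_non_pad_token is 1 and the logits at
+ # position 1 are pooled below.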
+ loss = None
+
+ pooled_logits = tf.gather(logits, last_non_pad_token, batch_dims=1, axis=1)
+
+ if labels is not None:
+ if self.config.pad_token_id is None and logits_shape[0] != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+
+ loss = self.hf_compute_loss(tf.reshape(labels, [-1]), tf.reshape(pooled_logits, [-1, self.num_labels]))
+
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFSequenceClassifierOutput(
+ loss=loss,
+ logits=pooled_logits,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "classifier", None) is not None:
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, self.config.n_embd])
+ if getattr(self, "transformer", None) is not None:
+ with tf.name_scope(self.transformer.name):
+ self.transformer.build(None)
+
+
+__all__ = ["TFCTRLForSequenceClassification", "TFCTRLLMHeadModel", "TFCTRLModel", "TFCTRLPreTrainedModel"]
diff --git a/docs/transformers/build/lib/transformers/models/ctrl/tokenization_ctrl.py b/docs/transformers/build/lib/transformers/models/ctrl/tokenization_ctrl.py
new file mode 100644
index 0000000000000000000000000000000000000000..66dae2b05fa620152757fc4df3b64e2ad7a3bea5
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/ctrl/tokenization_ctrl.py
@@ -0,0 +1,251 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for Salesforce CTRL."""
+
+import json
+import os
+from typing import Optional, Tuple
+
+import regex as re
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "vocab_file": "vocab.json",
+ "merges_file": "merges.txt",
+}
+
+
+CONTROL_CODES = {
+ "Pregnancy": 168629,
+ "Christianity": 7675,
+ "Explain": 106423,
+ "Fitness": 63440,
+ "Saving": 63163,
+ "Ask": 27171,
+ "Ass": 95985,
+ "Joke": 163509,
+ "Questions": 45622,
+ "Thoughts": 49605,
+ "Retail": 52342,
+ "Feminism": 164338,
+ "Writing": 11992,
+ "Atheism": 192263,
+ "Netflix": 48616,
+ "Computing": 39639,
+ "Opinion": 43213,
+ "Alone": 44967,
+ "Funny": 58917,
+ "Gaming": 40358,
+ "Human": 4088,
+ "India": 1331,
+ "Joker": 77138,
+ "Diet": 36206,
+ "Legal": 11859,
+ "Norman": 4939,
+ "Tip": 72689,
+ "Weight": 52343,
+ "Movies": 46273,
+ "Running": 23425,
+ "Science": 2090,
+ "Horror": 37793,
+ "Confession": 60572,
+ "Finance": 12250,
+ "Politics": 16360,
+ "Scary": 191985,
+ "Support": 12654,
+ "Technologies": 32516,
+ "Teenage": 66160,
+ "Event": 32769,
+ "Learned": 67460,
+ "Notion": 182770,
+ "Wikipedia": 37583,
+ "Books": 6665,
+ "Extract": 76050,
+ "Confessions": 102701,
+ "Conspiracy": 75932,
+ "Links": 63674,
+ "Narcissus": 150425,
+ "Relationship": 54766,
+ "Relationships": 134796,
+ "Reviews": 41671,
+ "News": 4256,
+ "Translation": 26820,
+ "multilingual": 128406,
+}
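+
+# Illustrative note (not part of the library API): CTRL conditions generation on
+# a control code prepended to the prompt, e.g. a prompt such as "Links Hello
+# world" starts with the "Links" code (id 63674), assuming the tokenizer maps
+# the bare word to that vocabulary entry.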
+
+
+def get_pairs(word):
+ """
+ Return set of symbol pairs in a word.
+
+ Word is represented as tuple of symbols (symbols being variable-length strings).
+ """
+ pairs = set()
+ prev_char = word[0]
+ for char in word[1:]:
+ pairs.add((prev_char, char))
+ prev_char = char
+
+ return pairs
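+
+# For example, get_pairs(("h", "e", "l", "l", "o</w>")) returns
+# {("h", "e"), ("e", "l"), ("l", "l"), ("l", "o</w>")}.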
+
+
+class CTRLTokenizer(PreTrainedTokenizer):
+ """
+ Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ control_codes = CONTROL_CODES
+
+ def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
+ self.encoder = json.load(vocab_handle)
+ self.decoder = {v: k for k, v in self.encoder.items()}
+ with open(merges_file, encoding="utf-8") as merges_handle:
+ merges = merges_handle.read().split("\n")[1:-1]
+ merges = [tuple(merge.split()) for merge in merges]
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
+ self.cache = {}
+ super().__init__(unk_token=unk_token, **kwargs)
+
+ @property
+ def vocab_size(self):
+ return len(self.encoder)
+
+ def get_vocab(self):
+ return dict(self.encoder, **self.added_tokens_encoder)
+
+ def bpe(self, token):
+ if token in self.cache:
+ return self.cache[token]
+ word = tuple(token)
+ word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
+ pairs = get_pairs(word)
+
+ if not pairs:
+ return token
+
+ while True:
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+ if bigram not in self.bpe_ranks:
+ break
+ first, second = bigram
+ new_word = []
+ i = 0
+ while i < len(word):
+ try:
+ j = word.index(first, i)
+ except ValueError:
+ new_word.extend(word[i:])
+ break
+ else:
+ new_word.extend(word[i:j])
+ i = j
+
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+ new_word.append(first + second)
+ i += 2
+ else:
+ new_word.append(word[i])
+ i += 1
+ new_word = tuple(new_word)
+ word = new_word
+ if len(word) == 1:
+ break
+ else:
+ pairs = get_pairs(word)
+ word = "@@ ".join(word)
+ word = word[:-4]
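+ # For example, if the learned merges reduce "hello" to ("he", "llo</w>"), the
+ # join yields "he@@ llo</w>" and stripping the 4-character "</w>" marker gives
+ # "he@@ llo"; the "@@ " suffix marks non-final subword pieces.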
+ self.cache[token] = word
+ return word
+
+ def _tokenize(self, text):
+ """Tokenize a string."""
+ split_tokens = []
+
+ words = re.findall(r"\S+\n?", text)
+
+ for token in words:
+ split_tokens.extend(list(self.bpe(token).split(" ")))
+ return split_tokens
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.decoder.get(index, self.unk_token)
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ out_string = " ".join(tokens).replace("@@ ", "").strip()
+ return out_string
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+ merge_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+ )
+
+ with open(vocab_file, "w", encoding="utf-8") as f:
+ f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+ index = 0
+ with open(merge_file, "w", encoding="utf-8") as writer:
+ writer.write("#version: 0.2\n")
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+ if index != token_index:
+ logger.warning(
+ f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+ " Please check that the tokenizer is not corrupted!"
+ )
+ index = token_index
+ writer.write(" ".join(bpe_tokens) + "\n")
+ index += 1
+
+ return vocab_file, merge_file
+
+ # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+ # filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens))
+ # tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens)
+ # tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far)
+ # return ''.join(tokens_generated_so_far)
+
+
+__all__ = ["CTRLTokenizer"]
diff --git a/docs/transformers/build/lib/transformers/models/cvt/__init__.py b/docs/transformers/build/lib/transformers/models/cvt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..756aded9e6ad2964ffa5add89af2c4af56071e88
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/cvt/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_cvt import *
+ from .modeling_cvt import *
+ from .modeling_tf_cvt import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/cvt/configuration_cvt.py b/docs/transformers/build/lib/transformers/models/cvt/configuration_cvt.py
new file mode 100644
index 0000000000000000000000000000000000000000..38cba6874f6860e1270aa4aea8166df309b4f014
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/cvt/configuration_cvt.py
@@ -0,0 +1,146 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""CvT model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class CvtConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CvtModel`]. It is used to instantiate a CvT model
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the CvT
+ [microsoft/cvt-13](https://huggingface.co/microsoft/cvt-13) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ patch_sizes (`List[int]`, *optional*, defaults to `[7, 3, 3]`):
+ The kernel size of each encoder's patch embedding.
+ patch_stride (`List[int]`, *optional*, defaults to `[4, 2, 2]`):
+ The stride size of each encoder's patch embedding.
+ patch_padding (`List[int]`, *optional*, defaults to `[2, 1, 1]`):
+ The padding size of each encoder's patch embedding.
+ embed_dim (`List[int]`, *optional*, defaults to `[64, 192, 384]`):
+ Dimension of each of the encoder blocks.
+ num_heads (`List[int]`, *optional*, defaults to `[1, 3, 6]`):
+ Number of attention heads for each attention layer in each block of the Transformer encoder.
+ depth (`List[int]`, *optional*, defaults to `[1, 2, 10]`):
+ The number of layers in each encoder block.
+ mlp_ratio (`List[float]`, *optional*, defaults to `[4.0, 4.0, 4.0]`):
+ Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the
+ encoder blocks.
+ attention_drop_rate (`List[float]`, *optional*, defaults to `[0.0, 0.0, 0.0]`):
+ The dropout ratio for the attention probabilities.
+ drop_rate (`List[float]`, *optional*, defaults to `[0.0, 0.0, 0.0]`):
+ The dropout ratio for the patch embeddings probabilities.
+ drop_path_rate (`List[float]`, *optional*, defaults to `[0.0, 0.0, 0.1]`):
+ The dropout probability for stochastic depth, used in the blocks of the Transformer encoder.
+ qkv_bias (`List[bool]`, *optional*, defaults to `[True, True, True]`):
+ Whether to add a bias to the query, key and value projections in the attention layers.
+ cls_token (`List[bool]`, *optional*, defaults to `[False, False, True]`):
+ Whether or not to add a classification token to the output of each of the last 3 stages.
+ qkv_projection_method (`List[str]`, *optional*, defaults to `["dw_bn", "dw_bn", "dw_bn"]`):
+ The projection method for query, key and value. The default is depth-wise convolutions with batch norm; for
+ linear projection use "avg".
+ kernel_qkv (`List[int]`, *optional*, defaults to `[3, 3, 3]`):
+ The kernel size for query, key and value in the attention layer.
+ padding_kv (`List[int]`, *optional*, defaults to `[1, 1, 1]`):
+ The padding size for key and value in the attention layer.
+ stride_kv (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
+ The stride size for key and value in the attention layer.
+ padding_q (`List[int]`, *optional*, defaults to `[1, 1, 1]`):
+ The padding size for query in the attention layer.
+ stride_q (`List[int]`, *optional*, defaults to `[1, 1, 1]`):
+ The stride size for query in the attention layer.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+
+ Example:
+
+ ```python
+ >>> from transformers import CvtConfig, CvtModel
+
+ >>> # Initializing a Cvt msft/cvt style configuration
+ >>> configuration = CvtConfig()
+
+ >>> # Initializing a model (with random weights) from the msft/cvt style configuration
+ >>> model = CvtModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "cvt"
+
+ def __init__(
+ self,
+ num_channels=3,
+ patch_sizes=[7, 3, 3],
+ patch_stride=[4, 2, 2],
+ patch_padding=[2, 1, 1],
+ embed_dim=[64, 192, 384],
+ num_heads=[1, 3, 6],
+ depth=[1, 2, 10],
+ mlp_ratio=[4.0, 4.0, 4.0],
+ attention_drop_rate=[0.0, 0.0, 0.0],
+ drop_rate=[0.0, 0.0, 0.0],
+ drop_path_rate=[0.0, 0.0, 0.1],
+ qkv_bias=[True, True, True],
+ cls_token=[False, False, True],
+ qkv_projection_method=["dw_bn", "dw_bn", "dw_bn"],
+ kernel_qkv=[3, 3, 3],
+ padding_kv=[1, 1, 1],
+ stride_kv=[2, 2, 2],
+ padding_q=[1, 1, 1],
+ stride_q=[1, 1, 1],
+ initializer_range=0.02,
+ layer_norm_eps=1e-12,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.num_channels = num_channels
+ self.patch_sizes = patch_sizes
+ self.patch_stride = patch_stride
+ self.patch_padding = patch_padding
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.depth = depth
+ self.mlp_ratio = mlp_ratio
+ self.attention_drop_rate = attention_drop_rate
+ self.drop_rate = drop_rate
+ self.drop_path_rate = drop_path_rate
+ self.qkv_bias = qkv_bias
+ self.cls_token = cls_token
+ self.qkv_projection_method = qkv_projection_method
+ self.kernel_qkv = kernel_qkv
+ self.padding_kv = padding_kv
+ self.stride_kv = stride_kv
+ self.padding_q = padding_q
+ self.stride_q = stride_q
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+
+
+__all__ = ["CvtConfig"]
diff --git a/docs/transformers/build/lib/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..d39777680b149c55f7512cc428027e2539b98eb1
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py
@@ -0,0 +1,362 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert CvT checkpoints from the original repository.
+
+URL: https://github.com/microsoft/CvT"""
+
+import argparse
+import json
+from collections import OrderedDict
+from pathlib import Path
+
+import torch
+from huggingface_hub import hf_hub_download
+
+from transformers import AutoImageProcessor, CvtConfig, CvtForImageClassification
+
+
+def embeddings(idx):
+ """
+ The function helps in renaming embedding layer weights.
+
+ Args:
+ idx: stage number in original model
+ """
+ embed = []
+ embed.append(
+ (
+ f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.weight",
+ f"stage{idx}.patch_embed.proj.weight",
+ )
+ )
+ embed.append(
+ (
+ f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.bias",
+ f"stage{idx}.patch_embed.proj.bias",
+ )
+ )
+ embed.append(
+ (
+ f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.weight",
+ f"stage{idx}.patch_embed.norm.weight",
+ )
+ )
+ embed.append(
+ (
+ f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.bias",
+ f"stage{idx}.patch_embed.norm.bias",
+ )
+ )
+ return embed
+
+
+def attention(idx, cnt):
+ """
+ The function helps in renaming attention block layer weights.
+
+ Args:
+ idx: stage number in original model
+ cnt: count of blocks in each stage
+ """
+ attention_weights = []
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.convolution.weight",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.conv.weight",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.weight",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.weight",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.bias",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.bias",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_mean",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_mean",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_var",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_var",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.num_batches_tracked",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.num_batches_tracked",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.convolution.weight",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.conv.weight",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.weight",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.weight",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.bias",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.bias",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_mean",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_mean",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_var",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_var",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.num_batches_tracked",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.num_batches_tracked",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.convolution.weight",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.conv.weight",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.weight",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.weight",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.bias",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.bias",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_mean",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_mean",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_var",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_var",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.num_batches_tracked",
+ f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.num_batches_tracked",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.weight",
+ f"stage{idx}.blocks.{cnt}.attn.proj_q.weight",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.bias",
+ f"stage{idx}.blocks.{cnt}.attn.proj_q.bias",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.weight",
+ f"stage{idx}.blocks.{cnt}.attn.proj_k.weight",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.bias",
+ f"stage{idx}.blocks.{cnt}.attn.proj_k.bias",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.weight",
+ f"stage{idx}.blocks.{cnt}.attn.proj_v.weight",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.bias",
+ f"stage{idx}.blocks.{cnt}.attn.proj_v.bias",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.weight",
+ f"stage{idx}.blocks.{cnt}.attn.proj.weight",
+ )
+ )
+ attention_weights.append(
+ (
+ f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.bias",
+ f"stage{idx}.blocks.{cnt}.attn.proj.bias",
+ )
+ )
+ attention_weights.append(
+ (f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.weight", f"stage{idx}.blocks.{cnt}.mlp.fc1.weight")
+ )
+ attention_weights.append(
+ (f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.bias", f"stage{idx}.blocks.{cnt}.mlp.fc1.bias")
+ )
+ attention_weights.append(
+ (f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.weight", f"stage{idx}.blocks.{cnt}.mlp.fc2.weight")
+ )
+ attention_weights.append(
+ (f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.bias", f"stage{idx}.blocks.{cnt}.mlp.fc2.bias")
+ )
+ attention_weights.append(
+ (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.weight", f"stage{idx}.blocks.{cnt}.norm1.weight")
+ )
+ attention_weights.append(
+ (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.bias", f"stage{idx}.blocks.{cnt}.norm1.bias")
+ )
+ attention_weights.append(
+ (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.weight", f"stage{idx}.blocks.{cnt}.norm2.weight")
+ )
+ attention_weights.append(
+ (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.bias", f"stage{idx}.blocks.{cnt}.norm2.bias")
+ )
+ return attention_weights
+
+
+def cls_token(idx):
+ """
+ The function helps in renaming the cls_token weights.
+ """
+ token = []
+ token.append((f"cvt.encoder.stages.{idx}.cls_token", "stage2.cls_token"))
+ return token
+
+
+def final():
+ """
+ The function helps in renaming the final classification layer weights.
+ """
+ head = []
+ head.append(("layernorm.weight", "norm.weight"))
+ head.append(("layernorm.bias", "norm.bias"))
+ head.append(("classifier.weight", "head.weight"))
+ head.append(("classifier.bias", "head.bias"))
+ return head
+
+
+def convert_cvt_checkpoint(cvt_model, image_size, cvt_file_name, pytorch_dump_folder):
+ """
+ Function to convert the original Microsoft CvT checkpoint to a Hugging Face checkpoint.
+ """
+ img_labels_file = "imagenet-1k-id2label.json"
+ num_labels = 1000
+
+ repo_id = "huggingface/label-files"
+ num_labels = num_labels
+ id2label = json.loads(Path(hf_hub_download(repo_id, img_labels_file, repo_type="dataset")).read_text())
+ id2label = {int(k): v for k, v in id2label.items()}
+
+ id2label = id2label
+ label2id = {v: k for k, v in id2label.items()}
+
+ config = config = CvtConfig(num_labels=num_labels, id2label=id2label, label2id=label2id)
+
+ # For depth size 13 (13 = 1+2+10)
+ if cvt_model.rsplit("/", 1)[-1][4:6] == "13":
+ config.depth = [1, 2, 10]
+
+ # For depth size 21 (21 = 1+4+16)
+ elif cvt_model.rsplit("/", 1)[-1][4:6] == "21":
+ config.depth = [1, 4, 16]
+
+ # For wide cvt (similar to wide-resnet) depth size 24 (w24 = 2 + 2 + 20)
+ else:
+ config.depth = [2, 2, 20]
+ config.num_heads = [3, 12, 16]
+ config.embed_dim = [192, 768, 1024]
+
+ model = CvtForImageClassification(config)
+ image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k")
+ image_processor.size["shortest_edge"] = image_size
+ original_weights = torch.load(cvt_file_name, map_location=torch.device("cpu"), weights_only=True)
+
+ huggingface_weights = OrderedDict()
+ list_of_state_dict = []
+
+ for idx in range(len(config.depth)):
+ if config.cls_token[idx]:
+ list_of_state_dict = list_of_state_dict + cls_token(idx)
+ list_of_state_dict = list_of_state_dict + embeddings(idx)
+ for cnt in range(config.depth[idx]):
+ list_of_state_dict = list_of_state_dict + attention(idx, cnt)
+
+ list_of_state_dict = list_of_state_dict + final()
+ for gg in list_of_state_dict:
+ print(gg)
+ for hf_name, original_name in list_of_state_dict:
+ huggingface_weights[hf_name] = original_weights[original_name]
+
+ model.load_state_dict(huggingface_weights)
+ model.save_pretrained(pytorch_dump_folder)
+ image_processor.save_pretrained(pytorch_dump_folder)
+
+
+# Download the weights from zoo: https://1drv.ms/u/s!AhIXJn_J-blW9RzF3rMW7SsLHa8h?e=blQ0Al
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--cvt_model",
+ default="cvt-w24",
+ type=str,
+ help="Name of the cvt model you'd like to convert.",
+ )
+ parser.add_argument(
+ "--image_size",
+ default=384,
+ type=int,
+ help="Input Image Size",
+ )
+ parser.add_argument(
+ "--cvt_file_name",
+ default=r"cvtmodels\CvT-w24-384x384-IN-22k.pth",
+ type=str,
+ help="Input Image Size",
+ )
+ parser.add_argument(
+ "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
+ )
+
+ args = parser.parse_args()
+ convert_cvt_checkpoint(args.cvt_model, args.image_size, args.cvt_file_name, args.pytorch_dump_folder_path)
diff --git a/docs/transformers/build/lib/transformers/models/cvt/modeling_cvt.py b/docs/transformers/build/lib/transformers/models/cvt/modeling_cvt.py
new file mode 100644
index 0000000000000000000000000000000000000000..0088b8b440dcc3e3e82111bf3752e84aa47b413f
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/cvt/modeling_cvt.py
@@ -0,0 +1,727 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch CvT model."""
+
+import collections.abc
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
+from ...modeling_outputs import ImageClassifierOutputWithNoAttention, ModelOutput
+from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import logging
+from .configuration_cvt import CvtConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "CvtConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "microsoft/cvt-13"
+_EXPECTED_OUTPUT_SHAPE = [1, 384, 14, 14]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "microsoft/cvt-13"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
+
+
+@dataclass
+class BaseModelOutputWithCLSToken(ModelOutput):
+ """
+ Base class for model's outputs, with potential hidden states and attentions.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
+ Classification token at the output of the last layer of the model.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ """
+
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ cls_token_value: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+ """
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+ argument.
+ """
+ if drop_prob == 0.0 or not training:
+ return input
+ keep_prob = 1 - drop_prob
+ shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+ random_tensor.floor_() # binarize
+ output = input.div(keep_prob) * random_tensor
+ return output
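+
+# Illustrative example: with drop_prob=0.1 during training, each sample's
+# residual branch is zeroed with probability 0.1 and the survivors are scaled
+# by 1/0.9, which keeps the expected value of the branch unchanged.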
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath
+class CvtDropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
+ super().__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ return drop_path(hidden_states, self.drop_prob, self.training)
+
+ def extra_repr(self) -> str:
+ return "p={}".format(self.drop_prob)
+
+
+class CvtEmbeddings(nn.Module):
+ """
+ Construct the CvT embeddings.
+ """
+
+ def __init__(self, patch_size, num_channels, embed_dim, stride, padding, dropout_rate):
+ super().__init__()
+ self.convolution_embeddings = CvtConvEmbeddings(
+ patch_size=patch_size, num_channels=num_channels, embed_dim=embed_dim, stride=stride, padding=padding
+ )
+ self.dropout = nn.Dropout(dropout_rate)
+
+ def forward(self, pixel_values):
+ hidden_state = self.convolution_embeddings(pixel_values)
+ hidden_state = self.dropout(hidden_state)
+ return hidden_state
+
+
+class CvtConvEmbeddings(nn.Module):
+ """
+ Image to Conv Embedding.
+ """
+
+ def __init__(self, patch_size, num_channels, embed_dim, stride, padding):
+ super().__init__()
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ self.patch_size = patch_size
+ self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=stride, padding=padding)
+ self.normalization = nn.LayerNorm(embed_dim)
+
+ def forward(self, pixel_values):
+ pixel_values = self.projection(pixel_values)
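+ # Illustrative shape check: with the stage-0 defaults (patch_size 7, stride 4,
+ # padding 2, embed_dim 64), a 3x224x224 input maps to a 64x56x56 feature map,
+ # since floor((224 + 2*2 - 7) / 4) + 1 = 56.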
+ batch_size, num_channels, height, width = pixel_values.shape
+ hidden_size = height * width
+ # rearrange "b c h w -> b (h w) c"
+ pixel_values = pixel_values.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
+ if self.normalization:
+ pixel_values = self.normalization(pixel_values)
+ # rearrange "b (h w) c" -> b c h w"
+ pixel_values = pixel_values.permute(0, 2, 1).view(batch_size, num_channels, height, width)
+ return pixel_values
+
+
+class CvtSelfAttentionConvProjection(nn.Module):
+ def __init__(self, embed_dim, kernel_size, padding, stride):
+ super().__init__()
+ self.convolution = nn.Conv2d(
+ embed_dim,
+ embed_dim,
+ kernel_size=kernel_size,
+ padding=padding,
+ stride=stride,
+ bias=False,
+ groups=embed_dim,
+ )
+ self.normalization = nn.BatchNorm2d(embed_dim)
+
+ def forward(self, hidden_state):
+ hidden_state = self.convolution(hidden_state)
+ hidden_state = self.normalization(hidden_state)
+ return hidden_state
+
+
+class CvtSelfAttentionLinearProjection(nn.Module):
+ def forward(self, hidden_state):
+ batch_size, num_channels, height, width = hidden_state.shape
+ hidden_size = height * width
+ # rearrange " b c h w -> b (h w) c"
+ hidden_state = hidden_state.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
+ return hidden_state
+
+
+class CvtSelfAttentionProjection(nn.Module):
+ def __init__(self, embed_dim, kernel_size, padding, stride, projection_method="dw_bn"):
+ super().__init__()
+ if projection_method == "dw_bn":
+ self.convolution_projection = CvtSelfAttentionConvProjection(embed_dim, kernel_size, padding, stride)
+ self.linear_projection = CvtSelfAttentionLinearProjection()
+
+ def forward(self, hidden_state):
+ hidden_state = self.convolution_projection(hidden_state)
+ hidden_state = self.linear_projection(hidden_state)
+ return hidden_state
+
+
+class CvtSelfAttention(nn.Module):
+ def __init__(
+ self,
+ num_heads,
+ embed_dim,
+ kernel_size,
+ padding_q,
+ padding_kv,
+ stride_q,
+ stride_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ with_cls_token=True,
+ **kwargs,
+ ):
+ super().__init__()
+ self.scale = embed_dim**-0.5
+ self.with_cls_token = with_cls_token
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+
+ self.convolution_projection_query = CvtSelfAttentionProjection(
+ embed_dim,
+ kernel_size,
+ padding_q,
+ stride_q,
+ projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method,
+ )
+ self.convolution_projection_key = CvtSelfAttentionProjection(
+ embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
+ )
+ self.convolution_projection_value = CvtSelfAttentionProjection(
+ embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
+ )
+
+ self.projection_query = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+ self.projection_key = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+ self.projection_value = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+
+ self.dropout = nn.Dropout(attention_drop_rate)
+
+ def rearrange_for_multi_head_attention(self, hidden_state):
+ batch_size, hidden_size, _ = hidden_state.shape
+ head_dim = self.embed_dim // self.num_heads
+ # rearrange 'b t (h d) -> b h t d'
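+ # e.g. with embed_dim 384 and 6 heads (the last-stage defaults), a
+ # (batch, tokens, 384) tensor becomes (batch, 6, tokens, 64)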
+ return hidden_state.view(batch_size, hidden_size, self.num_heads, head_dim).permute(0, 2, 1, 3)
+
+ def forward(self, hidden_state, height, width):
+ if self.with_cls_token:
+ cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
+ batch_size, hidden_size, num_channels = hidden_state.shape
+ # rearrange "b (h w) c -> b c h w"
+ hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)
+
+ key = self.convolution_projection_key(hidden_state)
+ query = self.convolution_projection_query(hidden_state)
+ value = self.convolution_projection_value(hidden_state)
+
+ if self.with_cls_token:
+ query = torch.cat((cls_token, query), dim=1)
+ key = torch.cat((cls_token, key), dim=1)
+ value = torch.cat((cls_token, value), dim=1)
+
+ head_dim = self.embed_dim // self.num_heads
+
+ query = self.rearrange_for_multi_head_attention(self.projection_query(query))
+ key = self.rearrange_for_multi_head_attention(self.projection_key(key))
+ value = self.rearrange_for_multi_head_attention(self.projection_value(value))
+
+ attention_score = torch.einsum("bhlk,bhtk->bhlt", [query, key]) * self.scale
+ attention_probs = torch.nn.functional.softmax(attention_score, dim=-1)
+ attention_probs = self.dropout(attention_probs)
+
+ context = torch.einsum("bhlt,bhtv->bhlv", [attention_probs, value])
+ # rearrange"b h t d -> b t (h d)"
+ _, _, hidden_size, _ = context.shape
+ context = context.permute(0, 2, 1, 3).contiguous().view(batch_size, hidden_size, self.num_heads * head_dim)
+ return context
+
+
+class CvtSelfOutput(nn.Module):
+ """
+ The residual connection is defined in CvtLayer instead of here (as is the case with other models), due to the
+ layernorm applied before each block.
+ """
+
+ def __init__(self, embed_dim, drop_rate):
+ super().__init__()
+ self.dense = nn.Linear(embed_dim, embed_dim)
+ self.dropout = nn.Dropout(drop_rate)
+
+ def forward(self, hidden_state, input_tensor):
+ hidden_state = self.dense(hidden_state)
+ hidden_state = self.dropout(hidden_state)
+ return hidden_state
+
+
+class CvtAttention(nn.Module):
+ def __init__(
+ self,
+ num_heads,
+ embed_dim,
+ kernel_size,
+ padding_q,
+ padding_kv,
+ stride_q,
+ stride_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ drop_rate,
+ with_cls_token=True,
+ ):
+ super().__init__()
+ self.attention = CvtSelfAttention(
+ num_heads,
+ embed_dim,
+ kernel_size,
+ padding_q,
+ padding_kv,
+ stride_q,
+ stride_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ with_cls_token,
+ )
+ self.output = CvtSelfOutput(embed_dim, drop_rate)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.attention.num_heads, self.attention.embed_dim // self.attention.num_heads, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.attention.projection_query = prune_linear_layer(self.attention.projection_query, index)
+ self.attention.projection_key = prune_linear_layer(self.attention.projection_key, index)
+ self.attention.projection_value = prune_linear_layer(self.attention.projection_value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ head_dim = self.attention.embed_dim // self.attention.num_heads
+ self.attention.num_heads = self.attention.num_heads - len(heads)
+ self.attention.embed_dim = head_dim * self.attention.num_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(self, hidden_state, height, width):
+ self_output = self.attention(hidden_state, height, width)
+ attention_output = self.output(self_output, hidden_state)
+ return attention_output
+
+
+class CvtIntermediate(nn.Module):
+ def __init__(self, embed_dim, mlp_ratio):
+ super().__init__()
+ self.dense = nn.Linear(embed_dim, int(embed_dim * mlp_ratio))
+ self.activation = nn.GELU()
+
+ def forward(self, hidden_state):
+ hidden_state = self.dense(hidden_state)
+ hidden_state = self.activation(hidden_state)
+ return hidden_state
+
+
+class CvtOutput(nn.Module):
+ def __init__(self, embed_dim, mlp_ratio, drop_rate):
+ super().__init__()
+ self.dense = nn.Linear(int(embed_dim * mlp_ratio), embed_dim)
+ self.dropout = nn.Dropout(drop_rate)
+
+ def forward(self, hidden_state, input_tensor):
+ hidden_state = self.dense(hidden_state)
+ hidden_state = self.dropout(hidden_state)
+ hidden_state = hidden_state + input_tensor
+ return hidden_state
+
+
+class CvtLayer(nn.Module):
+ """
+ CvtLayer, composed of attention layers, normalization and multi-layer perceptrons (MLPs).
+ """
+
+ def __init__(
+ self,
+ num_heads,
+ embed_dim,
+ kernel_size,
+ padding_q,
+ padding_kv,
+ stride_q,
+ stride_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ drop_rate,
+ mlp_ratio,
+ drop_path_rate,
+ with_cls_token=True,
+ ):
+ super().__init__()
+ self.attention = CvtAttention(
+ num_heads,
+ embed_dim,
+ kernel_size,
+ padding_q,
+ padding_kv,
+ stride_q,
+ stride_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ drop_rate,
+ with_cls_token,
+ )
+
+ self.intermediate = CvtIntermediate(embed_dim, mlp_ratio)
+ self.output = CvtOutput(embed_dim, mlp_ratio, drop_rate)
+ self.drop_path = CvtDropPath(drop_prob=drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+ self.layernorm_before = nn.LayerNorm(embed_dim)
+ self.layernorm_after = nn.LayerNorm(embed_dim)
+
+ def forward(self, hidden_state, height, width):
+ self_attention_output = self.attention(
+ self.layernorm_before(hidden_state), # in Cvt, layernorm is applied before self-attention
+ height,
+ width,
+ )
+ attention_output = self_attention_output
+ attention_output = self.drop_path(attention_output)
+
+ # first residual connection
+ hidden_state = attention_output + hidden_state
+
+ # in Cvt, layernorm is also applied after self-attention
+ layer_output = self.layernorm_after(hidden_state)
+ layer_output = self.intermediate(layer_output)
+
+ # second residual connection is done here
+ layer_output = self.output(layer_output, hidden_state)
+ layer_output = self.drop_path(layer_output)
+ return layer_output
+
+
+class CvtStage(nn.Module):
+ def __init__(self, config, stage):
+ super().__init__()
+ self.config = config
+ self.stage = stage
+ if self.config.cls_token[self.stage]:
+ self.cls_token = nn.Parameter(torch.randn(1, 1, self.config.embed_dim[-1]))
+
+ self.embedding = CvtEmbeddings(
+ patch_size=config.patch_sizes[self.stage],
+ stride=config.patch_stride[self.stage],
+ num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
+ embed_dim=config.embed_dim[self.stage],
+ padding=config.patch_padding[self.stage],
+ dropout_rate=config.drop_rate[self.stage],
+ )
+
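+ # stochastic depth schedule: drop path rates spaced linearly from 0 up to this stage's drop_path_rate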
+ drop_path_rates = [
+ x.item() for x in torch.linspace(0, config.drop_path_rate[self.stage], config.depth[stage], device="cpu")
+ ]
+
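+ # stack config.depth[stage] identically configured transformer blocks for this stage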
+ self.layers = nn.Sequential(
+ *[
+ CvtLayer(
+ num_heads=config.num_heads[self.stage],
+ embed_dim=config.embed_dim[self.stage],
+ kernel_size=config.kernel_qkv[self.stage],
+ padding_q=config.padding_q[self.stage],
+ padding_kv=config.padding_kv[self.stage],
+ stride_kv=config.stride_kv[self.stage],
+ stride_q=config.stride_q[self.stage],
+ qkv_projection_method=config.qkv_projection_method[self.stage],
+ qkv_bias=config.qkv_bias[self.stage],
+ attention_drop_rate=config.attention_drop_rate[self.stage],
+ drop_rate=config.drop_rate[self.stage],
+ drop_path_rate=drop_path_rates[self.stage],
+ mlp_ratio=config.mlp_ratio[self.stage],
+ with_cls_token=config.cls_token[self.stage],
+ )
+ for _ in range(config.depth[self.stage])
+ ]
+ )
+
+ def forward(self, hidden_state):
+ cls_token = None
+ hidden_state = self.embedding(hidden_state)
+ batch_size, num_channels, height, width = hidden_state.shape
+ # rearrange "b c h w -> b (h w) c"
+ hidden_state = hidden_state.view(batch_size, num_channels, height * width).permute(0, 2, 1)
+ if self.config.cls_token[self.stage]:
+ cls_token = self.cls_token.expand(batch_size, -1, -1)
+ hidden_state = torch.cat((cls_token, hidden_state), dim=1)
+
+ for layer in self.layers:
+ layer_outputs = layer(hidden_state, height, width)
+ hidden_state = layer_outputs
+
+ if self.config.cls_token[self.stage]:
+ cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
+ hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)
+ return hidden_state, cls_token
+
+
+class CvtEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.stages = nn.ModuleList([])
+ for stage_idx in range(len(config.depth)):
+ self.stages.append(CvtStage(config, stage_idx))
+
+ def forward(self, pixel_values, output_hidden_states=False, return_dict=True):
+ all_hidden_states = () if output_hidden_states else None
+ hidden_state = pixel_values
+
+ cls_token = None
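+ # run the stages sequentially; each stage re-embeds and downsamples the feature map and may return a cls token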
+ for stage_module in self.stages:
+ hidden_state, cls_token = stage_module(hidden_state)
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_state,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)
+
+ return BaseModelOutputWithCLSToken(
+ last_hidden_state=hidden_state,
+ cls_token_value=cls_token,
+ hidden_states=all_hidden_states,
+ )
+
+
+class CvtPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = CvtConfig
+ base_model_prefix = "cvt"
+ main_input_name = "pixel_values"
+ _no_split_modules = ["CvtLayer"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
+ module.weight.data = nn.init.trunc_normal_(module.weight.data, mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, CvtStage):
+ if self.config.cls_token[module.stage]:
+ module.cls_token.data = nn.init.trunc_normal_(
+ module.cls_token.data, mean=0.0, std=self.config.initializer_range
+ )
+
+
+CVT_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CVT_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
+ for details.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.",
+ CVT_START_DOCSTRING,
+)
+class CvtModel(CvtPreTrainedModel):
+ def __init__(self, config, add_pooling_layer=True):
+ super().__init__(config)
+ self.config = config
+ self.encoder = CvtEncoder(config)
+ self.post_init()
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the
+ base class PreTrainedModel.
+ """
+ all_layers = [layer_module for stage in self.encoder.stages for layer_module in stage.layers]
+ for layer_num, heads in heads_to_prune.items():
+ all_layers[layer_num].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithCLSToken,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithCLSToken]:
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ encoder_outputs = self.encoder(
+ pixel_values,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = encoder_outputs[0]
+
+ if not return_dict:
+ return (sequence_output,) + encoder_outputs[1:]
+
+ return BaseModelOutputWithCLSToken(
+ last_hidden_state=sequence_output,
+ cls_token_value=encoder_outputs.cls_token_value,
+ hidden_states=encoder_outputs.hidden_states,
+ )
+
+
+@add_start_docstrings(
+ """
+ Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
+ the [CLS] token) e.g. for ImageNet.
+ """,
+ CVT_START_DOCSTRING,
+)
+class CvtForImageClassification(CvtPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.cvt = CvtModel(config, add_pooling_layer=False)
+ self.layernorm = nn.LayerNorm(config.embed_dim[-1])
+ # Classifier head
+ self.classifier = (
+ nn.Linear(config.embed_dim[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
+ )
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=ImageClassifierOutputWithNoAttention,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ outputs = self.cvt(
+ pixel_values,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+ cls_token = outputs[1]
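+ # classify from the cls token when the last stage produces one, otherwise mean-pool the flattened patch tokens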
+ if self.config.cls_token[-1]:
+ sequence_output = self.layernorm(cls_token)
+ else:
+ batch_size, num_channels, height, width = sequence_output.shape
+ # rearrange "b c h w -> b (h w) c"
+ sequence_output = sequence_output.view(batch_size, num_channels, height * width).permute(0, 2, 1)
+ sequence_output = self.layernorm(sequence_output)
+
+ sequence_output_mean = sequence_output.mean(dim=1)
+ logits = self.classifier(sequence_output_mean)
+
+ loss = None
+ if labels is not None:
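+ # infer the problem type (regression / single- or multi-label classification) from num_labels and label dtype when it is not set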
+ if self.config.problem_type is None:
+ if self.config.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.config.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
+
+
+__all__ = ["CvtForImageClassification", "CvtModel", "CvtPreTrainedModel"]
diff --git a/docs/transformers/build/lib/transformers/models/cvt/modeling_tf_cvt.py b/docs/transformers/build/lib/transformers/models/cvt/modeling_tf_cvt.py
new file mode 100644
index 0000000000000000000000000000000000000000..74202d880623d983aa7fadb8f6972fa385395edf
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/cvt/modeling_tf_cvt.py
@@ -0,0 +1,1096 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 Cvt model."""
+
+from __future__ import annotations
+
+import collections.abc
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import tensorflow as tf
+
+from ...modeling_tf_outputs import TFImageClassifierOutputWithNoAttention
+from ...modeling_tf_utils import (
+ TFModelInputType,
+ TFPreTrainedModel,
+ TFSequenceClassificationLoss,
+ get_initializer,
+ keras,
+ keras_serializable,
+ unpack_inputs,
+)
+from ...tf_utils import shape_list, stable_softmax
+from ...utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_cvt import CvtConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "CvtConfig"
+
+
+@dataclass
+class TFBaseModelOutputWithCLSToken(ModelOutput):
+ """
+ Base class for model's outputs.
+
+ Args:
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`):
+ Classification token at the output of the last layer of the model.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+ `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
+ the initial embedding outputs.
+ """
+
+ last_hidden_state: Optional[tf.Tensor] = None
+ cls_token_value: Optional[tf.Tensor] = None
+ hidden_states: Tuple[tf.Tensor, ...] | None = None
+
+
+class TFCvtDropPath(keras.layers.Layer):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ References:
+ (1) https://github.com/rwightman/pytorch-image-models
+ """
+
+ def __init__(self, drop_prob: float, **kwargs):
+ super().__init__(**kwargs)
+ self.drop_prob = drop_prob
+
+ def call(self, x: tf.Tensor, training=None):
+ if self.drop_prob == 0.0 or not training:
+ return x
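+ # keep each sample with probability (1 - drop_prob) and rescale the kept samples by 1 / keep_prob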
+ keep_prob = 1 - self.drop_prob
+ shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
+ random_tensor = keep_prob + tf.random.uniform(shape, 0, 1, dtype=self.compute_dtype)
+ random_tensor = tf.floor(random_tensor)
+ return (x / keep_prob) * random_tensor
+
+
+class TFCvtEmbeddings(keras.layers.Layer):
+ """Construct the Convolutional Token Embeddings."""
+
+ def __init__(
+ self,
+ config: CvtConfig,
+ patch_size: int,
+ num_channels: int,
+ embed_dim: int,
+ stride: int,
+ padding: int,
+ dropout_rate: float,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.convolution_embeddings = TFCvtConvEmbeddings(
+ config,
+ patch_size=patch_size,
+ num_channels=num_channels,
+ embed_dim=embed_dim,
+ stride=stride,
+ padding=padding,
+ name="convolution_embeddings",
+ )
+ self.dropout = keras.layers.Dropout(dropout_rate)
+
+ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_state = self.convolution_embeddings(pixel_values)
+ hidden_state = self.dropout(hidden_state, training=training)
+ return hidden_state
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "convolution_embeddings", None) is not None:
+ with tf.name_scope(self.convolution_embeddings.name):
+ self.convolution_embeddings.build(None)
+
+
+class TFCvtConvEmbeddings(keras.layers.Layer):
+ """Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts."""
+
+ def __init__(
+ self,
+ config: CvtConfig,
+ patch_size: int,
+ num_channels: int,
+ embed_dim: int,
+ stride: int,
+ padding: int,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.padding = keras.layers.ZeroPadding2D(padding=padding)
+ self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ self.projection = keras.layers.Conv2D(
+ filters=embed_dim,
+ kernel_size=patch_size,
+ strides=stride,
+ padding="valid",
+ data_format="channels_last",
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="projection",
+ )
+ # Using the same default epsilon as PyTorch
+ self.normalization = keras.layers.LayerNormalization(epsilon=1e-5, name="normalization")
+ self.num_channels = num_channels
+ self.embed_dim = embed_dim
+
+ def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
+ if isinstance(pixel_values, dict):
+ pixel_values = pixel_values["pixel_values"]
+
+ pixel_values = self.projection(self.padding(pixel_values))
+
+ # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels"
+ batch_size, height, width, num_channels = shape_list(pixel_values)
+ hidden_size = height * width
+ pixel_values = tf.reshape(pixel_values, shape=(batch_size, hidden_size, num_channels))
+ pixel_values = self.normalization(pixel_values)
+
+ # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
+ pixel_values = tf.reshape(pixel_values, shape=(batch_size, height, width, num_channels))
+ return pixel_values
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "projection", None) is not None:
+ with tf.name_scope(self.projection.name):
+ self.projection.build([None, None, None, self.num_channels])
+ if getattr(self, "normalization", None) is not None:
+ with tf.name_scope(self.normalization.name):
+ self.normalization.build([None, None, self.embed_dim])
+
+
+class TFCvtSelfAttentionConvProjection(keras.layers.Layer):
+ """Convolutional projection layer."""
+
+ def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs):
+ super().__init__(**kwargs)
+ self.padding = keras.layers.ZeroPadding2D(padding=padding)
+ self.convolution = keras.layers.Conv2D(
+ filters=embed_dim,
+ kernel_size=kernel_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ padding="valid",
+ strides=stride,
+ use_bias=False,
+ name="convolution",
+ groups=embed_dim,
+ )
+ # Using the same default epsilon as PyTorch; Keras momentum is (1 - PyTorch momentum), so 0.9 matches PyTorch's 0.1
+ self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
+ self.embed_dim = embed_dim
+
+ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_state = self.convolution(self.padding(hidden_state))
+ hidden_state = self.normalization(hidden_state, training=training)
+ return hidden_state
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "convolution", None) is not None:
+ with tf.name_scope(self.convolution.name):
+ self.convolution.build([None, None, None, self.embed_dim])
+ if getattr(self, "normalization", None) is not None:
+ with tf.name_scope(self.normalization.name):
+ self.normalization.build([None, None, None, self.embed_dim])
+
+
+class TFCvtSelfAttentionLinearProjection(keras.layers.Layer):
+ """Linear projection layer used to flatten tokens into 1D."""
+
+ def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
+ # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels"
+ batch_size, height, width, num_channels = shape_list(hidden_state)
+ hidden_size = height * width
+ hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))
+ return hidden_state
+
+
+class TFCvtSelfAttentionProjection(keras.layers.Layer):
+ """Convolutional Projection for Attention."""
+
+ def __init__(
+ self,
+ config: CvtConfig,
+ embed_dim: int,
+ kernel_size: int,
+ stride: int,
+ padding: int,
+ projection_method: str = "dw_bn",
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ if projection_method == "dw_bn":
+ self.convolution_projection = TFCvtSelfAttentionConvProjection(
+ config, embed_dim, kernel_size, stride, padding, name="convolution_projection"
+ )
+ self.linear_projection = TFCvtSelfAttentionLinearProjection()
+
+ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_state = self.convolution_projection(hidden_state, training=training)
+ hidden_state = self.linear_projection(hidden_state)
+ return hidden_state
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "convolution_projection", None) is not None:
+ with tf.name_scope(self.convolution_projection.name):
+ self.convolution_projection.build(None)
+
+
+class TFCvtSelfAttention(keras.layers.Layer):
+ """
+ Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection) is applied for
+ query, key, and value embeddings.
+ """
+
+ def __init__(
+ self,
+ config: CvtConfig,
+ num_heads: int,
+ embed_dim: int,
+ kernel_size: int,
+ stride_q: int,
+ stride_kv: int,
+ padding_q: int,
+ padding_kv: int,
+ qkv_projection_method: str,
+ qkv_bias: bool,
+ attention_drop_rate: float,
+ with_cls_token: bool = True,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.scale = embed_dim**-0.5
+ self.with_cls_token = with_cls_token
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+
+ self.convolution_projection_query = TFCvtSelfAttentionProjection(
+ config,
+ embed_dim,
+ kernel_size,
+ stride_q,
+ padding_q,
+ projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method,
+ name="convolution_projection_query",
+ )
+ self.convolution_projection_key = TFCvtSelfAttentionProjection(
+ config,
+ embed_dim,
+ kernel_size,
+ stride_kv,
+ padding_kv,
+ projection_method=qkv_projection_method,
+ name="convolution_projection_key",
+ )
+ self.convolution_projection_value = TFCvtSelfAttentionProjection(
+ config,
+ embed_dim,
+ kernel_size,
+ stride_kv,
+ padding_kv,
+ projection_method=qkv_projection_method,
+ name="convolution_projection_value",
+ )
+
+ self.projection_query = keras.layers.Dense(
+ units=embed_dim,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=qkv_bias,
+ bias_initializer="zeros",
+ name="projection_query",
+ )
+ self.projection_key = keras.layers.Dense(
+ units=embed_dim,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=qkv_bias,
+ bias_initializer="zeros",
+ name="projection_key",
+ )
+ self.projection_value = keras.layers.Dense(
+ units=embed_dim,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=qkv_bias,
+ bias_initializer="zeros",
+ name="projection_value",
+ )
+ self.dropout = keras.layers.Dropout(attention_drop_rate)
+
+ def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor:
+ batch_size, hidden_size, _ = shape_list(hidden_state)
+ head_dim = self.embed_dim // self.num_heads
+ hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, self.num_heads, head_dim))
+ hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3))
+ return hidden_state
+
+ def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
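+ # split off the cls token so that only the patch tokens are reshaped to 2D for the convolutional projections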
+ if self.with_cls_token:
+ cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)
+
+ # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
+ batch_size, hidden_size, num_channels = shape_list(hidden_state)
+ hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
+
+ key = self.convolution_projection_key(hidden_state, training=training)
+ query = self.convolution_projection_query(hidden_state, training=training)
+ value = self.convolution_projection_value(hidden_state, training=training)
+
+ if self.with_cls_token:
+ query = tf.concat((cls_token, query), axis=1)
+ key = tf.concat((cls_token, key), axis=1)
+ value = tf.concat((cls_token, value), axis=1)
+
+ head_dim = self.embed_dim // self.num_heads
+
+ query = self.rearrange_for_multi_head_attention(self.projection_query(query))
+ key = self.rearrange_for_multi_head_attention(self.projection_key(key))
+ value = self.rearrange_for_multi_head_attention(self.projection_value(value))
+
+ attention_score = tf.matmul(query, key, transpose_b=True) * self.scale
+ attention_probs = stable_softmax(logits=attention_score, axis=-1)
+ attention_probs = self.dropout(attention_probs, training=training)
+
+ context = tf.matmul(attention_probs, value)
+ # "batch_size, num_heads, hidden_size, head_dim -> batch_size, hidden_size, (num_heads*head_dim)"
+ _, _, hidden_size, _ = shape_list(context)
+ context = tf.transpose(context, perm=(0, 2, 1, 3))
+ context = tf.reshape(context, (batch_size, hidden_size, self.num_heads * head_dim))
+ return context
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "convolution_projection_query", None) is not None:
+ with tf.name_scope(self.convolution_projection_query.name):
+ self.convolution_projection_query.build(None)
+ if getattr(self, "convolution_projection_key", None) is not None:
+ with tf.name_scope(self.convolution_projection_key.name):
+ self.convolution_projection_key.build(None)
+ if getattr(self, "convolution_projection_value", None) is not None:
+ with tf.name_scope(self.convolution_projection_value.name):
+ self.convolution_projection_value.build(None)
+ if getattr(self, "projection_query", None) is not None:
+ with tf.name_scope(self.projection_query.name):
+ self.projection_query.build([None, None, self.embed_dim])
+ if getattr(self, "projection_key", None) is not None:
+ with tf.name_scope(self.projection_key.name):
+ self.projection_key.build([None, None, self.embed_dim])
+ if getattr(self, "projection_value", None) is not None:
+ with tf.name_scope(self.projection_value.name):
+ self.projection_value.build([None, None, self.embed_dim])
+
+
+class TFCvtSelfOutput(keras.layers.Layer):
+ """Output of the Attention layer ."""
+
+ def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs):
+ super().__init__(**kwargs)
+ self.dense = keras.layers.Dense(
+ units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+ self.dropout = keras.layers.Dropout(drop_rate)
+ self.embed_dim = embed_dim
+
+ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_state = self.dense(inputs=hidden_state)
+ hidden_state = self.dropout(inputs=hidden_state, training=training)
+ return hidden_state
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.embed_dim])
+
+
+class TFCvtAttention(keras.layers.Layer):
+ """Attention layer. First chunk of the convolutional transformer block."""
+
+ def __init__(
+ self,
+ config: CvtConfig,
+ num_heads: int,
+ embed_dim: int,
+ kernel_size: int,
+ stride_q: int,
+ stride_kv: int,
+ padding_q: int,
+ padding_kv: int,
+ qkv_projection_method: str,
+ qkv_bias: bool,
+ attention_drop_rate: float,
+ drop_rate: float,
+ with_cls_token: bool = True,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.attention = TFCvtSelfAttention(
+ config,
+ num_heads,
+ embed_dim,
+ kernel_size,
+ stride_q,
+ stride_kv,
+ padding_q,
+ padding_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ with_cls_token,
+ name="attention",
+ )
+ self.dense_output = TFCvtSelfOutput(config, embed_dim, drop_rate, name="output")
+
+ def prune_heads(self, heads):
+ raise NotImplementedError
+
+ def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False):
+ self_output = self.attention(hidden_state, height, width, training=training)
+ attention_output = self.dense_output(self_output, training=training)
+ return attention_output
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "attention", None) is not None:
+ with tf.name_scope(self.attention.name):
+ self.attention.build(None)
+ if getattr(self, "dense_output", None) is not None:
+ with tf.name_scope(self.dense_output.name):
+ self.dense_output.build(None)
+
+
+class TFCvtIntermediate(keras.layers.Layer):
+ """Intermediate dense layer. Second chunk of the convolutional transformer block."""
+
+ def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs):
+ super().__init__(**kwargs)
+ self.dense = keras.layers.Dense(
+ units=int(embed_dim * mlp_ratio),
+ kernel_initializer=get_initializer(config.initializer_range),
+ activation="gelu",
+ name="dense",
+ )
+ self.embed_dim = embed_dim
+
+ def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
+ hidden_state = self.dense(hidden_state)
+ return hidden_state
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.embed_dim])
+
+
+class TFCvtOutput(keras.layers.Layer):
+ """
+ Output of the Convolutional Transformer Block (last chunk). It consists of an MLP and a residual connection.
+ """
+
+ def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, drop_rate: int, **kwargs):
+ super().__init__(**kwargs)
+ self.dense = keras.layers.Dense(
+ units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+ self.dropout = keras.layers.Dropout(drop_rate)
+ self.embed_dim = embed_dim
+ self.mlp_ratio = mlp_ratio
+
+ def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_state = self.dense(inputs=hidden_state)
+ hidden_state = self.dropout(inputs=hidden_state, training=training)
+ hidden_state = hidden_state + input_tensor
+ return hidden_state
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)])
+
+
+class TFCvtLayer(keras.layers.Layer):
+ """
+ Convolutional Transformer Block composed of attention layers, normalization and multi-layer perceptrons (MLPs). It
+ consists of 3 chunks: an attention layer, an intermediate dense layer and an output layer. This corresponds to the
+ `Block` class in the original implementation.
+ """
+
+ def __init__(
+ self,
+ config: CvtConfig,
+ num_heads: int,
+ embed_dim: int,
+ kernel_size: int,
+ stride_q: int,
+ stride_kv: int,
+ padding_q: int,
+ padding_kv: int,
+ qkv_projection_method: str,
+ qkv_bias: bool,
+ attention_drop_rate: float,
+ drop_rate: float,
+ mlp_ratio: float,
+ drop_path_rate: float,
+ with_cls_token: bool = True,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.attention = TFCvtAttention(
+ config,
+ num_heads,
+ embed_dim,
+ kernel_size,
+ stride_q,
+ stride_kv,
+ padding_q,
+ padding_kv,
+ qkv_projection_method,
+ qkv_bias,
+ attention_drop_rate,
+ drop_rate,
+ with_cls_token,
+ name="attention",
+ )
+ self.intermediate = TFCvtIntermediate(config, embed_dim, mlp_ratio, name="intermediate")
+ self.dense_output = TFCvtOutput(config, embed_dim, mlp_ratio, drop_rate, name="output")
+ # Using `layers.Activation` instead of `tf.identity` to better control `training` behaviour.
+ self.drop_path = (
+ TFCvtDropPath(drop_path_rate, name="drop_path")
+ if drop_path_rate > 0.0
+ else keras.layers.Activation("linear", name="drop_path")
+ )
+ # Using the same default epsilon as PyTorch
+ self.layernorm_before = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before")
+ self.layernorm_after = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after")
+ self.embed_dim = embed_dim
+
+ def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
+ # in Cvt, layernorm is applied before self-attention
+ attention_output = self.attention(self.layernorm_before(hidden_state), height, width, training=training)
+ attention_output = self.drop_path(attention_output, training=training)
+
+ # first residual connection
+ hidden_state = attention_output + hidden_state
+
+ # in Cvt, layernorm is also applied after self-attention
+ layer_output = self.layernorm_after(hidden_state)
+ layer_output = self.intermediate(layer_output)
+
+ # second residual connection is done here
+ layer_output = self.dense_output(layer_output, hidden_state)
+ layer_output = self.drop_path(layer_output, training=training)
+ return layer_output
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "attention", None) is not None:
+ with tf.name_scope(self.attention.name):
+ self.attention.build(None)
+ if getattr(self, "intermediate", None) is not None:
+ with tf.name_scope(self.intermediate.name):
+ self.intermediate.build(None)
+ if getattr(self, "dense_output", None) is not None:
+ with tf.name_scope(self.dense_output.name):
+ self.dense_output.build(None)
+ if getattr(self, "drop_path", None) is not None:
+ with tf.name_scope(self.drop_path.name):
+ self.drop_path.build(None)
+ if getattr(self, "layernorm_before", None) is not None:
+ with tf.name_scope(self.layernorm_before.name):
+ self.layernorm_before.build([None, None, self.embed_dim])
+ if getattr(self, "layernorm_after", None) is not None:
+ with tf.name_scope(self.layernorm_after.name):
+ self.layernorm_after.build([None, None, self.embed_dim])
+
+
+class TFCvtStage(keras.layers.Layer):
+ """
+ Cvt stage (encoder block). Each stage has 2 parts:
+ - (1) A Convolutional Token Embedding layer
+ - (2) A Convolutional Transformer Block (layer).
+ The classification token is added only in the last stage.
+
+ Args:
+ config ([`CvtConfig`]): Model configuration class.
+ stage (`int`): Stage number.
+ """
+
+ def __init__(self, config: CvtConfig, stage: int, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.stage = stage
+ if self.config.cls_token[self.stage]:
+ self.cls_token = self.add_weight(
+ shape=(1, 1, self.config.embed_dim[-1]),
+ initializer=get_initializer(self.config.initializer_range),
+ trainable=True,
+ name="cvt.encoder.stages.2.cls_token",
+ )
+
+ self.embedding = TFCvtEmbeddings(
+ self.config,
+ patch_size=config.patch_sizes[self.stage],
+ num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
+ stride=config.patch_stride[self.stage],
+ embed_dim=config.embed_dim[self.stage],
+ padding=config.patch_padding[self.stage],
+ dropout_rate=config.drop_rate[self.stage],
+ name="embedding",
+ )
+
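+ # same stochastic depth schedule as the PyTorch implementation, converted to python floats for the layer constructors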
+ drop_path_rates = tf.linspace(0.0, config.drop_path_rate[self.stage], config.depth[stage])
+ drop_path_rates = [x.numpy().item() for x in drop_path_rates]
+ self.layers = [
+ TFCvtLayer(
+ config,
+ num_heads=config.num_heads[self.stage],
+ embed_dim=config.embed_dim[self.stage],
+ kernel_size=config.kernel_qkv[self.stage],
+ stride_q=config.stride_q[self.stage],
+ stride_kv=config.stride_kv[self.stage],
+ padding_q=config.padding_q[self.stage],
+ padding_kv=config.padding_kv[self.stage],
+ qkv_projection_method=config.qkv_projection_method[self.stage],
+ qkv_bias=config.qkv_bias[self.stage],
+ attention_drop_rate=config.attention_drop_rate[self.stage],
+ drop_rate=config.drop_rate[self.stage],
+ mlp_ratio=config.mlp_ratio[self.stage],
+ drop_path_rate=drop_path_rates[self.stage],
+ with_cls_token=config.cls_token[self.stage],
+ name=f"layers.{j}",
+ )
+ for j in range(config.depth[self.stage])
+ ]
+
+ def call(self, hidden_state: tf.Tensor, training: bool = False):
+ cls_token = None
+ hidden_state = self.embedding(hidden_state, training)
+
+ # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels"
+ batch_size, height, width, num_channels = shape_list(hidden_state)
+ hidden_size = height * width
+ hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))
+
+ if self.config.cls_token[self.stage]:
+ cls_token = tf.repeat(self.cls_token, repeats=batch_size, axis=0)
+ hidden_state = tf.concat((cls_token, hidden_state), axis=1)
+
+ for layer in self.layers:
+ layer_outputs = layer(hidden_state, height, width, training=training)
+ hidden_state = layer_outputs
+
+ if self.config.cls_token[self.stage]:
+ cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)
+
+ # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
+ hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
+ return hidden_state, cls_token
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "embedding", None) is not None:
+ with tf.name_scope(self.embedding.name):
+ self.embedding.build(None)
+ if getattr(self, "layers", None) is not None:
+ for layer in self.layers:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+
+class TFCvtEncoder(keras.layers.Layer):
+ """
+ Convolutional Vision Transformer encoder. CvT stacks 3 stages of encoder blocks; in the default configuration their
+ depths (number of layers) are 1, 2 and 10.
+
+ Args:
+ config ([`CvtConfig`]): Model configuration class.
+ """
+
+ config_class = CvtConfig
+
+ def __init__(self, config: CvtConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.stages = [
+ TFCvtStage(config, stage_idx, name=f"stages.{stage_idx}") for stage_idx in range(len(config.depth))
+ ]
+
+ def call(
+ self,
+ pixel_values: TFModelInputType,
+ output_hidden_states: Optional[bool] = False,
+ return_dict: Optional[bool] = True,
+ training: Optional[bool] = False,
+ ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
+ all_hidden_states = () if output_hidden_states else None
+ hidden_state = pixel_values
+ # When running on CPU, `keras.layers.Conv2D` doesn't support (batch_size, num_channels, height, width)
+ # as input format. So change the input format to (batch_size, height, width, num_channels).
+ hidden_state = tf.transpose(hidden_state, perm=(0, 2, 3, 1))
+
+ cls_token = None
+ for stage_module in self.stages:
+ hidden_state, cls_token = stage_module(hidden_state, training=training)
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_state,)
+
+ # Change back to (batch_size, num_channels, height, width) format to have uniformity in the modules
+ hidden_state = tf.transpose(hidden_state, perm=(0, 3, 1, 2))
+ if output_hidden_states:
+ all_hidden_states = tuple([tf.transpose(hs, perm=(0, 3, 1, 2)) for hs in all_hidden_states])
+
+ if not return_dict:
+ return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)
+
+ return TFBaseModelOutputWithCLSToken(
+ last_hidden_state=hidden_state,
+ cls_token_value=cls_token,
+ hidden_states=all_hidden_states,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "stages", None) is not None:
+ for layer in self.stages:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+
+@keras_serializable
+class TFCvtMainLayer(keras.layers.Layer):
+ """Construct the Cvt model."""
+
+ config_class = CvtConfig
+
+ def __init__(self, config: CvtConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.encoder = TFCvtEncoder(config, name="encoder")
+
+ @unpack_inputs
+ def call(
+ self,
+ pixel_values: TFModelInputType | None = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ encoder_outputs = self.encoder(
+ pixel_values,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ sequence_output = encoder_outputs[0]
+
+ if not return_dict:
+ return (sequence_output,) + encoder_outputs[1:]
+
+ return TFBaseModelOutputWithCLSToken(
+ last_hidden_state=sequence_output,
+ cls_token_value=encoder_outputs.cls_token_value,
+ hidden_states=encoder_outputs.hidden_states,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "encoder", None) is not None:
+ with tf.name_scope(self.encoder.name):
+ self.encoder.build(None)
+
+
+class TFCvtPreTrainedModel(TFPreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = CvtConfig
+ base_model_prefix = "cvt"
+ main_input_name = "pixel_values"
+
+
+TFCVT_START_DOCSTRING = r"""
+
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.).
+
+ This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+ as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
+ behavior.
+
+ TF 2.0 models accept two formats as inputs:
+
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
+
+ This second option is useful when using the [`keras.Model.fit`] method, which currently requires having all the
+ tensors in the first argument of the model call function: `model(inputs)`.
+
+ Args:
+ config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TFCVT_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]`, and each example must have the shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
+ for details.
+
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+ used instead.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+ eager mode, in graph mode the value will always be set to True.
+ training (`bool`, *optional*, defaults to `False`):
+ Whether or not to use the model in training mode (some modules like dropout modules have different
+ behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+ "The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.",
+ TFCVT_START_DOCSTRING,
+)
+class TFCvtModel(TFCvtPreTrainedModel):
+ def __init__(self, config: CvtConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.cvt = TFCvtMainLayer(config, name="cvt")
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TFBaseModelOutputWithCLSToken, config_class=_CONFIG_FOR_DOC)
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, TFCvtModel
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
+ >>> model = TFCvtModel.from_pretrained("microsoft/cvt-13")
+
+ >>> inputs = image_processor(images=image, return_tensors="tf")
+ >>> outputs = model(**inputs)
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ outputs = self.cvt(
+ pixel_values=pixel_values,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ if not return_dict:
+ return (outputs[0],) + outputs[1:]
+
+ return TFBaseModelOutputWithCLSToken(
+ last_hidden_state=outputs.last_hidden_state,
+ cls_token_value=outputs.cls_token_value,
+ hidden_states=outputs.hidden_states,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "cvt", None) is not None:
+ with tf.name_scope(self.cvt.name):
+ self.cvt.build(None)
+
+
+@add_start_docstrings(
+ """
+ Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
+ the [CLS] token) e.g. for ImageNet.
+ """,
+ TFCVT_START_DOCSTRING,
+)
+class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassificationLoss):
+ def __init__(self, config: CvtConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.num_labels = config.num_labels
+ self.cvt = TFCvtMainLayer(config, name="cvt")
+ # Using same default epsilon as in the original implementation.
+ self.layernorm = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm")
+
+ # Classifier head
+ self.classifier = keras.layers.Dense(
+ units=config.num_labels,
+ kernel_initializer=get_initializer(config.initializer_range),
+ use_bias=True,
+ bias_initializer="zeros",
+ name="classifier",
+ )
+ self.config = config
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TFImageClassifierOutputWithNoAttention, config_class=_CONFIG_FOR_DOC)
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ labels: tf.Tensor | None = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFImageClassifierOutputWithNoAttention, Tuple[tf.Tensor]]:
+ r"""
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, TFCvtForImageClassification
+ >>> import tensorflow as tf
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
+ >>> model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13")
+
+ >>> inputs = image_processor(images=image, return_tensors="tf")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ >>> # model predicts one of the 1000 ImageNet classes
+ >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
+ >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
+ ```"""
+
+ outputs = self.cvt(
+ pixel_values,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ sequence_output = outputs[0]
+ cls_token = outputs[1]
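+ # when the last stage has no cls token, flatten the (batch_size, num_channels, height, width) output to tokens, layer-normalize and mean-pool them for classification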
+ if self.config.cls_token[-1]:
+ sequence_output = self.layernorm(cls_token)
+ else:
+ # rearrange "batch_size, num_channels, height, width -> batch_size, (height*width), num_channels"
+ batch_size, num_channels, height, width = shape_list(sequence_output)
+ sequence_output = tf.reshape(sequence_output, shape=(batch_size, num_channels, height * width))
+ sequence_output = tf.transpose(sequence_output, perm=(0, 2, 1))
+ sequence_output = self.layernorm(sequence_output)
+
+ sequence_output_mean = tf.reduce_mean(sequence_output, axis=1)
+ logits = self.classifier(sequence_output_mean)
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "cvt", None) is not None:
+ with tf.name_scope(self.cvt.name):
+ self.cvt.build(None)
+ if getattr(self, "layernorm", None) is not None:
+ with tf.name_scope(self.layernorm.name):
+ self.layernorm.build([None, None, self.config.embed_dim[-1]])
+ if getattr(self, "classifier", None) is not None:
+ if hasattr(self.classifier, "name"):
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, self.config.embed_dim[-1]])
+
+
+__all__ = ["TFCvtForImageClassification", "TFCvtModel", "TFCvtPreTrainedModel"]
diff --git a/docs/transformers/build/lib/transformers/models/dab_detr/__init__.py b/docs/transformers/build/lib/transformers/models/dab_detr/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfa364bd2152049fc92f575e102bf7b934ba4a6c
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/dab_detr/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_dab_detr import *
+ from .modeling_dab_detr import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/dab_detr/configuration_dab_detr.py b/docs/transformers/build/lib/transformers/models/dab_detr/configuration_dab_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..398e6f26591f0498a6d15889d7ef486ea5d74e28
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/dab_detr/configuration_dab_detr.py
@@ -0,0 +1,260 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DAB-DETR model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import verify_backbone_config_arguments
+from ..auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+
+class DabDetrConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`DabDetrModel`]. It is used to instantiate
+ a DAB-DETR model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the DAB-DETR
+ [IDEA-Research/dab_detr-base](https://huggingface.co/IDEA-Research/dab_detr-base) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ use_timm_backbone (`bool`, *optional*, defaults to `True`):
+ Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
+ API.
+ backbone_config (`PretrainedConfig` or `dict`, *optional*):
+ The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which
+ case it will default to `ResNetConfig()`.
+ backbone (`str`, *optional*, defaults to `"resnet50"`):
+ Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+ will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+ is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+ use_pretrained_backbone (`bool`, *optional*, defaults to `True`):
+ Whether to use pretrained weights for the backbone.
+ backbone_kwargs (`dict`, *optional*):
+ Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+ e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+ num_queries (`int`, *optional*, defaults to 300):
+ Number of object queries, i.e. detection slots. This is the maximal number of objects
+ [`DabDetrModel`] can detect in a single image. For COCO, we recommend 100 queries.
+ encoder_layers (`int`, *optional*, defaults to 6):
+ Number of encoder layers.
+ encoder_ffn_dim (`int`, *optional*, defaults to 2048):
+ Dimension of the "intermediate" (often named feed-forward) layer in encoder.
+ encoder_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ decoder_layers (`int`, *optional*, defaults to 6):
+ Number of decoder layers.
+ decoder_ffn_dim (`int`, *optional*, defaults to 2048):
+ Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+ decoder_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer decoder.
+ is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+ Indicates whether the transformer model architecture is an encoder-decoder or not.
+ activation_function (`str` or `function`, *optional*, defaults to `"prelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_size (`int`, *optional*, defaults to 256):
+ Dimensionality of the model; a general dimension parameter that sets the size of components such as the encoder layers and the projection parameters in the decoder layers.
+ dropout (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ activation_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for activations inside the fully connected layer.
+ init_std (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ init_xavier_std (`float`, *optional*, defaults to 1.0):
+ The scaling factor used for the Xavier initialization gain in the multi-head attention map module.
+ auxiliary_loss (`bool`, *optional*, defaults to `False`):
+ Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
+ dilation (`bool`, *optional*, defaults to `False`):
+ Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`.
+ class_cost (`float`, *optional*, defaults to 2):
+ Relative weight of the classification error in the Hungarian matching cost.
+ bbox_cost (`float`, *optional*, defaults to 5):
+ Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
+ giou_cost (`float`, *optional*, defaults to 2):
+ Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
+ cls_loss_coefficient (`float`, *optional*, defaults to 2):
+ Relative weight of the classification loss in the object detection loss function.
+ bbox_loss_coefficient (`float`, *optional*, defaults to 5):
+ Relative weight of the L1 bounding box loss in the object detection loss.
+ giou_loss_coefficient (`float`, *optional*, defaults to 2):
+ Relative weight of the generalized IoU loss in the object detection loss.
+ focal_alpha (`float`, *optional*, defaults to 0.25):
+ Alpha parameter in the focal loss.
+ temperature_height (`int`, *optional*, defaults to 20):
+ Temperature parameter to tune the flatness of positional attention (HEIGHT).
+ temperature_width (`int`, *optional*, defaults to 20):
+ Temperature parameter to tune the flatness of positional attention (WIDTH).
+ query_dim (`int`, *optional*, defaults to 4):
+ Query dimension parameter represents the size of the output vector.
+ random_refpoints_xy (`bool`, *optional*, defaults to `False`):
+ Whether to fix the x and y coordinates of the anchor boxes with random initialization.
+ keep_query_pos (`bool`, *optional*, defaults to `False`):
+ Whether to concatenate the projected positional embedding from the object query into the original query (key) in every decoder layer.
+ num_patterns (`int`, *optional*, defaults to 0):
+ Number of pattern embeddings.
+ normalize_before (`bool`, *optional*, defaults to `False`):
+ Whether we use a normalization layer in the Encoder or not.
+ sine_position_embedding_scale (`float`, *optional*, defaults to `None`):
+ Scaling factor applied to the normalized positional encodings.
+ initializer_bias_prior_prob (`float`, *optional*):
+ The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`.
+ If `None`, `prior_prob` is computed as `prior_prob = 1 / (num_labels + 1)` when initializing model weights.
+
+
+ Examples:
+
+ ```python
+ >>> from transformers import DabDetrConfig, DabDetrModel
+
+ >>> # Initializing a DAB-DETR IDEA-Research/dab_detr-base style configuration
+ >>> configuration = DabDetrConfig()
+
+ >>> # Initializing a model (with random weights) from the IDEA-Research/dab_detr-base style configuration
+ >>> model = DabDetrModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "dab-detr"
+ keys_to_ignore_at_inference = ["past_key_values"]
+ attribute_map = {
+ "num_attention_heads": "encoder_attention_heads",
+ }
+
+ def __init__(
+ self,
+ use_timm_backbone=True,
+ backbone_config=None,
+ backbone="resnet50",
+ use_pretrained_backbone=True,
+ backbone_kwargs=None,
+ num_queries=300,
+ encoder_layers=6,
+ encoder_ffn_dim=2048,
+ encoder_attention_heads=8,
+ decoder_layers=6,
+ decoder_ffn_dim=2048,
+ decoder_attention_heads=8,
+ is_encoder_decoder=True,
+ activation_function="prelu",
+ hidden_size=256,
+ dropout=0.1,
+ attention_dropout=0.0,
+ activation_dropout=0.0,
+ init_std=0.02,
+ init_xavier_std=1.0,
+ auxiliary_loss=False,
+ dilation=False,
+ class_cost=2,
+ bbox_cost=5,
+ giou_cost=2,
+ cls_loss_coefficient=2,
+ bbox_loss_coefficient=5,
+ giou_loss_coefficient=2,
+ focal_alpha=0.25,
+ temperature_height=20,
+ temperature_width=20,
+ query_dim=4,
+ random_refpoints_xy=False,
+ keep_query_pos=False,
+ num_patterns=0,
+ normalize_before=False,
+ sine_position_embedding_scale=None,
+ initializer_bias_prior_prob=None,
+ **kwargs,
+ ):
+ if query_dim != 4:
+ raise ValueError("The query dimension has to be 4.")
+
+ # We default to values which were previously hard-coded in the model. This enables configurability of the config
+ # while keeping the default behavior the same.
+ if use_timm_backbone and backbone_kwargs is None:
+ backbone_kwargs = {}
+ if dilation:
+ backbone_kwargs["output_stride"] = 16
+ backbone_kwargs["out_indices"] = [1, 2, 3, 4]
+ backbone_kwargs["in_chans"] = 3 # num_channels
+ # Backwards compatibility
+ elif not use_timm_backbone and backbone in (None, "resnet50"):
+ if backbone_config is None:
+ logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
+ backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
+ elif isinstance(backbone_config, dict):
+ backbone_model_type = backbone_config.get("model_type")
+ config_class = CONFIG_MAPPING[backbone_model_type]
+ backbone_config = config_class.from_dict(backbone_config)
+ backbone = None
+ # set timm attributes to None
+ dilation = None
+
+ verify_backbone_config_arguments(
+ use_timm_backbone=use_timm_backbone,
+ use_pretrained_backbone=use_pretrained_backbone,
+ backbone=backbone,
+ backbone_config=backbone_config,
+ backbone_kwargs=backbone_kwargs,
+ )
+
+ self.use_timm_backbone = use_timm_backbone
+ self.backbone_config = backbone_config
+ self.num_queries = num_queries
+ self.hidden_size = hidden_size
+ self.encoder_ffn_dim = encoder_ffn_dim
+ self.encoder_layers = encoder_layers
+ self.encoder_attention_heads = encoder_attention_heads
+ self.decoder_ffn_dim = decoder_ffn_dim
+ self.decoder_layers = decoder_layers
+ self.decoder_attention_heads = decoder_attention_heads
+ self.dropout = dropout
+ self.attention_dropout = attention_dropout
+ self.activation_dropout = activation_dropout
+ self.activation_function = activation_function
+ self.init_std = init_std
+ self.init_xavier_std = init_xavier_std
+ self.num_hidden_layers = encoder_layers
+ self.auxiliary_loss = auxiliary_loss
+ self.backbone = backbone
+ self.use_pretrained_backbone = use_pretrained_backbone
+ self.backbone_kwargs = backbone_kwargs
+ # Hungarian matcher
+ self.class_cost = class_cost
+ self.bbox_cost = bbox_cost
+ self.giou_cost = giou_cost
+ # Loss coefficients
+ self.cls_loss_coefficient = cls_loss_coefficient
+ self.bbox_loss_coefficient = bbox_loss_coefficient
+ self.giou_loss_coefficient = giou_loss_coefficient
+ self.focal_alpha = focal_alpha
+ self.query_dim = query_dim
+ self.random_refpoints_xy = random_refpoints_xy
+ self.keep_query_pos = keep_query_pos
+ self.num_patterns = num_patterns
+ self.normalize_before = normalize_before
+ self.temperature_width = temperature_width
+ self.temperature_height = temperature_height
+ self.sine_position_embedding_scale = sine_position_embedding_scale
+ self.initializer_bias_prior_prob = initializer_bias_prior_prob
+ super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+
+__all__ = ["DabDetrConfig"]
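As a rough illustration of the constructor logic above (the timm `backbone_kwargs` defaults and the `attribute_map` aliasing), here is a hedged sketch assuming a `transformers` version that exports `DabDetrConfig`:

```python
from transformers import DabDetrConfig

# With the default timm backbone and dilation enabled, the constructor fills
# backbone_kwargs with the previously hard-coded values.
config = DabDetrConfig(dilation=True)
print(config.backbone_kwargs)      # {'output_stride': 16, 'out_indices': [1, 2, 3, 4], 'in_chans': 3}

# attribute_map aliases num_attention_heads to encoder_attention_heads.
print(config.num_attention_heads)  # 8
print(config.num_hidden_layers)    # 6 (mirrors encoder_layers)
```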
diff --git a/docs/transformers/build/lib/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae3a67710150c2cce79c719e6534bdbd9eaa0260
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py
@@ -0,0 +1,233 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert DAB-DETR checkpoints."""
+
+import argparse
+import gc
+import json
+import re
+from pathlib import Path
+
+import torch
+from huggingface_hub import hf_hub_download
+
+from transformers import ConditionalDetrImageProcessor, DabDetrConfig, DabDetrForObjectDetection
+from transformers.utils import logging
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
+ # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads
+ # for dab-DETR, also convert reference point head and query scale MLP
+ r"input_proj\.(bias|weight)": r"input_projection.\1",
+ r"refpoint_embed\.weight": r"query_refpoint_embeddings.weight",
+ r"class_embed\.(bias|weight)": r"class_embed.\1",
+ # negative lookbehind because of the overlap
+ r"(?DabDetr,Conditional DETR->DAB-DETR,2 (anchor points)->4 (anchor points)
+class DabDetrDecoderOutput(BaseModelOutputWithCrossAttentions):
+ """
+ Base class for outputs of the Conditional DETR decoder. This class adds one attribute to
+ BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output
+ of each decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary
+ decoding losses.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
+ used to compute the weighted average in the cross-attention heads.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
+ Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
+ layernorm.
+ reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 4 (anchor points))`):
+ Reference points (reference points of each layer of the decoder).
+ """
+
+ intermediate_hidden_states: Optional[torch.FloatTensor] = None
+ reference_points: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+# Copied from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrModelOutput with ConditionalDetr->DabDetr,Conditional DETR->DAB-DETR,2 (anchor points)->4 (anchor points)
+class DabDetrModelOutput(Seq2SeqModelOutput):
+ """
+ Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to
+ Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder
+ layer, each of them gone through a layernorm. This is useful when training the model with auxiliary decoding
+ losses.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
+ layer plus the initial embedding outputs.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
+ weighted average in the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
+ used to compute the weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+ layer plus the initial embedding outputs.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
+ weighted average in the self-attention heads.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
+ Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
+ layernorm.
+ reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 4 (anchor points))`):
+ Reference points (reference points of each layer of the decoder).
+ """
+
+ intermediate_hidden_states: Optional[torch.FloatTensor] = None
+ reference_points: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->DabDetr
+class DabDetrObjectDetectionOutput(ModelOutput):
+ """
+ Output type of [`DabDetrForObjectDetection`].
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
+ Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+ bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+ scale-invariant IoU loss.
+ loss_dict (`Dict`, *optional*):
+ A dictionary containing the individual losses. Useful for logging.
+ logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+ Classification logits (including no-object) for all queries.
+ pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+ values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+ possible padding). You can use [`~DabDetrImageProcessor.post_process_object_detection`] to retrieve the
+ unnormalized bounding boxes.
+ auxiliary_outputs (`list[Dict]`, *optional*):
+ Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+ and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+ `pred_boxes`) for each decoder layer.
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
+ layer plus the initial embedding outputs.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
+ weighted average in the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
+ used to compute the weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+ layer plus the initial embedding outputs.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
+ weighted average in the self-attention heads.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ loss_dict: Optional[Dict] = None
+ logits: Optional[torch.FloatTensor] = None
+ pred_boxes: Optional[torch.FloatTensor] = None
+ auxiliary_outputs: Optional[List[Dict]] = None
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+ encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->DabDetr
+class DabDetrFrozenBatchNorm2d(nn.Module):
+ """
+ BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+ Copy-paste from torchvision.misc.ops with added eps before rsqrt, without which models other than
+ torchvision.models.resnet[18,34,50,101] produce nans.
+ """
+
+ def __init__(self, n):
+ super().__init__()
+ self.register_buffer("weight", torch.ones(n))
+ self.register_buffer("bias", torch.zeros(n))
+ self.register_buffer("running_mean", torch.zeros(n))
+ self.register_buffer("running_var", torch.ones(n))
+
+ def _load_from_state_dict(
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+ ):
+ num_batches_tracked_key = prefix + "num_batches_tracked"
+ if num_batches_tracked_key in state_dict:
+ del state_dict[num_batches_tracked_key]
+
+ super()._load_from_state_dict(
+ state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+ )
+
+ def forward(self, x):
+ # move reshapes to the beginning
+ # to make it user-friendly
+ weight = self.weight.reshape(1, -1, 1, 1)
+ bias = self.bias.reshape(1, -1, 1, 1)
+ running_var = self.running_var.reshape(1, -1, 1, 1)
+ running_mean = self.running_mean.reshape(1, -1, 1, 1)
+ epsilon = 1e-5
+ scale = weight * (running_var + epsilon).rsqrt()
+ bias = bias - running_mean * scale
+ return x * scale + bias
+
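The frozen module folds eval-mode batch norm, y = (x - mean) / sqrt(var + eps) * weight + bias, into a single scale-and-shift. A small sanity sketch of that equivalence (names and sizes are illustrative only):

```python
import torch
from torch import nn

bn = nn.BatchNorm2d(8).eval()
# Randomize the statistics so the comparison is non-trivial.
bn.running_mean.uniform_(-1.0, 1.0)
bn.running_var.uniform_(0.5, 2.0)
bn.weight.data.uniform_(0.5, 1.5)
bn.bias.data.uniform_(-0.5, 0.5)

frozen = DabDetrFrozenBatchNorm2d(8)
frozen.load_state_dict(bn.state_dict(), strict=False)  # copies weight/bias/running stats

x = torch.randn(2, 8, 4, 4)
assert torch.allclose(bn(x), frozen(x), atol=1e-5)
```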
+
+# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->DabDetr
+def replace_batch_norm(model):
+ r"""
+ Recursively replace all `torch.nn.BatchNorm2d` with `DabDetrFrozenBatchNorm2d`.
+
+ Args:
+ model (torch.nn.Module):
+ input model
+ """
+ for name, module in model.named_children():
+ if isinstance(module, nn.BatchNorm2d):
+ new_module = DabDetrFrozenBatchNorm2d(module.num_features)
+
+ if not module.weight.device == torch.device("meta"):
+ new_module.weight.data.copy_(module.weight)
+ new_module.bias.data.copy_(module.bias)
+ new_module.running_mean.data.copy_(module.running_mean)
+ new_module.running_var.data.copy_(module.running_var)
+
+ model._modules[name] = new_module
+
+ if len(list(module.children())) > 0:
+ replace_batch_norm(module)
+
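A quick sketch of how `replace_batch_norm` is meant to be applied, using a toy module (purely illustrative):

```python
import torch
from torch import nn

toy = nn.Sequential(
    nn.Conv2d(3, 16, 3, padding=1),
    nn.BatchNorm2d(16),
    nn.Sequential(nn.Conv2d(16, 16, 3, padding=1), nn.BatchNorm2d(16)),
)

with torch.no_grad():
    replace_batch_norm(toy)

# Every BatchNorm2d, including the nested one, is now a frozen copy.
assert not any(isinstance(m, nn.BatchNorm2d) for m in toy.modules())
assert sum(isinstance(m, DabDetrFrozenBatchNorm2d) for m in toy.modules()) == 2
```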
+
+# Modified from transformers.models.detr.modeling_detr.DetrConvEncoder with Detr->DabDetr
+class DabDetrConvEncoder(nn.Module):
+ """
+ Convolutional backbone, using either the AutoBackbone API or one from the timm library.
+
+ nn.BatchNorm2d layers are replaced by DabDetrFrozenBatchNorm2d as defined above.
+
+ """
+
+ def __init__(self, config: DabDetrConfig):
+ super().__init__()
+
+ self.config = config
+ backbone = load_backbone(config)
+
+ # replace batch norm by frozen batch norm
+ with torch.no_grad():
+ replace_batch_norm(backbone)
+ self.model = backbone
+ self.intermediate_channel_sizes = self.model.channels
+
+ def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
+ # send pixel_values through the model to get list of feature maps
+ features = self.model(pixel_values).feature_maps
+
+ out = []
+ for feature_map in features:
+ # downsample pixel_mask to match shape of corresponding feature_map
+ mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
+ out.append((feature_map, mask))
+ return out
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->DabDetr
+class DabDetrConvModel(nn.Module):
+ """
+ This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
+ """
+
+ def __init__(self, conv_encoder, position_embedding):
+ super().__init__()
+ self.conv_encoder = conv_encoder
+ self.position_embedding = position_embedding
+
+ def forward(self, pixel_values, pixel_mask):
+ # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples
+ out = self.conv_encoder(pixel_values, pixel_mask)
+ pos = []
+ for feature_map, mask in out:
+ # position encoding
+ pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype))
+
+ return out, pos
+
+
+# Modified from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrSinePositionEmbedding with ConditionalDetr->DabDetr
+class DabDetrSinePositionEmbedding(nn.Module):
+ """
+ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
+ need paper, generalized to work on images.
+ """
+
+ def __init__(self, config: DabDetrConfig):
+ super().__init__()
+ self.config = config
+ self.embedding_dim = config.hidden_size / 2
+ self.temperature_height = config.temperature_height
+ self.temperature_width = config.temperature_width
+ scale = config.sine_position_embedding_scale
+ if scale is None:
+ scale = 2 * math.pi
+ self.scale = scale
+
+ def forward(self, pixel_values, pixel_mask):
+ if pixel_mask is None:
+ raise ValueError("No pixel mask provided")
+ y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
+ x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
+ y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale
+ x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale
+
+ # We use float32 to ensure reproducibility of the original implementation
+ dim_tx = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device)
+ # Modifying dim_tx in place to avoid extra memory allocation -> dim_tx = self.temperature_width ** (2 * (dim_tx // 2) / self.embedding_dim)
+ dim_tx //= 2
+ dim_tx.mul_(2 / self.embedding_dim)
+ dim_tx.copy_(self.temperature_width**dim_tx)
+ pos_x = x_embed[:, :, :, None] / dim_tx
+
+ # We use float32 to ensure reproducibility of the original implementation
+ dim_ty = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device)
+ # Modifying dim_ty in place to avoid extra memory allocation -> dim_ty = self.temperature_height ** (2 * (dim_ty // 2) / self.embedding_dim)
+ dim_ty //= 2
+ dim_ty.mul_(2 / self.embedding_dim)
+ dim_ty.copy_(self.temperature_height**dim_ty)
+ pos_y = y_embed[:, :, :, None] / dim_ty
+
+ pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+ pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+ pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+ return pos
+
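The three in-place operations above are equivalent to the commented closed form `temperature ** (2 * (i // 2) / embedding_dim)`. A small numerical sketch of that equivalence (the sizes are arbitrary):

```python
import torch

embedding_dim, temperature = 128, 20

i = torch.arange(embedding_dim, dtype=torch.float32)
closed_form = temperature ** (2 * torch.div(i, 2, rounding_mode="floor") / embedding_dim)

# In-place variant, mirroring the forward pass above.
dim_t = torch.arange(embedding_dim, dtype=torch.float32)
dim_t //= 2
dim_t.mul_(2 / embedding_dim)
dim_t.copy_(temperature**dim_t)

assert torch.allclose(dim_t, closed_form)
```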
+
+# function to generate sine positional embedding for 4d coordinates
+def gen_sine_position_embeddings(pos_tensor, hidden_size=256):
+ """
+ This function computes position embeddings using sine and cosine functions from the input positional tensor,
+ which has a shape of (batch_size, num_queries, 4).
+ The last dimension of `pos_tensor` represents the following coordinates:
+ - 0: x-coord
+ - 1: y-coord
+ - 2: width
+ - 3: height
+
+ The output shape is (batch_size, num_queries, 512), where the final dimension (hidden_size*2 = 512) is the total
+ embedding dimension obtained by concatenating the sine and cosine values for each coordinate.
+ """
+ scale = 2 * math.pi
+ dim = hidden_size // 2
+ dim_t = torch.arange(dim, dtype=torch.float32, device=pos_tensor.device)
+ dim_t = 10000 ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / dim)
+ x_embed = pos_tensor[:, :, 0] * scale
+ y_embed = pos_tensor[:, :, 1] * scale
+ pos_x = x_embed[:, :, None] / dim_t
+ pos_y = y_embed[:, :, None] / dim_t
+ pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
+ pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
+ if pos_tensor.size(-1) == 4:
+ w_embed = pos_tensor[:, :, 2] * scale
+ pos_w = w_embed[:, :, None] / dim_t
+ pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)
+
+ h_embed = pos_tensor[:, :, 3] * scale
+ pos_h = h_embed[:, :, None] / dim_t
+ pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2)
+
+ pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
+ else:
+ raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
+ return pos
+
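A shape-only sketch of how the helper above is typically called on 4-d anchor boxes (the sizes are assumptions for illustration):

```python
import torch

anchors = torch.rand(2, 300, 4)  # (batch_size, num_queries, (x, y, w, h)), normalized to [0, 1]
embeddings = gen_sine_position_embeddings(anchors, hidden_size=256)
print(embeddings.shape)          # torch.Size([2, 300, 512]) -> sine/cosine features for x, y, w, h
```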
+
+def inverse_sigmoid(x, eps=1e-5):
+ x = x.clamp(min=0, max=1)
+ x1 = x.clamp(min=eps)
+ x2 = (1 - x).clamp(min=eps)
+ return torch.log(x1 / x2)
+
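`inverse_sigmoid` is the clamped logit function, so it inverts `torch.sigmoid` away from the clamp boundaries; a one-line check:

```python
import torch

x = torch.tensor([-4.0, -1.0, 0.0, 1.0, 4.0])
assert torch.allclose(inverse_sigmoid(torch.sigmoid(x)), x, atol=1e-4)
```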
+
+# Modified from transformers.models.detr.modeling_detr.DetrAttention
+class DetrAttention(nn.Module):
+ """
+ Multi-headed attention from 'Attention Is All You Need' paper.
+
+ Here, we add position embeddings to the queries and keys (as explained in the DETR paper).
+ """
+
+ def __init__(
+ self,
+ config: DabDetrConfig,
+ bias: bool = True,
+ ):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.encoder_attention_heads
+ self.attention_dropout = config.attention_dropout
+ self.head_dim = self.hidden_size // self.num_heads
+ if self.head_dim * self.num_heads != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:"
+ f" {self.num_heads})."
+ )
+ self.scaling = self.head_dim**-0.5
+ self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=bias)
+ self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=bias)
+ self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=bias)
+ self.out_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=bias)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ object_queries: Optional[torch.Tensor] = None,
+ key_value_states: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+ batch_size, q_len, embed_dim = hidden_states.size()
+ # add position embeddings to the hidden states before projecting to queries and keys
+ if object_queries is not None:
+ hidden_states_original = hidden_states
+ hidden_states = hidden_states + object_queries
+
+ query_states = self.q_proj(hidden_states) * self.scaling
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states_original)
+
+ query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
+
+ if attention_mask is not None:
+ attn_weights = attn_weights + attention_mask
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+
+ attn_output = attn_output.reshape(batch_size, q_len, embed_dim)
+ attn_output = self.out_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights
+
+
+# Modified from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrAttention with ConditionalDetr->DABDETR,Conditional DETR->DabDetr
+class DabDetrAttention(nn.Module):
+ """
+ Cross-Attention used in the DAB-DETR paper ('DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR').
+
+ The query, key and value projections (q_proj, k_proj, v_proj) are defined outside this module. This attention allows
+ the dim of q and k to be different from the dim of v.
+ """
+
+ def __init__(self, config: DabDetrConfig, bias: bool = True, is_cross: bool = False):
+ super().__init__()
+ self.config = config
+ self.embed_dim = config.hidden_size * 2 if is_cross else config.hidden_size
+ self.output_dim = config.hidden_size
+ self.attention_heads = config.decoder_attention_heads
+ self.attention_dropout = config.attention_dropout
+ self.attention_head_dim = self.embed_dim // self.attention_heads
+ if self.attention_head_dim * self.attention_heads != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `attention_heads`:"
+ f" {self.attention_heads})."
+ )
+ # head dimension of values
+ self.values_head_dim = self.output_dim // self.attention_heads
+ if self.values_head_dim * self.attention_heads != self.output_dim:
+ raise ValueError(
+ f"output_dim must be divisible by attention_heads (got `output_dim`: {self.output_dim} and `attention_heads`: {self.attention_heads})."
+ )
+ self.scaling = self.attention_head_dim**-0.5
+ self.output_proj = nn.Linear(self.output_dim, self.output_dim, bias=bias)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ key_states: Optional[torch.Tensor] = None,
+ value_states: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+
+ batch_size, q_len, _ = hidden_states.size()
+
+ # scaling query and refactor key-, value states
+ query_states = hidden_states * self.scaling
+ query_states = query_states.view(batch_size, -1, self.attention_heads, self.attention_head_dim).transpose(1, 2)
+ key_states = key_states.view(batch_size, -1, self.attention_heads, self.attention_head_dim).transpose(1, 2)
+ value_states = value_states.view(batch_size, -1, self.attention_heads, self.values_head_dim).transpose(1, 2)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
+
+ if attention_mask is not None:
+ attn_weights = attn_weights + attention_mask
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_probs = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+ attn_output = torch.matmul(attn_probs, value_states)
+
+ if attn_output.size() != (batch_size, self.attention_heads, q_len, self.values_head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(batch_size, self.attention_heads, q_len, self.values_head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+
+ attn_output = attn_output.reshape(batch_size, q_len, self.output_dim)
+ attn_output = self.output_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights
+
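In the cross-attention configuration (`is_cross=True`), queries and keys carry concatenated content + positional features of size `2 * hidden_size`, while values stay at `hidden_size`. A hedged shape sketch with the default config (the sequence lengths are made up):

```python
import torch

config = DabDetrConfig()            # hidden_size=256, decoder_attention_heads=8
cross_attn = DabDetrAttention(config, is_cross=True)

queries = torch.randn(2, 300, 512)  # decoder content + sine position embedding, per-head concatenated
keys = torch.randn(2, 950, 512)     # encoder content + key position embedding, per-head concatenated
values = torch.randn(2, 950, 256)   # encoder content only

out, _ = cross_attn(hidden_states=queries, key_states=keys, value_states=values)
print(out.shape)                    # torch.Size([2, 300, 256]) -> projected back to hidden_size
```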
+
+class DabDetrDecoderLayerSelfAttention(nn.Module):
+ def __init__(self, config: DabDetrConfig):
+ super().__init__()
+ self.dropout = config.dropout
+ self.self_attn_query_content_proj = nn.Linear(config.hidden_size, config.hidden_size)
+ self.self_attn_query_pos_proj = nn.Linear(config.hidden_size, config.hidden_size)
+ self.self_attn_key_content_proj = nn.Linear(config.hidden_size, config.hidden_size)
+ self.self_attn_key_pos_proj = nn.Linear(config.hidden_size, config.hidden_size)
+ self.self_attn_value_proj = nn.Linear(config.hidden_size, config.hidden_size)
+ self.self_attn = DabDetrAttention(config)
+ self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ query_position_embeddings: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ ):
+ residual = hidden_states
+ query_content = self.self_attn_query_content_proj(hidden_states)
+ query_pos = self.self_attn_query_pos_proj(query_position_embeddings)
+ key_content = self.self_attn_key_content_proj(hidden_states)
+ key_pos = self.self_attn_key_pos_proj(query_position_embeddings)
+ value = self.self_attn_value_proj(hidden_states)
+
+ query = query_content + query_pos
+ key = key_content + key_pos
+
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=query,
+ attention_mask=attention_mask,
+ key_states=key,
+ value_states=value,
+ output_attentions=True,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ return hidden_states, attn_weights
+
+
+class DabDetrDecoderLayerCrossAttention(nn.Module):
+ def __init__(self, config: DabDetrConfig, is_first: bool = False):
+ super().__init__()
+ hidden_size = config.hidden_size
+ self.cross_attn_query_content_proj = nn.Linear(hidden_size, hidden_size)
+ self.cross_attn_query_pos_proj = nn.Linear(hidden_size, hidden_size)
+ self.cross_attn_key_content_proj = nn.Linear(hidden_size, hidden_size)
+ self.cross_attn_key_pos_proj = nn.Linear(hidden_size, hidden_size)
+ self.cross_attn_value_proj = nn.Linear(hidden_size, hidden_size)
+ self.cross_attn_query_pos_sine_proj = nn.Linear(hidden_size, hidden_size)
+ self.decoder_attention_heads = config.decoder_attention_heads
+ self.cross_attn_layer_norm = nn.LayerNorm(hidden_size)
+ self.cross_attn = DabDetrAttention(config, is_cross=True)
+
+ self.keep_query_pos = config.keep_query_pos
+
+ if not self.keep_query_pos and not is_first:
+ self.cross_attn_query_pos_proj = None
+
+ self.is_first = is_first
+ self.dropout = config.dropout
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ query_position_embeddings: Optional[torch.Tensor] = None,
+ object_queries: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ query_sine_embed: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ ):
+ query_content = self.cross_attn_query_content_proj(hidden_states)
+ key_content = self.cross_attn_key_content_proj(encoder_hidden_states)
+ value = self.cross_attn_value_proj(encoder_hidden_states)
+
+ batch_size, num_queries, n_model = query_content.shape
+ _, height_width, _ = key_content.shape
+
+ key_pos = self.cross_attn_key_pos_proj(object_queries)
+
+ # For the first decoder layer, we add the positional embedding predicted from
+ # the object query (the positional embedding) into the original query (key) in DETR.
+ if self.is_first or self.keep_query_pos:
+ query_pos = self.cross_attn_query_pos_proj(query_position_embeddings)
+ query = query_content + query_pos
+ key = key_content + key_pos
+ else:
+ query = query_content
+ key = key_content
+
+ query = query.view(
+ batch_size, num_queries, self.decoder_attention_heads, n_model // self.decoder_attention_heads
+ )
+ query_sine_embed = self.cross_attn_query_pos_sine_proj(query_sine_embed)
+ query_sine_embed = query_sine_embed.view(
+ batch_size, num_queries, self.decoder_attention_heads, n_model // self.decoder_attention_heads
+ )
+ query = torch.cat([query, query_sine_embed], dim=3).view(batch_size, num_queries, n_model * 2)
+ key = key.view(batch_size, height_width, self.decoder_attention_heads, n_model // self.decoder_attention_heads)
+ key_pos = key_pos.view(
+ batch_size, height_width, self.decoder_attention_heads, n_model // self.decoder_attention_heads
+ )
+ key = torch.cat([key, key_pos], dim=3).view(batch_size, height_width, n_model * 2)
+
+ # Cross-Attention Block
+ cross_attn_weights = None
+ if encoder_hidden_states is not None:
+ residual = hidden_states
+
+ hidden_states, cross_attn_weights = self.cross_attn(
+ hidden_states=query,
+ attention_mask=encoder_attention_mask,
+ key_states=key,
+ value_states=value,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.cross_attn_layer_norm(hidden_states)
+
+ return hidden_states, cross_attn_weights
+
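The per-head view before the concatenation above is what keeps content and positional features aligned head by head; flattening afterwards yields queries and keys of width `2 * hidden_size` for the cross-attention. A standalone shape sketch (sizes are illustrative):

```python
import torch

batch_size, num_queries, n_model, heads = 2, 300, 256, 8

query = torch.randn(batch_size, num_queries, n_model)             # content projection
query_sine_embed = torch.randn(batch_size, num_queries, n_model)  # positional projection

# Split into heads, concatenate per head, then flatten back.
query = query.view(batch_size, num_queries, heads, n_model // heads)
query_sine_embed = query_sine_embed.view(batch_size, num_queries, heads, n_model // heads)
query = torch.cat([query, query_sine_embed], dim=3).view(batch_size, num_queries, n_model * 2)

print(query.shape)  # torch.Size([2, 300, 512])
```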
+
+class DabDetrDecoderLayerFFN(nn.Module):
+ def __init__(self, config: DabDetrConfig):
+ super().__init__()
+ hidden_size = config.hidden_size
+ self.final_layer_norm = nn.LayerNorm(hidden_size)
+ self.fc1 = nn.Linear(hidden_size, config.decoder_ffn_dim)
+ self.fc2 = nn.Linear(config.decoder_ffn_dim, hidden_size)
+ self.activation_fn = ACT2FN[config.activation_function]
+ self.dropout = config.dropout
+ self.activation_dropout = config.activation_dropout
+ self.keep_query_pos = config.keep_query_pos
+
+ def forward(self, hidden_states: torch.Tensor):
+ residual = hidden_states
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+ hidden_states = self.fc2(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ return hidden_states
+
+
+# Modified from transformers.models.detr.modeling_detr.DetrEncoderLayer with DetrEncoderLayer->DabDetrEncoderLayer,DetrConfig->DabDetrConfig
+class DabDetrEncoderLayer(nn.Module):
+ def __init__(self, config: DabDetrConfig):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.self_attn = DetrAttention(config)
+ self.self_attn_layer_norm = nn.LayerNorm(self.hidden_size)
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.activation_function]
+ self.fc1 = nn.Linear(self.hidden_size, config.encoder_ffn_dim)
+ self.fc2 = nn.Linear(config.encoder_ffn_dim, self.hidden_size)
+ self.final_layer_norm = nn.LayerNorm(self.hidden_size)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ object_queries: torch.Tensor,
+ output_attentions: Optional[bool] = None,
+ ):
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, source_len)` where padding elements are indicated by very large negative
+ values.
+ object_queries (`torch.FloatTensor`, *optional*):
+ Object queries (also called content embeddings), to be added to the hidden states.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+ residual = hidden_states
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ object_queries=object_queries,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ residual = hidden_states
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ hidden_states = self.fc2(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ hidden_states = residual + hidden_states
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
+
+# Modified from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrDecoderLayer with ConditionalDetr->DabDetr
+class DabDetrDecoderLayer(nn.Module):
+ def __init__(self, config: DabDetrConfig, is_first: bool = False):
+ super().__init__()
+ self.self_attn = DabDetrDecoderLayerSelfAttention(config)
+ self.cross_attn = DabDetrDecoderLayerCrossAttention(config, is_first)
+ self.mlp = DabDetrDecoderLayerFFN(config)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ object_queries: Optional[torch.Tensor] = None,
+ query_position_embeddings: Optional[torch.Tensor] = None,
+ query_sine_embed: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ ):
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+ values.
+ object_queries (`torch.FloatTensor`, *optional*):
+ object_queries that are added to the queries and keys
+ in the cross-attention layer.
+ query_position_embeddings (`torch.FloatTensor`, *optional*):
+ position embeddings that are added to the queries and keys
+ in the self-attention layer.
+ encoder_hidden_states (`torch.FloatTensor`):
+ cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+ values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+
+ """
+ hidden_states, self_attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ query_position_embeddings=query_position_embeddings,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states, cross_attn_weights = self.cross_attn(
+ hidden_states=hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ query_position_embeddings=query_position_embeddings,
+ object_queries=object_queries,
+ encoder_attention_mask=encoder_attention_mask,
+ query_sine_embed=query_sine_embed,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = self.mlp(hidden_states=hidden_states)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights, cross_attn_weights)
+
+ return outputs
+
+
+# Modified from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with DetrMLPPredictionHead->DabDetrMLP
+class DabDetrMLP(nn.Module):
+ """
+ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
+ height and width of a bounding box w.r.t. an image.
+
+ Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+
+ """
+
+ def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+ super().__init__()
+ self.num_layers = num_layers
+ h = [hidden_dim] * (num_layers - 1)
+ self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+ def forward(self, input_tensor):
+ for i, layer in enumerate(self.layers):
+ input_tensor = nn.functional.relu(layer(input_tensor)) if i < self.num_layers - 1 else layer(input_tensor)
+ return input_tensor
+
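A minimal usage sketch of the MLP head, mirroring how a 3-layer bounding-box predictor would be sized (the sizes are assumptions):

```python
import torch

bbox_head = DabDetrMLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
hidden = torch.randn(2, 300, 256)   # (batch_size, num_queries, hidden_size)
print(bbox_head(hidden).shape)      # torch.Size([2, 300, 4]) -> (center_x, center_y, width, height)
```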
+
+# Modified from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->DabDetr
+class DabDetrPreTrainedModel(PreTrainedModel):
+ config_class = DabDetrConfig
+ base_model_prefix = "model"
+ main_input_name = "pixel_values"
+ _no_split_modules = [r"DabDetrConvEncoder", r"DabDetrEncoderLayer", r"DabDetrDecoderLayer"]
+
+ def _init_weights(self, module):
+ std = self.config.init_std
+ xavier_std = self.config.init_xavier_std
+
+ if isinstance(module, DabDetrMHAttentionMap):
+ nn.init.zeros_(module.k_linear.bias)
+ nn.init.zeros_(module.q_linear.bias)
+ nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std)
+ nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std)
+ if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, DabDetrForObjectDetection):
+ nn.init.constant_(module.bbox_predictor.layers[-1].weight.data, 0)
+ nn.init.constant_(module.bbox_predictor.layers[-1].bias.data, 0)
+
+ # init prior_prob setting for focal loss
+ prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
+ bias_value = -math.log((1 - prior_prob) / prior_prob)
+ module.class_embed.bias.data.fill_(bias_value)
+
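The class-embedding bias above follows the standard focal-loss prior initialization: choosing `bias = -log((1 - p) / p)` makes the initial sigmoid output equal to the prior `p`. A quick arithmetic sketch, assuming `num_labels=91`:

```python
import math

num_labels = 91                                   # assumed, e.g. a COCO-style label count
prior_prob = 1 / (num_labels + 1)                 # ≈ 0.0109
bias_value = -math.log((1 - prior_prob) / prior_prob)

print(round(bias_value, 3))                       # ≈ -4.511
print(round(1 / (1 + math.exp(-bias_value)), 4))  # ≈ 0.0109 -> sigmoid(bias) recovers the prior
```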
+
+DAB_DETR_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`DabDetrConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DAB_DETR_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it.
+
+ Pixel values can be obtained using [`AutoImageProcessor`]. See [`DetrImageProcessor.__call__`]
+ for details.
+
+ pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+ Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
+
+ - 1 for pixels that are real (i.e. **not masked**),
+ - 0 for pixels that are padding (i.e. **masked**).
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
+ Not used by default. Can be used to mask object queries.
+ encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+ can choose to directly pass a flattened representation of an image.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+ Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+ embedded representation.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Modified from transformers.models.detr.modeling_detr.DetrEncoder with Detr->DabDetr,DETR->ConditionalDETR
+class DabDetrEncoder(DabDetrPreTrainedModel):
+ """
+ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+ [`DabDetrEncoderLayer`].
+
+ The encoder updates the flattened feature map through multiple self-attention layers.
+
+ Small tweak for DAB-DETR:
+
+ - object_queries are added to the forward pass.
+
+ Args:
+ config: DabDetrConfig
+ """
+
+ def __init__(self, config: DabDetrConfig):
+ super().__init__(config)
+
+ self.dropout = config.dropout
+ self.query_scale = DabDetrMLP(config.hidden_size, config.hidden_size, config.hidden_size, 2)
+ self.layers = nn.ModuleList([DabDetrEncoderLayer(config) for _ in range(config.encoder_layers)])
+ self.norm = nn.LayerNorm(config.hidden_size) if config.normalize_before else None
+ self.gradient_checkpointing = False
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ inputs_embeds,
+ attention_mask,
+ object_queries,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ):
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`):
+ Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
+
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
+
+ - 1 for pixel features that are real (i.e. **not masked**),
+ - 0 for pixel features that are padding (i.e. **masked**).
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ object_queries (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`):
+ Object queries that are added to the queries in each self-attention layer.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ hidden_states = inputs_embeds
+
+ # expand attention_mask
+ if attention_mask is not None:
+ # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+ attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ for encoder_layer in self.layers:
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ # pos scaler
+ pos_scales = self.query_scale(hidden_states)
+ # we add object_queries * pos_scaler as extra input to the encoder_layer
+ scaled_object_queries = object_queries * pos_scales
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ encoder_layer.__call__,
+ hidden_states,
+ attention_mask,
+ scaled_object_queries,
+ output_attentions,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ object_queries=scaled_object_queries,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if self.norm:
+ hidden_states = self.norm(hidden_states)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+# Modified from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrDecoder with ConditionalDetr->DabDetr,Conditional DETR->DAB-DETR
+class DabDetrDecoder(DabDetrPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DabDetrDecoderLayer`].
+
+ The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
+
+ Some small tweaks for DAB-DETR:
+
+ - object_queries and query_position_embeddings are added to the forward pass.
+ - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers.
+
+ Args:
+ config: DabDetrConfig
+ """
+
+ def __init__(self, config: DabDetrConfig):
+ super().__init__(config)
+ self.config = config
+ self.dropout = config.dropout
+ self.num_layers = config.decoder_layers
+ self.gradient_checkpointing = False
+
+ self.layers = nn.ModuleList(
+ [DabDetrDecoderLayer(config, is_first=(layer_id == 0)) for layer_id in range(config.decoder_layers)]
+ )
+ # in DAB-DETR, the decoder uses layernorm after the last decoder layer output
+ self.hidden_size = config.hidden_size
+ self.layernorm = nn.LayerNorm(self.hidden_size)
+
+ # Default cond-elewise
+ self.query_scale = DabDetrMLP(self.hidden_size, self.hidden_size, self.hidden_size, 2)
+
+ self.ref_point_head = DabDetrMLP(
+ config.query_dim // 2 * self.hidden_size, self.hidden_size, self.hidden_size, 2
+ )
+
+ self.bbox_embed = None
+
+ # Default decoder_modulate_hw_attn is True
+ self.ref_anchor_head = DabDetrMLP(self.hidden_size, self.hidden_size, 2, 2)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ inputs_embeds,
+ encoder_hidden_states,
+ memory_key_padding_mask,
+ object_queries,
+ query_position_embeddings,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ):
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`):
+ The query embeddings that are passed into the decoder.
+ encoder_hidden_states (`torch.FloatTensor` of shape `(encoder_sequence_length, batch_size, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+ of the decoder.
+ memory_key_padding_mask (`torch.Tensor.bool` of shape `(batch_size, sequence_length)`):
+ The memory_key_padding_mask indicates which positions in the memory (encoder outputs) should be ignored during the attention computation,
+ ensuring padding tokens do not influence the attention mechanism.
+ object_queries (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`, *optional*):
+ Position embeddings that are added to the queries and keys in each cross-attention layer.
+ query_position_embeddings (`torch.FloatTensor` of shape `(num_queries, batch_size, number_of_anchor_points)`):
+ Position embeddings that are added to the queries and keys in each self-attention layer.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if inputs_embeds is not None:
+ hidden_states = inputs_embeds
+ input_shape = inputs_embeds.size()[:-1]
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+ intermediate = []
+ reference_points = query_position_embeddings.sigmoid()
+ ref_points = [reference_points]
+
+ # expand encoder attention mask
+ if encoder_hidden_states is not None and memory_key_padding_mask is not None:
+ # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+ memory_key_padding_mask = _prepare_4d_attention_mask(
+ memory_key_padding_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+ )
+
+ for layer_id, decoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ obj_center = reference_points[..., : self.config.query_dim]
+ query_sine_embed = gen_sine_position_embeddings(obj_center, self.hidden_size)
+ query_pos = self.ref_point_head(query_sine_embed)
+
+ # For the first decoder layer, we do not apply transformation over p_s
+ pos_transformation = 1 if layer_id == 0 else self.query_scale(hidden_states)
+
+ # apply transformation
+ query_sine_embed = query_sine_embed[..., : self.hidden_size] * pos_transformation
+
+ # modulated Height Width attentions
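+ # The sine embedding is rescaled by the ratio between the width/height predicted from the current
+ # decoder state (ref_anchor_head) and the width/height stored in the current anchor box
+ # (obj_center[..., 2] and obj_center[..., 3]), so the positional attention adapts to the object size.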
+ reference_anchor_size = self.ref_anchor_head(hidden_states).sigmoid() # nq, bs, 2
+ query_sine_embed[..., self.hidden_size // 2 :] *= (
+ reference_anchor_size[..., 0] / obj_center[..., 2]
+ ).unsqueeze(-1)
+ query_sine_embed[..., : self.hidden_size // 2] *= (
+ reference_anchor_size[..., 1] / obj_center[..., 3]
+ ).unsqueeze(-1)
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ decoder_layer.__call__,
+ hidden_states,
+ None,
+ object_queries,
+ query_pos,
+ query_sine_embed,
+ encoder_hidden_states,
+ memory_key_padding_mask,
+ output_attentions,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=None,
+ object_queries=object_queries,
+ query_position_embeddings=query_pos,
+ query_sine_embed=query_sine_embed,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=memory_key_padding_mask,
+ output_attentions=output_attentions,
+ )
+
+ # iter update
+ hidden_states = layer_outputs[0]
+
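+ # Iterative anchor refinement: when bbox_embed is attached (iter_update), box deltas are predicted
+ # in inverse-sigmoid space, added to the current reference points and squashed back to [0, 1];
+ # the result is detached so each layer refines the anchors used by the next one.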
+ if self.bbox_embed is not None:
+ new_reference_points = self.bbox_embed(hidden_states)
+
+ new_reference_points[..., : self.config.query_dim] += inverse_sigmoid(reference_points)
+ new_reference_points = new_reference_points[..., : self.config.query_dim].sigmoid()
+ if layer_id != self.num_layers - 1:
+ ref_points.append(new_reference_points)
+ reference_points = new_reference_points.detach()
+
+ intermediate.append(self.layernorm(hidden_states))
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ if encoder_hidden_states is not None:
+ all_cross_attentions += (layer_outputs[2],)
+
+ # Layer normalization on hidden states
+ hidden_states = self.layernorm(hidden_states)
+
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ output_intermediate_hidden_states = torch.stack(intermediate)
+ output_reference_points = torch.stack(ref_points)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ all_hidden_states,
+ all_self_attns,
+ all_cross_attentions,
+ output_intermediate_hidden_states,
+ output_reference_points,
+ ]
+ if v is not None
+ )
+ return DabDetrDecoderOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ cross_attentions=all_cross_attentions,
+ intermediate_hidden_states=output_intermediate_hidden_states,
+ reference_points=output_reference_points,
+ )
+
+
+@add_start_docstrings(
+ """
+ The bare DAB-DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
+ hidden-states, intermediate hidden states, reference points and output coordinates without any specific head on top.
+ """,
+ DAB_DETR_START_DOCSTRING,
+)
+class DabDetrModel(DabDetrPreTrainedModel):
+ def __init__(self, config: DabDetrConfig):
+ super().__init__(config)
+
+ self.auxiliary_loss = config.auxiliary_loss
+
+ # Create backbone + positional encoding
+ self.backbone = DabDetrConvEncoder(config)
+ object_queries = DabDetrSinePositionEmbedding(config)
+
+ self.query_refpoint_embeddings = nn.Embedding(config.num_queries, config.query_dim)
+ self.random_refpoints_xy = config.random_refpoints_xy
+ if self.random_refpoints_xy:
+ self.query_refpoint_embeddings.weight.data[:, :2].uniform_(0, 1)
+ self.query_refpoint_embeddings.weight.data[:, :2] = inverse_sigmoid(
+ self.query_refpoint_embeddings.weight.data[:, :2]
+ )
+ self.query_refpoint_embeddings.weight.data[:, :2].requires_grad = False
+
+ # Create projection layer
+ self.input_projection = nn.Conv2d(
+ self.backbone.intermediate_channel_sizes[-1], config.hidden_size, kernel_size=1
+ )
+ self.backbone = DabDetrConvModel(self.backbone, object_queries)
+
+ self.encoder = DabDetrEncoder(config)
+ self.decoder = DabDetrDecoder(config)
+
+ # decoder related variables
+ self.hidden_size = config.hidden_size
+ self.num_queries = config.num_queries
+
+ self.num_patterns = config.num_patterns
+ if not isinstance(self.num_patterns, int):
+ logger.warning("num_patterns should be int but got {}".format(type(self.num_patterns)))
+ self.num_patterns = 0
+ if self.num_patterns > 0:
+ self.patterns = nn.Embedding(self.num_patterns, self.hidden_size)
+
+ self.aux_loss = config.auxiliary_loss
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_encoder(self):
+ return self.encoder
+
+ def get_decoder(self):
+ return self.decoder
+
+ def freeze_backbone(self):
+ for name, param in self.backbone.conv_encoder.model.named_parameters():
+ param.requires_grad_(False)
+
+ def unfreeze_backbone(self):
+ for name, param in self.backbone.conv_encoder.model.named_parameters():
+ param.requires_grad_(True)
+
+ @add_start_docstrings_to_model_forward(DAB_DETR_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=DabDetrModelOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ pixel_mask: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
+ encoder_outputs: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.FloatTensor], DabDetrModelOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, AutoModel
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("IDEA-Research/dab-detr-resnet-50")
+ >>> model = AutoModel.from_pretrained("IDEA-Research/dab-detr-resnet-50")
+
+ >>> # prepare image for the model
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+
+ >>> # forward pass
+ >>> outputs = model(**inputs)
+
+ >>> # the last hidden states are the final query embeddings of the Transformer decoder
+ >>> # these are of shape (batch_size, num_queries, hidden_size)
+ >>> last_hidden_states = outputs.last_hidden_state
+ >>> list(last_hidden_states.shape)
+ [1, 300, 256]
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ batch_size, _, height, width = pixel_values.shape
+ device = pixel_values.device
+
+ if pixel_mask is None:
+ pixel_mask = torch.ones(((batch_size, height, width)), device=device)
+
+ # First, send pixel_values + pixel_mask through the backbone to obtain the features
+ # pixel_values should be of shape (batch_size, num_channels, height, width)
+ # pixel_mask should be of shape (batch_size, height, width)
+ features, object_queries_list = self.backbone(pixel_values, pixel_mask)
+
+ # get final feature map and downsampled mask
+ feature_map, mask = features[-1]
+
+ if mask is None:
+ raise ValueError("Backbone does not return downsampled pixel mask")
+
+ flattened_mask = mask.flatten(1)
+
+ # Second, apply 1x1 convolution to reduce the channel dimension to hidden_size (256 by default)
+ projected_feature_map = self.input_projection(feature_map)
+
+ # Third, flatten the feature map + object_queries of shape NxCxHxW to NxCxHW, and permute it to NxHWxC
+ # In other words, turn their shape into (batch_size, sequence_length, hidden_size)
+ flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1)
+ object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1)
+ reference_position_embeddings = self.query_refpoint_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1)
+
+ # Fourth, send flattened_features + flattened_mask + object_queries through the encoder
+ # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
+ # flattened_mask is a Tensor of shape (batch_size, height*width)
+ if encoder_outputs is None:
+ encoder_outputs = self.encoder(
+ inputs_embeds=flattened_features,
+ attention_mask=flattened_mask,
+ object_queries=object_queries,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+ encoder_outputs = BaseModelOutput(
+ last_hidden_state=encoder_outputs[0],
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+ )
+
+ # Fifth, send query embeddings + object_queries through the decoder (which is conditioned on the encoder output)
+ num_queries = reference_position_embeddings.shape[1]
+ if self.num_patterns == 0:
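+ # Content queries start as zeros; positional information comes entirely from the learned
+ # anchor boxes (reference_position_embeddings) passed to the decoder below.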
+ queries = torch.zeros(batch_size, num_queries, self.hidden_size, device=device)
+ else:
+ queries = (
+ self.patterns.weight[:, None, None, :]
+ .repeat(1, self.num_queries, batch_size, 1)
+ .flatten(0, 1)
+ .permute(1, 0, 2)
+ ) # bs, n_q*n_pat, hidden_size
+ reference_position_embeddings = reference_position_embeddings.repeat(
+ 1, self.num_patterns, 1
+ ) # bs, n_q*n_pat, hidden_size
+
+ # decoder outputs consists of (dec_features, dec_hidden, dec_attn)
+ decoder_outputs = self.decoder(
+ inputs_embeds=queries,
+ query_position_embeddings=reference_position_embeddings,
+ object_queries=object_queries,
+ encoder_hidden_states=encoder_outputs[0],
+ memory_key_padding_mask=flattened_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ if not return_dict:
+ # last_hidden_state
+ output = (decoder_outputs[0],)
+ reference_points = decoder_outputs[-1]
+ intermediate_hidden_states = decoder_outputs[-2]
+
+ # It has to follow the order of DabDetrModelOutput, which is based on ModelOutput.
+ # If we only use one of the variables then the indexing will change.
+ # E.g.: if we return everything, 'decoder_attentions' is decoder_outputs[2]; if we only use output_attentions, it is decoder_outputs[1].
+ if output_hidden_states and output_attentions:
+ output += (
+ decoder_outputs[1],
+ decoder_outputs[2],
+ decoder_outputs[3],
+ encoder_outputs[0],
+ encoder_outputs[1],
+ encoder_outputs[2],
+ )
+ elif output_hidden_states:
+ # decoder_hidden_states, encoder_last_hidden_state, encoder_hidden_states
+ output += (
+ decoder_outputs[1],
+ encoder_outputs[0],
+ encoder_outputs[1],
+ )
+ elif output_attentions:
+ # decoder_self_attention, decoder_cross_attention, encoder_attentions
+ output += (
+ decoder_outputs[1],
+ decoder_outputs[2],
+ encoder_outputs[1],
+ )
+
+ output += (intermediate_hidden_states, reference_points)
+
+ return output
+
+ reference_points = decoder_outputs.reference_points
+ intermediate_hidden_states = decoder_outputs.intermediate_hidden_states
+
+ return DabDetrModelOutput(
+ last_hidden_state=decoder_outputs.last_hidden_state,
+ decoder_hidden_states=decoder_outputs.hidden_states if output_hidden_states else None,
+ decoder_attentions=decoder_outputs.attentions if output_attentions else None,
+ cross_attentions=decoder_outputs.cross_attentions if output_attentions else None,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state if output_hidden_states else None,
+ encoder_hidden_states=encoder_outputs.hidden_states if output_hidden_states else None,
+ encoder_attentions=encoder_outputs.attentions if output_attentions else None,
+ intermediate_hidden_states=intermediate_hidden_states,
+ reference_points=reference_points,
+ )
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->DabDetr
+class DabDetrMHAttentionMap(nn.Module):
+ """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""
+
+ def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None):
+ super().__init__()
+ self.num_heads = num_heads
+ self.hidden_dim = hidden_dim
+ self.dropout = nn.Dropout(dropout)
+
+ self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
+ self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
+
+ self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5
+
+ def forward(self, q, k, mask: Optional[Tensor] = None):
+ q = self.q_linear(q)
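+ # The key projection is applied as a 1x1 convolution built from the linear layer's weights,
+ # so it can operate directly on the (batch_size, hidden_dim, height, width) feature map.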
+ k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias)
+ queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads)
+ keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1])
+ weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head)
+
+ if mask is not None:
+ weights = weights.masked_fill(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min)
+ weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size())
+ weights = self.dropout(weights)
+ return weights
+
+
+@add_start_docstrings(
+ """
+ DAB-DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on
+ top, for tasks such as COCO detection.
+ """,
+ DAB_DETR_START_DOCSTRING,
+)
+class DabDetrForObjectDetection(DabDetrPreTrainedModel):
+ # When using clones, all layers > 0 will be clones, but layer 0 *is* required
+ _tied_weights_keys = [
+ r"bbox_predictor\.layers\.\d+\.(weight|bias)",
+ r"model\.decoder\.bbox_embed\.layers\.\d+\.(weight|bias)",
+ ]
+
+ def __init__(self, config: DabDetrConfig):
+ super().__init__(config)
+
+ self.config = config
+ self.auxiliary_loss = config.auxiliary_loss
+ self.query_dim = config.query_dim
+ # DAB-DETR encoder-decoder model
+ self.model = DabDetrModel(config)
+
+ _bbox_embed = DabDetrMLP(config.hidden_size, config.hidden_size, 4, 3)
+ # Object detection heads
+ self.class_embed = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Default bbox_embed_diff_each_layer is False
+ self.bbox_predictor = _bbox_embed
+
+ # Default iter_update is True
+ self.model.decoder.bbox_embed = self.bbox_predictor
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/dab_detr.py
+ @torch.jit.unused
+ def _set_aux_loss(self, outputs_class, outputs_coord):
+ # this is a workaround to make torchscript happy, as torchscript
+ # doesn't support dictionary with non-homogeneous values, such
+ # as a dict having both a Tensor and a list.
+ return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
+
+ @add_start_docstrings_to_model_forward(DAB_DETR_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=DabDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ pixel_mask: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
+ encoder_outputs: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[List[dict]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.FloatTensor], DabDetrObjectDetectionOutput]:
+ r"""
+ labels (`List[Dict]` of len `(batch_size,)`, *optional*):
+ Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+ following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+ in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
+ >>> from PIL import Image
+ >>> import requests
+ >>> import torch
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("IDEA-Research/dab-detr-resnet-50")
+ >>> model = AutoModelForObjectDetection.from_pretrained("IDEA-Research/dab-detr-resnet-50")
+
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+
+ >>> with torch.no_grad():
+ ...     outputs = model(**inputs)
+
+ >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+ >>> target_sizes = torch.tensor([(image.height, image.width)])
+ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
+ >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+ ... box = [round(i, 2) for i in box.tolist()]
+ ... print(
+ ... f"Detected {model.config.id2label[label.item()]} with confidence "
+ ... f"{round(score.item(), 3)} at location {box}"
+ ... )
+ Detected remote with confidence 0.833 at location [38.31, 72.1, 177.63, 118.45]
+ Detected cat with confidence 0.831 at location [9.2, 51.38, 321.13, 469.0]
+ Detected cat with confidence 0.804 at location [340.3, 16.85, 642.93, 370.95]
+ Detected remote with confidence 0.683 at location [334.48, 73.49, 366.37, 190.01]
+ Detected couch with confidence 0.535 at location [0.52, 1.19, 640.35, 475.1]
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # First, send images through the DAB-DETR base model to obtain encoder + decoder outputs
+ model_outputs = self.model(
+ pixel_values,
+ pixel_mask=pixel_mask,
+ decoder_attention_mask=decoder_attention_mask,
+ encoder_outputs=encoder_outputs,
+ inputs_embeds=inputs_embeds,
+ decoder_inputs_embeds=decoder_inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ reference_points = model_outputs.reference_points if return_dict else model_outputs[-1]
+ intermediate_hidden_states = model_outputs.intermediate_hidden_states if return_dict else model_outputs[-2]
+
+ # class logits + predicted bounding boxes
+ logits = self.class_embed(intermediate_hidden_states[-1])
+
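+ # Boxes are predicted as offsets in inverse-sigmoid (logit) space relative to the decoder's
+ # reference points, then mapped back to normalized [0, 1] coordinates with a sigmoid.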
+ reference_before_sigmoid = inverse_sigmoid(reference_points)
+ bbox_with_refinement = self.bbox_predictor(intermediate_hidden_states)
+ bbox_with_refinement[..., : self.query_dim] += reference_before_sigmoid
+ outputs_coord = bbox_with_refinement.sigmoid()
+
+ pred_boxes = outputs_coord[-1]
+
+ loss, loss_dict, auxiliary_outputs = None, None, None
+ if labels is not None:
+ outputs_class = None
+ if self.config.auxiliary_loss:
+ outputs_class = self.class_embed(intermediate_hidden_states)
+ loss, loss_dict, auxiliary_outputs = self.loss_function(
+ logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord
+ )
+
+ if not return_dict:
+ if auxiliary_outputs is not None:
+ output = (logits, pred_boxes) + auxiliary_outputs + model_outputs
+ else:
+ output = (logits, pred_boxes) + model_outputs
+ # Since DabDetrObjectDetectionOutput doesn't include reference_points and intermediate_hidden_states, we drop them here.
+ return ((loss, loss_dict) + output) if loss is not None else output[:-2]
+
+ return DabDetrObjectDetectionOutput(
+ loss=loss,
+ loss_dict=loss_dict,
+ logits=logits,
+ pred_boxes=pred_boxes,
+ auxiliary_outputs=auxiliary_outputs,
+ last_hidden_state=model_outputs.last_hidden_state,
+ decoder_hidden_states=model_outputs.decoder_hidden_states if output_hidden_states else None,
+ decoder_attentions=model_outputs.decoder_attentions if output_attentions else None,
+ cross_attentions=model_outputs.cross_attentions if output_attentions else None,
+ encoder_last_hidden_state=model_outputs.encoder_last_hidden_state if output_hidden_states else None,
+ encoder_hidden_states=model_outputs.encoder_hidden_states if output_hidden_states else None,
+ encoder_attentions=model_outputs.encoder_attentions if output_attentions else None,
+ )
+
+
+__all__ = [
+ "DabDetrForObjectDetection",
+ "DabDetrModel",
+ "DabDetrPreTrainedModel",
+]
diff --git a/docs/transformers/build/lib/transformers/models/dac/__init__.py b/docs/transformers/build/lib/transformers/models/dac/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..40f84ccb59be91ad440ba413366dd726e555ca26
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/dac/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_dac import *
+ from .feature_extraction_dac import *
+ from .modeling_dac import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/dac/configuration_dac.py b/docs/transformers/build/lib/transformers/models/dac/configuration_dac.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbbefd3ccb59d774f063d2bf85d637fa31896c92
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/dac/configuration_dac.py
@@ -0,0 +1,114 @@
+# coding=utf-8
+# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Dac model configuration"""
+
+import math
+
+import numpy as np
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DacConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`DacModel`]. It is used to instantiate a
+ Dac model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the
+ [descript/dac_16khz](https://huggingface.co/descript/dac_16khz) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ encoder_hidden_size (`int`, *optional*, defaults to 64):
+ Intermediate representation dimension for the encoder.
+ downsampling_ratios (`List[int]`, *optional*, defaults to `[2, 4, 8, 8]`):
+ Ratios for downsampling in the encoder. These are used in reverse order for upsampling in the decoder.
+ decoder_hidden_size (`int`, *optional*, defaults to 1536):
+ Intermediate representation dimension for the decoder.
+ n_codebooks (`int`, *optional*, defaults to 9):
+ Number of codebooks in the VQVAE.
+ codebook_size (`int`, *optional*, defaults to 1024):
+ Number of discrete codes in each codebook.
+ codebook_dim (`int`, *optional*, defaults to 8):
+ Dimension of the codebook vectors. If not defined, uses `encoder_hidden_size`.
+ quantizer_dropout (`bool`, *optional*, defaults to 0):
+ Whether to apply dropout to the quantizer.
+ commitment_loss_weight (`float`, *optional*, defaults to 0.25):
+ Weight of the commitment loss term in the VQVAE loss function.
+ codebook_loss_weight (`float`, *optional*, defaults to 1.0):
+ Weight of the codebook loss term in the VQVAE loss function.
+ sampling_rate (`int`, *optional*, defaults to 16000):
+ The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
+ Example:
+
+ ```python
+ >>> from transformers import DacModel, DacConfig
+
+ >>> # Initializing a "descript/dac_16khz" style configuration
+ >>> configuration = DacConfig()
+
+ >>> # Initializing a model (with random weights) from the "descript/dac_16khz" style configuration
+ >>> model = DacModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "dac"
+
+ def __init__(
+ self,
+ encoder_hidden_size=64,
+ downsampling_ratios=[2, 4, 8, 8],
+ decoder_hidden_size=1536,
+ n_codebooks=9,
+ codebook_size=1024,
+ codebook_dim=8,
+ quantizer_dropout=0,
+ commitment_loss_weight=0.25,
+ codebook_loss_weight=1.0,
+ sampling_rate=16000,
+ **kwargs,
+ ):
+ self.encoder_hidden_size = encoder_hidden_size
+ self.downsampling_ratios = downsampling_ratios
+ self.decoder_hidden_size = decoder_hidden_size
+ self.upsampling_ratios = downsampling_ratios[::-1]
+ self.n_codebooks = n_codebooks
+ self.codebook_size = codebook_size
+ self.codebook_dim = codebook_dim
+ self.quantizer_dropout = quantizer_dropout
+ self.sampling_rate = sampling_rate
+
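+ # Each encoder block doubles the channel dimension, so the latent (hidden) size is
+ # encoder_hidden_size * 2 ** len(downsampling_ratios), e.g. 64 * 2**4 = 1024 with the defaults.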
+ self.hidden_size = encoder_hidden_size * (2 ** len(downsampling_ratios))
+
+ self.hop_length = int(np.prod(downsampling_ratios))
+ self.commitment_loss_weight = commitment_loss_weight
+ self.codebook_loss_weight = codebook_loss_weight
+
+ super().__init__(**kwargs)
+
+ @property
+ def frame_rate(self) -> int:
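+ # Number of latent frames produced per second of audio: with the default configuration,
+ # hop_length = 2 * 4 * 8 * 8 = 512 and frame_rate = ceil(16000 / 512) = 32.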
+ hop_length = np.prod(self.upsampling_ratios)
+ return math.ceil(self.sampling_rate / hop_length)
+
+
+__all__ = ["DacConfig"]
diff --git a/docs/transformers/build/lib/transformers/models/dac/convert_dac_checkpoint.py b/docs/transformers/build/lib/transformers/models/dac/convert_dac_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1728a7da11a50e08625ff74d78b4c119d44601a
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/dac/convert_dac_checkpoint.py
@@ -0,0 +1,261 @@
+# coding=utf-8
+# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import fnmatch
+import re
+
+import torch
+
+from transformers import (
+ DacConfig,
+ DacFeatureExtractor,
+ DacModel,
+ logging,
+)
+
+
+# checkpoints downloaded using:
+# pip install descript-audio-codec
+# python3 -m dac download # downloads the default 44kHz variant
+# python3 -m dac download --model_type 44khz # downloads the 44kHz variant
+# python3 -m dac download --model_type 24khz # downloads the 24kHz variant
+# python3 -m dac download --model_type 16khz # downloads the 16kHz variant
+# More information: https://github.com/descriptinc/descript-audio-codec/tree/main
+
+logging.set_verbosity_info()
+logger = logging.get_logger("transformers.models.dac")
+
+
+def match_pattern(string, pattern):
+ # Split the pattern into parts
+ pattern_parts = pattern.split(".")
+ string_parts = string.split(".")
+
+ pattern_block_count = string_block_count = 0
+
+ for part in pattern_parts:
+ if part.startswith("block"):
+ pattern_block_count += 1
+
+ for part in string_parts:
+ if part.startswith("block"):
+ string_block_count += 1
+
+ return fnmatch.fnmatch(string, pattern) and string_block_count == pattern_block_count
+
+
+TOP_LEVEL_KEYS = []
+IGNORE_KEYS = []
+
+
+MAPPING_ENCODER = {
+ "encoder.block.0": ["encoder.conv1"],
+ "encoder.block.5": ["encoder.snake1"],
+ "encoder.block.6": ["encoder.conv2"],
+ "encoder.block.*.block.*.block.0".replace("*", r"\d+"): ["encoder.block", "res_unit", "snake1"],
+ "encoder.block.*.block.*.block.1".replace("*", r"\d+"): ["encoder.block", "res_unit", "conv1"],
+ "encoder.block.*.block.*.block.2".replace("*", r"\d+"): ["encoder.block", "res_unit", "snake2"],
+ "encoder.block.*.block.*.block.3".replace("*", r"\d+"): ["encoder.block", "res_unit", "conv2"],
+ "encoder.block.*.block.3".replace("*", r"\d+"): ["encoder.block", "snake1"],
+ "encoder.block.*.block.4".replace("*", r"\d+"): ["encoder.block", "conv1"],
+}
+
+MAPPING_QUANTIZER = {
+ "quantizer.quantizers.*": ["quantizer.quantizers.*"],
+}
+
+MAPPING_DECODER = {
+ "decoder.model.0": ["decoder.conv1"],
+ "decoder.model.5": ["decoder.snake1"],
+ "decoder.model.6": ["decoder.conv2"],
+ "decoder.model.*.block.0".replace("*", r"\d+"): ["decoder.block", "snake1"],
+ "decoder.model.*.block.1".replace("*", r"\d+"): ["decoder.block", "conv_t1"],
+ "decoder.model.*.block.*.block.0".replace("*", r"\d+"): ["decoder.block", "res_unit", "snake1"],
+ "decoder.model.*.block.*.block.1".replace("*", r"\d+"): ["decoder.block", "res_unit", "conv1"],
+ "decoder.model.*.block.*.block.2".replace("*", r"\d+"): ["decoder.block", "res_unit", "snake2"],
+ "decoder.model.*.block.*.block.3".replace("*", r"\d+"): ["decoder.block", "res_unit", "conv2"],
+}
+
+
+MAPPING = {
+ **MAPPING_ENCODER,
+ **MAPPING_QUANTIZER,
+ **MAPPING_DECODER,
+}
+
+
+def set_recursively(hf_pointer, key, value, full_name, weight_type):
+ for attribute in key.split("."):
+ hf_pointer = getattr(hf_pointer, attribute)
+
+ if weight_type is not None:
+ hf_shape = getattr(hf_pointer, weight_type).shape
+ else:
+ hf_shape = hf_pointer.shape
+
+ if hf_shape != value.shape:
+ raise ValueError(
+ f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
+ f" {value.shape} for {full_name}"
+ )
+
+ if weight_type == "weight":
+ hf_pointer.weight.data = value
+ elif weight_type == "weight_g":
+ hf_pointer.weight_g.data = value
+ elif weight_type == "weight_v":
+ hf_pointer.weight_v.data = value
+ elif weight_type == "bias":
+ hf_pointer.bias.data = value
+ elif weight_type == "alpha":
+ hf_pointer.alpha.data = value
+ logger.info(f"{key + ('.' + weight_type if weight_type is not None else '')} was initialized from {full_name}.")
+
+
+def should_ignore(name, ignore_keys):
+ for key in ignore_keys:
+ if key.endswith(".*"):
+ if name.startswith(key[:-1]):
+ return True
+ elif ".*." in key:
+ prefix, suffix = key.split(".*.")
+ if prefix in name and suffix in name:
+ return True
+ elif key in name:
+ return True
+ return False
+
+
+def recursively_load_weights(orig_dict, hf_model, model_name):
+ unused_weights = []
+
+ if model_name not in ["dac_16khz", "dac_24khz", "dac_44khz"]:
+ raise ValueError(f"Unsupported model: {model_name}")
+
+ for name, value in orig_dict.items():
+ is_used = False
+ for key, mapped_key in MAPPING.items():
+ regex = re.compile(key)
+ if regex.search(name):
+ if len(mapped_key) == 1:
+ if mapped_key[0][0] == "q":
+ mapped_key = ".".join(name.split(".")[:-1])
+ else:
+ mapped_key = mapped_key[0]
+ elif len(mapped_key) == 3:
+ integers = re.findall(r"\b\d+\b", name)
+ if mapped_key[0][0] == "d":
+ mapped_key = "{}.{}.{}{}.{}".format(
+ mapped_key[0],
+ str(int(integers[0]) - 1),
+ mapped_key[1],
+ str(int(integers[1]) - 1),
+ mapped_key[2],
+ )
+ else:
+ mapped_key = "{}.{}.{}{}.{}".format(
+ mapped_key[0],
+ str(int(integers[0]) - 1),
+ mapped_key[1],
+ str(int(integers[1]) + 1),
+ mapped_key[2],
+ )
+ elif len(mapped_key) == 2:
+ integers = re.findall(r"\b\d+\b", name)
+ mapped_key = "{}.{}.{}".format(mapped_key[0], str(int(integers[0]) - 1), mapped_key[1])
+
+ is_used = True
+ if "weight_g" in name:
+ weight_type = "weight_g"
+ elif "weight_v" in name:
+ weight_type = "weight_v"
+ elif "bias" in name:
+ weight_type = "bias"
+ elif "alpha" in name:
+ weight_type = "alpha"
+ elif "weight" in name:
+ weight_type = "weight"
+ set_recursively(hf_model, mapped_key, value, name, weight_type)
+
+ if not is_used:
+ unused_weights.append(name)
+
+ print(list(set(unused_weights)))
+
+ logger.warning(f"Unused weights: {unused_weights}")
+
+
+@torch.no_grad()
+def convert_checkpoint(
+ model_name,
+ checkpoint_path,
+ pytorch_dump_folder_path,
+ sample_rate=16000,
+ repo_id=None,
+):
+ model_dict = torch.load(checkpoint_path, "cpu", weights_only=True)
+
+ config = DacConfig()
+
+ metadata = model_dict["metadata"]["kwargs"]
+ config.encoder_hidden_size = metadata["encoder_dim"]
+ config.downsampling_ratios = metadata["encoder_rates"]
+ config.codebook_size = metadata["codebook_size"]
+ config.n_codebooks = metadata["n_codebooks"]
+ config.codebook_dim = metadata["codebook_dim"]
+ config.decoder_hidden_size = metadata["decoder_dim"]
+ config.upsampling_ratios = metadata["decoder_rates"]
+ config.quantizer_dropout = float(metadata["quantizer_dropout"])
+ config.sampling_rate = sample_rate
+
+ model = DacModel(config)
+ feature_extractor = DacFeatureExtractor()
+ feature_extractor.sampling_rate = sample_rate
+
+ original_checkpoint = model_dict["state_dict"]
+
+ model.apply_weight_norm()
+ recursively_load_weights(original_checkpoint, model, model_name)
+ model.remove_weight_norm()
+
+ model.save_pretrained(pytorch_dump_folder_path)
+
+ if repo_id:
+ print("Pushing to the hub...")
+ feature_extractor.push_to_hub(repo_id)
+ model.push_to_hub(repo_id)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--model",
+ default="dac_44khz",
+ type=str,
+ help="The model to convert. Should be one of 'dac_16khz', 'dac_24khz', 'dac_44khz'.",
+ )
+ parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
+ parser.add_argument(
+ "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
+ )
+ parser.add_argument(
+ "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
+ )
+ parser.add_argument("--sample_rate", default=None, type=str, help="Sample rate used by DacFeatureExtractor")
+ args = parser.parse_args()
+
+ convert_checkpoint(
+ args.model, args.checkpoint_path, args.pytorch_dump_folder_path, args.sample_rate, args.push_to_hub
+ )
diff --git a/docs/transformers/build/lib/transformers/models/dac/feature_extraction_dac.py b/docs/transformers/build/lib/transformers/models/dac/feature_extraction_dac.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e5bfadb6141e92c7ee78155fe816a298e725694
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/dac/feature_extraction_dac.py
@@ -0,0 +1,173 @@
+# coding=utf-8
+# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for DAC"""
+
+from typing import List, Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DacFeatureExtractor(SequenceFeatureExtractor):
+ r"""
+ Constructs a Dac feature extractor.
+
+ This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+ most of the main methods. Users should refer to this superclass for more information regarding those methods.
+
+ Args:
+ feature_size (`int`, *optional*, defaults to 1):
+ The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
+ sampling_rate (`int`, *optional*, defaults to 16000):
+ The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
+ padding_value (`float`, *optional*, defaults to 0.0):
+ The value that is used for padding.
+ hop_length (`int`, *optional*, defaults to 512):
+ Overlap length between successive windows.
+ """
+
+ model_input_names = ["input_values", "n_quantizers"]
+
+ def __init__(
+ self,
+ feature_size: int = 1,
+ sampling_rate: int = 16000,
+ padding_value: float = 0.0,
+ hop_length: int = 512,
+ **kwargs,
+ ):
+ super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+ self.hop_length = hop_length
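+ # Inputs are later padded to a multiple of hop_length (see `pad_to_multiple_of` in `__call__`)
+ # so that the model's cumulative downsampling yields a whole number of frames.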
+
+ def __call__(
+ self,
+ raw_audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
+ padding: Optional[Union[bool, str, PaddingStrategy]] = None,
+ truncation: Optional[bool] = False,
+ max_length: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ sampling_rate: Optional[int] = None,
+ ) -> BatchFeature:
+ """
+ Main method to featurize and prepare for the model one or several sequence(s).
+
+ Args:
+ raw_audio (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
+ The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
+ values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
+ `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
+ (`feature_size = 2`).
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding
+ index) among:
+
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+ sequence is provided).
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+ acceptable input length for the model if that argument is not provided.
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+ lengths).
+ truncation (`bool`, *optional*, defaults to `False`):
+ Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+ max_length (`int`, *optional*):
+ Maximum length of the returned list and optionally padding length (see above).
+ return_tensors (`str` or [`~utils.TensorType`], *optional*, default to 'pt'):
+ If set, will return tensors instead of list of python integers. Acceptable values are:
+
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return Numpy `np.ndarray` objects.
+ sampling_rate (`int`, *optional*):
+ The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
+ `sampling_rate` at the forward call to prevent silent errors.
+ """
+ if sampling_rate is not None:
+ if sampling_rate != self.sampling_rate:
+ raise ValueError(
+ f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+ f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
+ f" {self.sampling_rate} and not {sampling_rate}."
+ )
+ else:
+ logger.warning(
+ f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
+ "Failing to do so can result in silent errors that might be hard to debug."
+ )
+
+ if padding and truncation:
+ raise ValueError("Both padding and truncation were set. Make sure you only set one.")
+ elif padding is None:
+ # by default let's pad the inputs
+ padding = True
+
+ is_batched = bool(
+ isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
+ )
+
+ if is_batched:
+ raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
+ elif not is_batched and not isinstance(raw_audio, np.ndarray):
+ raw_audio = np.asarray(raw_audio, dtype=np.float32)
+ elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
+ raw_audio = raw_audio.astype(np.float32)
+
+ # always return batch
+ if not is_batched:
+ raw_audio = [np.asarray(raw_audio).T]
+
+ # verify inputs are valid
+ for idx, example in enumerate(raw_audio):
+ if example.ndim > 2:
+ raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
+ if self.feature_size == 1 and example.ndim != 1:
+ raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
+ if self.feature_size == 2:
+ raise ValueError("Stereo audio isn't supported for now")
+
+ input_values = BatchFeature({"input_values": raw_audio})
+
+ # normal padding on batch
+ padded_inputs = self.pad(
+ input_values,
+ max_length=max_length,
+ truncation=truncation,
+ padding=padding,
+ return_attention_mask=False,
+ pad_to_multiple_of=self.hop_length,
+ )
+
+ if padding:
+ padded_inputs.input_values = padded_inputs.input_values[:, np.newaxis, :]
+
+ input_values = []
+ for example in padded_inputs.pop("input_values"):
+ if self.feature_size == 1:
+ example = example[..., None]
+ input_values.append(example.T)
+
+ padded_inputs["input_values"] = input_values
+ if return_tensors is not None:
+ padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+ return padded_inputs
+
+
+__all__ = ["DacFeatureExtractor"]
diff --git a/docs/transformers/build/lib/transformers/models/dac/modeling_dac.py b/docs/transformers/build/lib/transformers/models/dac/modeling_dac.py
new file mode 100644
index 0000000000000000000000000000000000000000..47a45f44e9199eca08f49c81df56e5f00c0b969f
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/dac/modeling_dac.py
@@ -0,0 +1,724 @@
+# coding=utf-8
+# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformers DAC model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ replace_return_docstrings,
+)
+from .configuration_dac import DacConfig
+
+
+# General docstring
+_CONFIG_FOR_DOC = "DacConfig"
+
+
+@dataclass
+class DacOutput(ModelOutput):
+ """
+ Args:
+ loss (`torch.Tensor`):
+ Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
+ audio_values (`torch.Tensor` of shape `(batch_size, input_length)`):
+ Reconstructed audio data.
+ quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
+ Quantized continuous representation of input.
+ audio_codes (`torch.LongTensor` of shape `(batch_size, num_codebooks, time_steps)`):
+ Codebook indices for each codebook (quantized discrete representation of input).
+ projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`):
+ Projected latents (continuous representation of input before quantization).
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ audio_values: Optional[torch.FloatTensor] = None
+ quantized_representation: Optional[torch.FloatTensor] = None
+ audio_codes: Optional[torch.LongTensor] = None
+ projected_latents: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class DacEncoderOutput(ModelOutput):
+ """
+ Args:
+ loss (`torch.Tensor`):
+ Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
+ quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`, *optional*):
+ Quantized continuous representation of input.
+ audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*):
+ Codebook indices for each codebook (quantized discrete representation of input).
+ projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`, *optional*):
+ Projected latents (continuous representation of input before quantization).
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ quantized_representation: Optional[torch.FloatTensor] = None
+ audio_codes: Optional[torch.FloatTensor] = None
+ projected_latents: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+# Copied from transformers.models.encodec.modeling_encodec.EncodecDecoderOutput with Encodec->Dac, segment_length->input_length
+class DacDecoderOutput(ModelOutput):
+ """
+ Args:
+ audio_values (`torch.FloatTensor` of shape `(batch_size, input_length)`, *optional*):
+ Decoded audio values, obtained using the decoder part of Dac.
+ """
+
+ audio_values: Optional[torch.FloatTensor] = None
+
+
+class Snake1d(nn.Module):
+ """
+ A 1-dimensional Snake activation function module.
+ """
+
+ def __init__(self, hidden_dim):
+ super().__init__()
+ self.alpha = nn.Parameter(torch.ones(1, hidden_dim, 1))
+
+ def forward(self, hidden_states):
+ shape = hidden_states.shape
+ hidden_states = hidden_states.reshape(shape[0], shape[1], -1)
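+ # Snake activation: x + (1 / alpha) * sin^2(alpha * x), applied element-wise;
+ # the 1e-9 added to alpha guards against division by zero for very small learned alpha.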
+ hidden_states = hidden_states + (self.alpha + 1e-9).reciprocal() * torch.sin(self.alpha * hidden_states).pow(2)
+ hidden_states = hidden_states.reshape(shape)
+ return hidden_states
+
+
+class DacVectorQuantize(nn.Module):
+ """
+ Implementation of VQ similar to Karpathy's repo (https://github.com/karpathy/deep-vector-quantization)
+
+ Additionally uses the following tricks from the improved VQGAN
+ (https://arxiv.org/pdf/2110.04627.pdf):
+ 1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
+ for improved codebook usage
+ 2. l2-normalized codes: Converts euclidean distance to cosine similarity which
+ improves training stability
+ """
+
+ def __init__(self, config: DacConfig):
+ super().__init__()
+
+ self.in_proj = nn.Conv1d(config.hidden_size, config.codebook_dim, kernel_size=1)
+ self.out_proj = nn.Conv1d(config.codebook_dim, config.hidden_size, kernel_size=1)
+ self.codebook = nn.Embedding(config.codebook_size, config.codebook_dim)
+
+ def forward(self, hidden_state):
+ """
+ Quantizes the input tensor using a fixed codebook and returns the corresponding codebook vectors.
+
+ Args:
+ hidden_state (`torch.FloatTensor` of shape `(batch_size, dimension, time_steps)`):
+ Input tensor.
+
+ Returns:
+ quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
+ Quantized continuous representation of input.
+ commitment_loss (`torch.FloatTensor` of shape `(1)`):
+ Commitment loss to train encoder to predict vectors closer to codebook entries.
+ codebook_loss (`torch.FloatTensor` of shape `(1)`):
+ Codebook loss to update the codebook.
+ audio_codes (`torch.LongTensor` of shape `(batch_size, time_steps)`):
+ Codebook indices for each codebook, quantized discrete representation of input.
+ projected_latents (`torch.FloatTensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`):
+ Projected latents (continuous representation of input before quantization).
+ """
+
+ projected_latents = self.in_proj(hidden_state)
+ quantized_representation, audio_codes = self.decode_latents(projected_latents)
+
+ commitment_loss = F.mse_loss(projected_latents, quantized_representation.detach(), reduction="mean")
+ codebook_loss = F.mse_loss(quantized_representation, projected_latents.detach(), reduction="mean")
+ # noop in forward pass, straight-through gradient estimator in backward pass
+ quantized_representation = projected_latents + (quantized_representation - projected_latents).detach()
+ quantized_representation = self.out_proj(quantized_representation)
+
+ return quantized_representation, commitment_loss, codebook_loss, audio_codes, projected_latents
+
+ def decode_latents(self, hidden_states):
+ batch_size, hidden_dim, sequence_length = hidden_states.shape
+ encodings = hidden_states.permute(0, 2, 1).reshape(batch_size * sequence_length, hidden_dim)
+ codebook = self.codebook.weight # codebook: (N x D)
+
+ # L2 normalize encodings and codebook (ViT-VQGAN)
+ encodings = F.normalize(encodings)
+ codebook = F.normalize(codebook)
+
+ # Compute negative squared euclidean distance to each codebook entry; since both sets
+ # of rows are L2-normalized above, this is monotone in cosine similarity, so the
+ # `argmax` below picks the nearest codebook vector.
+ l2_norm = encodings.pow(2).sum(1, keepdim=True)
+ dist = -(l2_norm - 2 * encodings @ codebook.t()) - codebook.pow(2).sum(1, keepdim=True).t()
+
+ indices = dist.max(1)[1]
+ indices = indices.reshape(hidden_states.size(0), -1)
+ quantized_representation = self.codebook(indices).transpose(1, 2)
+ return quantized_representation, indices
+
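+# Note (illustrative): because both `encodings` and `codebook` are L2-normalized in
+# `decode_latents`, maximizing the negative squared distance is equivalent to
+#
+#   indices = (encodings @ codebook.t()).argmax(dim=1)  # nearest neighbor by cosine similarity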
+
+class DacResidualUnit(nn.Module):
+ """
+ A residual unit composed of Snake1d and weight-normalized Conv1d layers with dilations.
+ """
+
+ def __init__(self, dimension: int = 16, dilation: int = 1):
+ super().__init__()
+ pad = ((7 - 1) * dilation) // 2
+
+ self.snake1 = Snake1d(dimension)
+ self.conv1 = nn.Conv1d(dimension, dimension, kernel_size=7, dilation=dilation, padding=pad)
+ self.snake2 = Snake1d(dimension)
+ self.conv2 = nn.Conv1d(dimension, dimension, kernel_size=1)
+
+ def forward(self, hidden_state):
+ """
+ Forward pass through the residual unit.
+
+ Args:
+ hidden_state (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
+ Input tensor.
+
+ Returns:
+ output_tensor (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
+ Input tensor after passing through the residual unit.
+ """
+ output_tensor = hidden_state
+ output_tensor = self.conv1(self.snake1(output_tensor))
+ output_tensor = self.conv2(self.snake2(output_tensor))
+
+ padding = (hidden_state.shape[-1] - output_tensor.shape[-1]) // 2
+ if padding > 0:
+ hidden_state = hidden_state[..., padding:-padding]
+ output_tensor = hidden_state + output_tensor
+ return output_tensor
+
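+# With kernel_size=7, dilation `d` and padding `3 * d`, `conv1` preserves the time length
+# (and `conv2` has kernel_size=1), so the trim in `forward` computes `padding == 0` and is
+# a defensive no-op unless the padding scheme is changed.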
+
+class DacEncoderBlock(nn.Module):
+ """Encoder block used in DAC encoder."""
+
+ def __init__(self, config: DacConfig, stride: int = 1, stride_index: int = 1):
+ super().__init__()
+
+ dimension = config.encoder_hidden_size * 2**stride_index
+ self.res_unit1 = DacResidualUnit(dimension // 2, dilation=1)
+ self.res_unit2 = DacResidualUnit(dimension // 2, dilation=3)
+ self.res_unit3 = DacResidualUnit(dimension // 2, dilation=9)
+ self.snake1 = Snake1d(dimension // 2)
+ self.conv1 = nn.Conv1d(
+ dimension // 2, dimension, kernel_size=2 * stride, stride=stride, padding=math.ceil(stride / 2)
+ )
+
+ def forward(self, hidden_state):
+ hidden_state = self.res_unit1(hidden_state)
+ hidden_state = self.res_unit2(hidden_state)
+ hidden_state = self.snake1(self.res_unit3(hidden_state))
+ hidden_state = self.conv1(hidden_state)
+
+ return hidden_state
+
+
+class DacDecoderBlock(nn.Module):
+ """Decoder block used in DAC decoder."""
+
+ def __init__(self, config: DacConfig, stride: int = 1, stride_index: int = 1):
+ super().__init__()
+
+ input_dim = config.decoder_hidden_size // 2**stride_index
+ output_dim = config.decoder_hidden_size // 2 ** (stride_index + 1)
+ self.snake1 = Snake1d(input_dim)
+ self.conv_t1 = nn.ConvTranspose1d(
+ input_dim,
+ output_dim,
+ kernel_size=2 * stride,
+ stride=stride,
+ padding=math.ceil(stride / 2),
+ )
+
+ self.res_unit1 = DacResidualUnit(output_dim, dilation=1)
+ self.res_unit2 = DacResidualUnit(output_dim, dilation=3)
+ self.res_unit3 = DacResidualUnit(output_dim, dilation=9)
+
+ def forward(self, hidden_state):
+ hidden_state = self.snake1(hidden_state)
+ hidden_state = self.conv_t1(hidden_state)
+ hidden_state = self.res_unit1(hidden_state)
+ hidden_state = self.res_unit2(hidden_state)
+ hidden_state = self.res_unit3(hidden_state)
+
+ return hidden_state
+
+
+class DacResidualVectorQuantize(nn.Module):
+ """
+ ResidualVectorQuantize block. Introduced in SoundStream: An End-to-End Neural Audio Codec (https://arxiv.org/abs/2107.03312).
+ """
+
+ def __init__(self, config: DacConfig):
+ super().__init__()
+
+ self.n_codebooks = config.n_codebooks
+ self.quantizer_dropout = config.quantizer_dropout
+
+ self.quantizers = nn.ModuleList([DacVectorQuantize(config) for _ in range(config.n_codebooks)])
+
+ def forward(self, hidden_state, n_quantizers: Optional[int] = None):
+ """
+ Quantizes the input tensor using a fixed set of codebooks and returns corresponding codebook vectors.
+ Args:
+ hidden_state (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
+ Input tensor to be quantized.
+ n_quantizers (`int`, *optional*):
+ Number of quantizers to use. If `None`, all quantizers are used. Note that this argument is
+ ignored in training mode, where a random number of quantizers is drawn per batch element
+ according to `self.quantizer_dropout`.
+
+ Returns:
+ quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
+ Quantized continuous representation of input.
+ audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`):
+ Codebook indices for each codebook (quantized discrete representation of input).
+ projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`):
+ Projected latents (continuous representation of input before quantization).
+ commitment_loss (`torch.Tensor` of shape `(1)`):
+ Commitment loss to train the encoder to predict vectors closer to codebook entries.
+ codebook_loss (`torch.Tensor` of shape `(1)`):
+ Codebook loss to update the codebook.
+ """
+
+ quantized_representation = 0
+ residual = hidden_state
+ commitment_loss = 0
+ codebook_loss = 0
+
+ audio_codes = []
+ projected_latents = []
+
+ n_quantizers = n_quantizers if n_quantizers is not None else self.n_codebooks
+ if self.training:
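+ # `n_codebooks + 1` makes the `mask` below (i < n_quantizers) pass for every quantizer
+ # by default; the first `n_dropout` samples are then reassigned a random depth in
+ # [1, n_codebooks], implementing per-sample quantizer dropout.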
+ n_quantizers = torch.ones((hidden_state.shape[0],)) * self.n_codebooks + 1
+ dropout = torch.randint(1, self.n_codebooks + 1, (hidden_state.shape[0],))
+ n_dropout = int(hidden_state.shape[0] * self.quantizer_dropout)
+ n_quantizers[:n_dropout] = dropout[:n_dropout]
+ n_quantizers = n_quantizers.to(hidden_state.device)
+
+ for i, quantizer in enumerate(self.quantizers):
+ if self.training is False and i >= n_quantizers:
+ break
+
+ quantized_representation_i, commitment_loss_i, codebook_loss_i, indices_i, projected_latents_i = quantizer(
+ residual
+ )
+
+ # Create mask to apply quantizer dropout
+ mask = torch.full((hidden_state.shape[0],), fill_value=i, device=hidden_state.device) < n_quantizers
+ quantized_representation = quantized_representation + quantized_representation_i * mask[:, None, None]
+ residual = residual - quantized_representation_i
+
+ # Sum losses
+ commitment_loss += commitment_loss_i * mask
+ codebook_loss += codebook_loss_i * mask
+
+ audio_codes.append(indices_i)
+ projected_latents.append(projected_latents_i)
+
+ audio_codes = torch.stack(audio_codes, dim=1)
+ projected_latents = torch.cat(projected_latents, dim=1)
+
+ return quantized_representation, audio_codes, projected_latents, commitment_loss, codebook_loss
+
+ def from_codes(self, audio_codes: torch.Tensor):
+ """
+ Reconstructs the continuous representation from quantized codes.
+
+ Args:
+ audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`):
+ Quantized discrete representation of input.
+
+ Returns:
+ quantized_representation (`torch.Tensor`):
+ Quantized continuous representation of input.
+ projected_latents (`torch.Tensor`):
+ Projected latents (continuous representations of input before quantization), concatenated
+ along the channel dimension, one block per codebook.
+ audio_codes (`torch.Tensor`):
+ Codebook indices for each codebook.
+ """
+ quantized_representation = 0.0
+ projected_latents = []
+ n_codebooks = audio_codes.shape[1]
+ for i in range(n_codebooks):
+ projected_latents_i = self.quantizers[i].codebook(audio_codes[:, i, :]).transpose(1, 2)
+ projected_latents.append(projected_latents_i)
+ quantized_representation += self.quantizers[i].out_proj(projected_latents_i)
+ return quantized_representation, torch.cat(projected_latents, dim=1), audio_codes
+
+ def from_latents(self, latents: torch.Tensor):
+ """Reconstructs the quantized representation from unquantized latents.
+
+ Args:
+ latents (`torch.Tensor` of shape `(batch_size, total_latent_dimension, time_steps)`):
+ Continuous representation of input after projection.
+
+ Returns:
+ quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
+ Quantized representation of the full-projected space.
+ quantized_latents (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
+ Quantized representation of the latent space (continuous representation before quantization).
+ """
+ quantized_representation = 0
+ quantized_latents = []
+ codes = []
+ codebook_dims_tensor = torch.tensor([0] + [q.codebook_dim for q in self.quantizers])
+ dims = torch.cumsum(codebook_dims_tensor, dim=0)
+
+ n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[0]
+ for i in range(n_codebooks):
+ hidden_dim_j, hidden_dim_k = dims[i], dims[i + 1]
+ quantized_latents_i, codes_i = self.quantizers[i].decode_latents(latents[:, hidden_dim_j:hidden_dim_k, :])
+ quantized_latents.append(quantized_latents_i)
+ codes.append(codes_i)
+
+ quantized_representation_i = self.quantizers[i].out_proj(quantized_latents_i)
+ quantized_representation = quantized_representation + quantized_representation_i
+
+ return quantized_representation, torch.cat(quantized_latents, dim=1)
+
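+# Minimal round-trip sketch (assumes a default `DacConfig()`; illustrative only):
+#
+#   rvq = DacResidualVectorQuantize(config)
+#   codes = torch.randint(0, config.codebook_size, (1, config.n_codebooks, 50))
+#   quantized, latents, codes = rvq.from_codes(codes)
+#   # quantized: (1, config.hidden_size, 50), the sum of each codebook's projected embedding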
+
+class DacDecoder(nn.Module):
+ """DAC Decoder"""
+
+ def __init__(self, config: DacConfig):
+ super().__init__()
+
+ input_channel = config.hidden_size
+ channels = config.decoder_hidden_size
+ strides = config.upsampling_ratios
+
+ # Add first conv layer
+ self.conv1 = nn.Conv1d(input_channel, channels, kernel_size=7, padding=3)
+
+ # Add upsampling + MRF blocks
+ block = []
+ for stride_index, stride in enumerate(strides):
+ block += [DacDecoderBlock(config, stride, stride_index)]
+
+ self.block = nn.ModuleList(block)
+ output_dim = config.decoder_hidden_size // 2 ** (stride_index + 1)
+ self.snake1 = Snake1d(output_dim)
+ self.conv2 = nn.Conv1d(output_dim, 1, kernel_size=7, padding=3)
+ self.tanh = nn.Tanh()
+
+ def forward(self, hidden_state):
+ hidden_state = self.conv1(hidden_state)
+
+ for layer in self.block:
+ hidden_state = layer(hidden_state)
+
+ hidden_state = self.snake1(hidden_state)
+ hidden_state = self.conv2(hidden_state)
+ hidden_state = self.tanh(hidden_state)
+
+ return hidden_state
+
+
+class DacEncoder(nn.Module):
+ """DAC Encoder"""
+
+ def __init__(self, config: DacConfig):
+ super().__init__()
+
+ strides = config.downsampling_ratios
+ # Create first convolution
+ self.conv1 = nn.Conv1d(1, config.encoder_hidden_size, kernel_size=7, padding=3)
+
+ self.block = []
+ # Create EncoderBlocks that double channels as they downsample by `stride`
+ for stride_index, stride in enumerate(strides):
+ stride_index = stride_index + 1
+ self.block += [DacEncoderBlock(config, stride=stride, stride_index=stride_index)]
+
+ self.block = nn.ModuleList(self.block)
+ d_model = config.encoder_hidden_size * 2**stride_index
+ self.snake1 = Snake1d(d_model)
+ self.conv2 = nn.Conv1d(d_model, config.hidden_size, kernel_size=3, padding=1)
+
+ def forward(self, hidden_state):
+ hidden_state = self.conv1(hidden_state)
+
+ for module in self.block:
+ hidden_state = module(hidden_state)
+
+ hidden_state = self.snake1(hidden_state)
+ hidden_state = self.conv2(hidden_state)
+
+ return hidden_state
+
+
+class DacPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
+ """
+
+ config_class = DacConfig
+ base_model_prefix = "dac"
+ main_input_name = "input_values"
+
+ def _init_weights(self, module):
+ if isinstance(module, nn.Conv1d):
+ nn.init.trunc_normal_(module.weight, std=0.02)
+ nn.init.constant_(module.bias, 0)
+
+ def apply_weight_norm(self):
+ weight_norm = nn.utils.weight_norm
+ if hasattr(nn.utils.parametrizations, "weight_norm"):
+ weight_norm = nn.utils.parametrizations.weight_norm
+
+ for layer in self.quantizer.quantizers:
+ weight_norm(layer.in_proj)
+ weight_norm(layer.out_proj)
+
+ weight_norm(self.encoder.conv1)
+ weight_norm(self.encoder.conv2)
+
+ for layer in self.encoder.block:
+ weight_norm(layer.conv1)
+ weight_norm(layer.res_unit1.conv1)
+ weight_norm(layer.res_unit1.conv2)
+ weight_norm(layer.res_unit2.conv1)
+ weight_norm(layer.res_unit2.conv2)
+ weight_norm(layer.res_unit3.conv1)
+ weight_norm(layer.res_unit3.conv2)
+
+ weight_norm(self.decoder.conv1)
+ weight_norm(self.decoder.conv2)
+
+ for layer in self.decoder.block:
+ weight_norm(layer.conv_t1)
+ weight_norm(layer.res_unit1.conv1)
+ weight_norm(layer.res_unit1.conv2)
+ weight_norm(layer.res_unit2.conv1)
+ weight_norm(layer.res_unit2.conv2)
+ weight_norm(layer.res_unit3.conv1)
+ weight_norm(layer.res_unit3.conv2)
+
+ def remove_weight_norm(self):
+ for layer in self.quantizer.quantizers:
+ nn.utils.remove_weight_norm(layer.in_proj)
+ nn.utils.remove_weight_norm(layer.out_proj)
+
+ nn.utils.remove_weight_norm(self.encoder.conv1)
+ nn.utils.remove_weight_norm(self.encoder.conv2)
+
+ for layer in self.encoder.block:
+ nn.utils.remove_weight_norm(layer.conv1)
+ nn.utils.remove_weight_norm(layer.res_unit1.conv1)
+ nn.utils.remove_weight_norm(layer.res_unit1.conv2)
+ nn.utils.remove_weight_norm(layer.res_unit2.conv1)
+ nn.utils.remove_weight_norm(layer.res_unit2.conv2)
+ nn.utils.remove_weight_norm(layer.res_unit3.conv1)
+ nn.utils.remove_weight_norm(layer.res_unit3.conv2)
+
+ nn.utils.remove_weight_norm(self.decoder.conv1)
+ nn.utils.remove_weight_norm(self.decoder.conv2)
+
+ for layer in self.decoder.block:
+ nn.utils.remove_weight_norm(layer.conv_t1)
+ nn.utils.remove_weight_norm(layer.res_unit1.conv1)
+ nn.utils.remove_weight_norm(layer.res_unit1.conv2)
+ nn.utils.remove_weight_norm(layer.res_unit2.conv1)
+ nn.utils.remove_weight_norm(layer.res_unit2.conv2)
+ nn.utils.remove_weight_norm(layer.res_unit3.conv1)
+ nn.utils.remove_weight_norm(layer.res_unit3.conv2)
+
+
+DAC_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`DacConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DAC_INPUTS_DOCSTRING = r"""
+ Args:
+ input_values (`torch.Tensor` of shape `(batch_size, 1, time_steps)`):
+ Audio data to encode.
+ n_quantizers (`int`, *optional*):
+ Number of quantizers to use. If `None`, all quantizers are used. Default is `None`.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The DAC (Descript Audio Codec) model.",
+ DAC_START_DOCSTRING,
+)
+class DacModel(DacPreTrainedModel):
+ def __init__(self, config: DacConfig):
+ super().__init__(config)
+ self.config = config
+
+ self.encoder = DacEncoder(config)
+ self.decoder = DacDecoder(config)
+
+ self.quantizer = DacResidualVectorQuantize(config)
+
+ self.bits_per_codebook = int(math.log2(self.config.codebook_size))
+ if 2**self.bits_per_codebook != self.config.codebook_size:
+ raise ValueError("The codebook_size must be a power of 2.")
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @replace_return_docstrings(output_type=DacEncoderOutput, config_class=_CONFIG_FOR_DOC)
+ def encode(
+ self,
+ input_values: torch.Tensor,
+ n_quantizers: Optional[int] = None,
+ return_dict: Optional[bool] = None,
+ ):
+ """
+ Encode given audio data and return quantized latent codes.
+
+ Args:
+ input_values (`torch.Tensor` of shape `(batch_size, 1, time_steps)`):
+ Input audio data to encode.
+ n_quantizers (`int`, *optional*):
+ Number of quantizers to use. If `None`, all quantizers are used. Default is `None`.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ Returns:
+
+ """
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+ quantized_representation = self.encoder(input_values)
+ quantized_representation, audio_codes, projected_latents, commitment_loss, codebook_loss = self.quantizer(
+ quantized_representation, n_quantizers
+ )
+
+ loss = self.config.commitment_loss_weight * commitment_loss + self.config.codebook_loss_weight * codebook_loss
+
+ if not return_dict:
+ return (loss, quantized_representation, audio_codes, projected_latents)
+
+ return DacEncoderOutput(loss, quantized_representation, audio_codes, projected_latents)
+
+ @replace_return_docstrings(output_type=DacDecoderOutput, config_class=_CONFIG_FOR_DOC)
+ def decode(
+ self,
+ quantized_representation: Optional[torch.Tensor] = None,
+ audio_codes: Optional[torch.Tensor] = None,
+ return_dict: Optional[bool] = None,
+ ):
+ """Decode given latent codes and return audio data
+
+ Args:
+ quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`, *optional*):
+ Quantized continuous representation of input.
+ audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*):
+ The codebook indices for each codebook, representing the quantized discrete
+ representation of the input. This parameter should be provided if you want
+ to decode directly from the audio codes (it will overwrite `quantized_representation`).
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+ Returns:
+
+ """
+
+ if quantized_representation is None and audio_codes is None:
+ raise ValueError("Either `quantized_representation` or `audio_codes` must be provided.")
+
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+ if audio_codes is not None:
+ quantized_representation = self.quantizer.from_codes(audio_codes)[0]
+
+ audio_values = self.decoder(quantized_representation).squeeze(1)
+
+ if not return_dict:
+ return (audio_values,)
+
+ return DacDecoderOutput(audio_values)
+
+ @add_start_docstrings_to_model_forward(DAC_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=DacOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_values: torch.Tensor,
+ n_quantizers: Optional[int] = None,
+ return_dict: Optional[bool] = None,
+ ):
+ """
+ Returns:
+ Examples:
+
+ ```python
+ >>> from datasets import load_dataset, Audio
+ >>> from transformers import DacModel, AutoProcessor
+ >>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+
+ >>> model = DacModel.from_pretrained("descript/dac_16khz")
+ >>> processor = AutoProcessor.from_pretrained("descript/dac_16khz")
+ >>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
+ >>> audio_sample = librispeech_dummy[-1]["audio"]["array"]
+ >>> inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt")
+
+ >>> encoder_outputs = model.encode(inputs["input_values"])
+ >>> # Get the intermediate audio codes
+ >>> audio_codes = encoder_outputs.audio_codes
+ >>> # Reconstruct the audio from its quantized representation
+ >>> audio_values = model.decode(encoder_outputs.quantized_representation)
+ >>> # or the equivalent with a forward pass
+ >>> audio_values = model(inputs["input_values"]).audio_values
+ ```"""
+
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+ length = input_values.shape[-1]
+ loss, quantized_representation, audio_codes, projected_latents = self.encode(
+ input_values, n_quantizers, return_dict=False
+ )
+ audio_values = self.decode(quantized_representation, return_dict=False)[0][..., :length]
+
+ if not return_dict:
+ return (loss, audio_values, quantized_representation, audio_codes, projected_latents)
+
+ return DacOutput(loss, audio_values, quantized_representation, audio_codes, projected_latents)
+
+
+__all__ = ["DacModel", "DacPreTrainedModel"]
diff --git a/docs/transformers/build/lib/transformers/models/data2vec/__init__.py b/docs/transformers/build/lib/transformers/models/data2vec/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7000ac3d353bf4eba157b07e350b9ac5f7552a98
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/data2vec/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_data2vec_audio import *
+ from .configuration_data2vec_text import *
+ from .configuration_data2vec_vision import *
+ from .modeling_data2vec_audio import *
+ from .modeling_data2vec_text import *
+ from .modeling_data2vec_vision import *
+ from .modeling_tf_data2vec_vision import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/data2vec/configuration_data2vec_audio.py b/docs/transformers/build/lib/transformers/models/data2vec/configuration_data2vec_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..8066829027fb6e62e1f3bf4e15f229da81e92106
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/data2vec/configuration_data2vec_audio.py
@@ -0,0 +1,288 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Data2VecText configuration"""
+
+import math
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Data2VecAudioConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate
+ a Data2VecAudio model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the Data2VecAudio
+ [facebook/data2vec-audio-base-960h](https://huggingface.co/facebook/data2vec-audio-base-960h) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 32):
+ Vocabulary size of the Data2VecAudio model. Defines the number of different tokens that can be represented
+ by the `inputs_ids` passed when calling [`Data2VecAudioModel`] or [`TFData2VecAudioModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ activation_dropout (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for activations inside the fully connected layer.
+ attention_dropout (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention probabilities.
+ final_dropout (`float`, *optional*, defaults to 0.1):
+ The dropout probability for the final projection layer of [`Data2VecAudioForCTC`].
+ layerdrop (`float`, *optional*, defaults to 0.1):
+ The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+ details.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ feat_proj_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout probability for output of the feature encoder.
+ feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the 1D convolutional layers of the feature
+ extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+ A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
+ feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
+ conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+ A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
+ of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+ conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 2, 2)`):
+ A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
+ length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+ *conv_dim*.
+ conv_bias (`bool`, *optional*, defaults to `False`):
+ Whether the 1D convolutional layers have a bias.
+ num_conv_pos_embeddings (`int`, *optional*, defaults to 5):
+ Number of 1D convolutional positional embedding layers stacked in the positional
+ convolution module.
+ conv_pos_kernel_size (`int`, *optional*, defaults to 19):
+ Kernel size of each 1D convolutional positional embedding layer.
+ num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
+ Number of groups of 1D convolutional positional embeddings layer.
+ mask_time_prob (`float`, *optional*, defaults to 0.05):
+ Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
+ procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
+ reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
+ masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
+ actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+ mask_time_length (`int`, *optional*, defaults to 10):
+ Length of vector span along the time axis.
+ mask_time_min_masks (`int`, *optional*, defaults to 2):
+ The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
+ irrespective of `mask_time_prob`. Only relevant if `mask_time_prob*len(time_axis)/mask_time_length <
+ mask_time_min_masks`.
+ mask_feature_prob (`float`, *optional*, defaults to 0.0):
+ Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
+ masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks
+ over the axis. If reasoning from the probability of each feature vector to be chosen as the start of the
+ vector span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that
+ overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment
+ is True`.
+ mask_feature_length (`int`, *optional*, defaults to 10):
+ Length of vector span along the feature axis.
+ mask_feature_min_masks (`int`, *optional*, defaults to 0):
+ The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+ step, irrespective of `mask_feature_prob`. Only relevant if
+ `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`.
+ ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
+ Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+ instance of [`Data2VecAudioForCTC`].
+ ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+ Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
+ occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
+ of [`Data2VecAudioForCTC`].
+ use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
+ Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
+ instance of [`Data2VecAudioForSequenceClassification`].
+ classifier_proj_size (`int`, *optional*, defaults to 256):
+ Dimensionality of the projection before token mean-pooling for classification.
+ tdnn_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+ A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
+ module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
+ tdnn_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+ A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
+ *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
+ tdnn_dilation (`Tuple[int]` or `List[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+ A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
+ *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
+ xvector_output_dim (`int`, *optional*, defaults to 512):
+ Dimensionality of the *XVector* embedding vectors.
+ add_adapter (`bool`, *optional*, defaults to `False`):
+ Whether a convolutional network should be stacked on top of the Data2VecAudio Encoder. Can be very useful
+ for warm-starting Data2VecAudio for SpeechEncoderDecoder models.
+ adapter_kernel_size (`int`, *optional*, defaults to 3):
+ Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+ adapter_stride (`int`, *optional*, defaults to 2):
+ Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+ num_adapter_layers (`int`, *optional*, defaults to 3):
+ Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is
+ True`.
+ output_hidden_size (`int`, *optional*):
+ Dimensionality of the encoder output layer. If not defined, this defaults to `hidden_size`. Only relevant
+ if `add_adapter is True`.
+
+ Example:
+
+ ```python
+ >>> from transformers import Data2VecAudioConfig, Data2VecAudioModel
+
+ >>> # Initializing a Data2VecAudio facebook/data2vec-audio-base-960h style configuration
+ >>> configuration = Data2VecAudioConfig()
+
+ >>> # Initializing a model (with random weights) from the facebook/data2vec-audio-base-960h style configuration
+ >>> model = Data2VecAudioModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "data2vec-audio"
+
+ def __init__(
+ self,
+ vocab_size=32,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout=0.1,
+ activation_dropout=0.1,
+ attention_dropout=0.1,
+ feat_proj_dropout=0.0,
+ final_dropout=0.1,
+ layerdrop=0.1,
+ initializer_range=0.02,
+ layer_norm_eps=1e-5,
+ feat_extract_activation="gelu",
+ conv_dim=(512, 512, 512, 512, 512, 512, 512),
+ conv_stride=(5, 2, 2, 2, 2, 2, 2),
+ conv_kernel=(10, 3, 3, 3, 3, 2, 2),
+ conv_bias=False,
+ num_conv_pos_embedding_groups=16,
+ conv_pos_kernel_size=19,
+ num_conv_pos_embeddings=5,
+ mask_time_prob=0.05,
+ mask_time_length=10,
+ mask_time_min_masks=2,
+ mask_feature_prob=0.0,
+ mask_feature_length=10,
+ mask_feature_min_masks=0,
+ ctc_loss_reduction="sum",
+ ctc_zero_infinity=False,
+ use_weighted_layer_sum=False,
+ classifier_proj_size=256,
+ tdnn_dim=(512, 512, 512, 512, 1500),
+ tdnn_kernel=(5, 3, 3, 1, 1),
+ tdnn_dilation=(1, 2, 3, 1, 1),
+ xvector_output_dim=512,
+ pad_token_id=0,
+ bos_token_id=1,
+ eos_token_id=2,
+ add_adapter=False,
+ adapter_kernel_size=3,
+ adapter_stride=2,
+ num_adapter_layers=3,
+ output_hidden_size=None,
+ **kwargs,
+ ):
+ super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+ self.hidden_size = hidden_size
+ self.feat_extract_activation = feat_extract_activation
+ self.conv_dim = list(conv_dim)
+ self.conv_stride = list(conv_stride)
+ self.conv_kernel = list(conv_kernel)
+ self.conv_bias = conv_bias
+ self.num_conv_pos_embeddings = num_conv_pos_embeddings
+ self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
+ self.conv_pos_kernel_size = conv_pos_kernel_size
+ self.num_feat_extract_layers = len(self.conv_dim)
+ self.num_hidden_layers = num_hidden_layers
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.num_attention_heads = num_attention_heads
+ self.hidden_dropout = hidden_dropout
+ self.attention_dropout = attention_dropout
+ self.activation_dropout = activation_dropout
+ self.feat_proj_dropout = feat_proj_dropout
+ self.final_dropout = final_dropout
+ self.layerdrop = layerdrop
+ self.layer_norm_eps = layer_norm_eps
+ self.initializer_range = initializer_range
+ self.vocab_size = vocab_size
+ self.use_weighted_layer_sum = use_weighted_layer_sum
+
+ if (
+ (len(self.conv_stride) != self.num_feat_extract_layers)
+ or (len(self.conv_kernel) != self.num_feat_extract_layers)
+ or (len(self.conv_dim) != self.num_feat_extract_layers)
+ ):
+ raise ValueError(
+ "Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` =="
+ " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) ="
+ f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`,"
+ f" `len(config.conv_kernel) = {len(self.conv_kernel)}`."
+ )
+
+ # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779
+ self.mask_time_prob = mask_time_prob
+ self.mask_time_length = mask_time_length
+ self.mask_time_min_masks = mask_time_min_masks
+ self.mask_feature_prob = mask_feature_prob
+ self.mask_feature_length = mask_feature_length
+ self.mask_feature_min_masks = mask_feature_min_masks
+
+ # ctc loss
+ self.ctc_loss_reduction = ctc_loss_reduction
+ self.ctc_zero_infinity = ctc_zero_infinity
+
+ # adapter
+ self.add_adapter = add_adapter
+ self.adapter_kernel_size = adapter_kernel_size
+ self.adapter_stride = adapter_stride
+ self.num_adapter_layers = num_adapter_layers
+ self.output_hidden_size = output_hidden_size or hidden_size
+
+ # SequenceClassification-specific parameter. Feel free to ignore for other classes.
+ self.classifier_proj_size = classifier_proj_size
+
+ # XVector-specific parameters. Feel free to ignore for other classes.
+ self.tdnn_dim = list(tdnn_dim)
+ self.tdnn_kernel = list(tdnn_kernel)
+ self.tdnn_dilation = list(tdnn_dilation)
+ self.xvector_output_dim = xvector_output_dim
+
+ @property
+ def inputs_to_logits_ratio(self):
+ return math.prod(self.conv_stride)
+
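+# For the default `conv_stride = (5, 2, 2, 2, 2, 2, 2)`, `inputs_to_logits_ratio` is
+# 5 * 2**6 = 320: one encoder frame per 320 input samples, i.e. 20 ms at 16 kHz.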
+
+__all__ = ["Data2VecAudioConfig"]
diff --git a/docs/transformers/build/lib/transformers/models/data2vec/configuration_data2vec_text.py b/docs/transformers/build/lib/transformers/models/data2vec/configuration_data2vec_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aa9a6b7bf22a8b0bbe9665ede242fa6e895cf7b
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/data2vec/configuration_data2vec_text.py
@@ -0,0 +1,154 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Data2VecText configuration"""
+
+from collections import OrderedDict
+from typing import Mapping
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Data2VecTextConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`Data2VecTextModel`]. It
+ is used to instantiate a Data2VecText model according to the specified arguments, defining the model architecture.
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2VecText
+ [facebook/data2vec-text-base](https://huggingface.co/facebook/data2vec-text-base) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 30522):
+ Vocabulary size of the Data2VecText model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`Data2VecTextModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention probabilities.
+ max_position_embeddings (`int`, *optional*, defaults to 512):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`Data2VecTextModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+ positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+ [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
+ For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+ with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+ is_decoder (`bool`, *optional*, defaults to `False`):
+ Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ classifier_dropout (`float`, *optional*):
+ The dropout ratio for the classification head.
+
+ Examples:
+
+ ```python
+ >>> from transformers import Data2VecTextConfig, Data2VecTextModel
+
+ >>> # Initializing a Data2VecText facebook/data2vec-text-base style configuration
+ >>> configuration = Data2VecTextConfig()
+
+ >>> # Initializing a model (with random weights) from the facebook/data2vec-text-base style configuration
+ >>> model = Data2VecTextModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "data2vec-text"
+
+ def __init__(
+ self,
+ vocab_size=30522,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ max_position_embeddings=512,
+ type_vocab_size=2,
+ initializer_range=0.02,
+ layer_norm_eps=1e-12,
+ pad_token_id=1,
+ bos_token_id=0,
+ eos_token_id=2,
+ position_embedding_type="absolute",
+ use_cache=True,
+ classifier_dropout=None,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.hidden_act = hidden_act
+ self.intermediate_size = intermediate_size
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.max_position_embeddings = max_position_embeddings
+ self.type_vocab_size = type_vocab_size
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.position_embedding_type = position_embedding_type
+ self.use_cache = use_cache
+ self.classifier_dropout = classifier_dropout
+
+
+class Data2VecTextOnnxConfig(OnnxConfig):
+ @property
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
+ if self.task == "multiple-choice":
+ dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+ else:
+ dynamic_axis = {0: "batch", 1: "sequence"}
+ return OrderedDict(
+ [
+ ("input_ids", dynamic_axis),
+ ("attention_mask", dynamic_axis),
+ ]
+ )
+
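+# With the default task, the exported dynamic axes are therefore
+# {"input_ids": {0: "batch", 1: "sequence"}, "attention_mask": {0: "batch", 1: "sequence"}};
+# the extra "choice" axis only appears for multiple-choice models.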
+
+__all__ = ["Data2VecTextConfig", "Data2VecTextOnnxConfig"]
diff --git a/docs/transformers/build/lib/transformers/models/data2vec/configuration_data2vec_vision.py b/docs/transformers/build/lib/transformers/models/data2vec/configuration_data2vec_vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..b822b03ef3eb2028bbb22ab31ac50f3cb6dd4173
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/data2vec/configuration_data2vec_vision.py
@@ -0,0 +1,194 @@
+# coding=utf-8
+# Copyright Meta Platforms and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Data2VecVision model configuration"""
+
+from collections import OrderedDict
+from typing import Mapping
+
+from packaging import version
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Data2VecVisionConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`Data2VecVisionModel`]. It is used to instantiate
+ a Data2VecVision model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the Data2VecVision
+ [facebook/data2vec-vision-base](https://huggingface.co/facebook/data2vec-vision-base) architecture.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 16):
+ The size (resolution) of each patch.
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ use_mask_token (`bool`, *optional*, defaults to `False`):
+ Whether to use a mask token for masked image modeling.
+ use_absolute_position_embeddings (`bool`, *optional*, defaults to `False`):
+ Whether to use BERT-style absolute position embeddings.
+ use_relative_position_bias (`bool`, *optional*, defaults to `False`):
+ Whether to use T5-style relative position embeddings in the self-attention layers.
+ use_shared_relative_position_bias (`bool`, *optional*, defaults to `False`):
+ Whether to use the same relative position embeddings across all self-attention layers of the Transformer.
+ layer_scale_init_value (`float`, *optional*, defaults to 0.1):
+ Scale to use in the self-attention layers. 0.1 for base, 1e-5 for large. Set 0 to disable layer scale.
+ drop_path_rate (`float`, *optional*, defaults to 0.1):
+ Stochastic depth rate per sample (when applied in the main path of residual layers).
+ use_mean_pooling (`bool`, *optional*, defaults to `True`):
+ Whether to mean pool the final hidden states of the patches instead of using the final hidden state of the
+ CLS token, before applying the classification head.
+ out_indices (`List[int]`, *optional*, defaults to `[3, 5, 7, 11]`):
+ Indices of the feature maps to use for semantic segmentation.
+ pool_scales (`Tuple[int]`, *optional*, defaults to `[1, 2, 3, 6]`):
+ Pooling scales used in Pooling Pyramid Module applied on the last feature map.
+ use_auxiliary_head (`bool`, *optional*, defaults to `True`):
+ Whether to use an auxiliary head during training.
+ auxiliary_loss_weight (`float`, *optional*, defaults to 0.4):
+ Weight of the cross-entropy loss of the auxiliary head.
+ auxiliary_channels (`int`, *optional*, defaults to 256):
+ Number of channels to use in the auxiliary head.
+ auxiliary_num_convs (`int`, *optional*, defaults to 1):
+ Number of convolutional layers to use in the auxiliary head.
+ auxiliary_concat_input (`bool`, *optional*, defaults to `False`):
+ Whether to concatenate the output of the auxiliary head with the input before the classification layer.
+ semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
+ The index that is ignored by the loss function of the semantic segmentation model.
+
+ Example:
+
+ ```python
+ >>> from transformers import Data2VecVisionConfig, Data2VecVisionModel
+
+ >>> # Initializing a Data2VecVision data2vec_vision-base-patch16-224-in22k style configuration
+ >>> configuration = Data2VecVisionConfig()
+
+ >>> # Initializing a model (with random weights) from the data2vec_vision-base-patch16-224-in22k style configuration
+ >>> model = Data2VecVisionModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "data2vec-vision"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.0,
+ attention_probs_dropout_prob=0.0,
+ initializer_range=0.02,
+ layer_norm_eps=1e-12,
+ image_size=224,
+ patch_size=16,
+ num_channels=3,
+ use_mask_token=False,
+ use_absolute_position_embeddings=False,
+ use_relative_position_bias=False,
+ use_shared_relative_position_bias=False,
+ layer_scale_init_value=0.1,
+ drop_path_rate=0.1,
+ use_mean_pooling=True,
+ out_indices=[3, 5, 7, 11],
+ pool_scales=[1, 2, 3, 6],
+ use_auxiliary_head=True,
+ auxiliary_loss_weight=0.4,
+ auxiliary_channels=256,
+ auxiliary_num_convs=1,
+ auxiliary_concat_input=False,
+ semantic_loss_ignore_index=255,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.use_mask_token = use_mask_token
+ self.use_absolute_position_embeddings = use_absolute_position_embeddings
+ self.use_relative_position_bias = use_relative_position_bias
+ self.use_shared_relative_position_bias = use_shared_relative_position_bias
+ self.layer_scale_init_value = layer_scale_init_value
+ self.drop_path_rate = drop_path_rate
+ self.use_mean_pooling = use_mean_pooling
+ # decode head attributes (semantic segmentation)
+ self.out_indices = out_indices
+ self.pool_scales = pool_scales
+ # auxiliary head attributes (semantic segmentation)
+ self.use_auxiliary_head = use_auxiliary_head
+ self.auxiliary_loss_weight = auxiliary_loss_weight
+ self.auxiliary_channels = auxiliary_channels
+ self.auxiliary_num_convs = auxiliary_num_convs
+ self.auxiliary_concat_input = auxiliary_concat_input
+ self.semantic_loss_ignore_index = semantic_loss_ignore_index
+
+
+# Copied from transformers.models.vit.configuration_vit.ViTOnnxConfig
+class Data2VecVisionOnnxConfig(OnnxConfig):
+ torch_onnx_minimum_version = version.parse("1.11")
+
+ @property
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
+ return OrderedDict(
+ [
+ ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+ ]
+ )
+
+ @property
+ def atol_for_validation(self) -> float:
+ return 1e-4
+
+
+__all__ = ["Data2VecVisionConfig", "Data2VecVisionOnnxConfig"]
diff --git a/docs/transformers/build/lib/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ecc33355145577a81dc33b4f421a80559ed93ef
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py
@@ -0,0 +1,285 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert Wav2Vec2 checkpoint."""
+
+import argparse
+import os
+from functools import reduce
+
+import fairseq
+import torch
+from datasets import load_dataset
+
+from transformers import Wav2Vec2Processor, logging
+from transformers.models.data2vec.configuration_data2vec_audio import Data2VecAudioConfig
+
+# Copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py
+from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel as Dummy # noqa: F401
+from transformers.models.data2vec.modeling_data2vec_audio import Data2VecAudioForCTC, Data2VecAudioModel
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+MAPPING = {
+ "post_extract_proj": "feature_projection.projection",
+ "models.0.layer_norm": "feature_projection.layer_norm",
+ "self_attn.k_proj": "encoder.layers.*.attention.k_proj",
+ "self_attn.v_proj": "encoder.layers.*.attention.v_proj",
+ "self_attn.q_proj": "encoder.layers.*.attention.q_proj",
+ "self_attn.out_proj": "encoder.layers.*.attention.out_proj",
+ "self_attn_layer_norm": "encoder.layers.*.layer_norm",
+ "fc1": "encoder.layers.*.feed_forward.intermediate_dense",
+ "fc2": "encoder.layers.*.feed_forward.output_dense",
+ "final_layer_norm": "encoder.layers.*.final_layer_norm",
+ "encoder.layer_norm": "encoder.layer_norm",
+ "w2v_model.layer_norm": "feature_projection.layer_norm",
+ "w2v_encoder.proj": "lm_head",
+ "mask_emb": "masked_spec_embed",
+}
+TOP_LEVEL_KEYS = [
+ "lm_head",
+]
+
+
+def set_recursively(hf_pointer, key, value, full_name, weight_type):
+ for attribute in key.split("."):
+ hf_pointer = getattr(hf_pointer, attribute)
+
+ if weight_type is not None:
+ hf_shape = getattr(hf_pointer, weight_type).shape
+ else:
+ hf_shape = hf_pointer.shape
+
+ if hf_shape != value.shape:
+ raise ValueError(
+ f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
+ f" {value.shape} for {full_name}"
+ )
+
+ if weight_type == "weight":
+ hf_pointer.weight.data = value
+ elif weight_type == "weight_g":
+ hf_pointer.weight_g.data = value
+ elif weight_type == "weight_v":
+ hf_pointer.weight_v.data = value
+ elif weight_type == "bias":
+ hf_pointer.bias.data = value
+ else:
+ hf_pointer.data = value
+
+ logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
+
+
+def recursively_load_weights(fairseq_model, hf_model, is_headless):
+ unused_weights = []
+ fairseq_dict = fairseq_model.state_dict()
+
+ if not is_headless:
+ feature_extractor = hf_model.data2vec_audio.feature_extractor
+ pos_conv_embedding = hf_model.data2vec_audio.encoder.pos_conv_embed
+
+ else:
+ feature_extractor = hf_model.feature_extractor
+ pos_conv_embedding = hf_model.encoder.pos_conv_embed
+
+ for name, value in fairseq_dict.items():
+ is_used = False
+ if "conv_layers" in name:
+ load_conv_layer(
+ name,
+ value,
+ feature_extractor,
+ unused_weights,
+ )
+ is_used = True
+ elif "pos_conv" in name:
+ load_pos_conv_layer(
+ name,
+ value,
+ pos_conv_embedding,
+ unused_weights,
+ )
+ is_used = True
+ else:
+ for key, mapped_key in MAPPING.items():
+ if not is_headless:
+ mapped_key = "data2vec_audio." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key
+ if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
+ is_used = True
+ if "*" in mapped_key:
+ layer_index = name.split(key)[0].split(".")[-2]
+ mapped_key = mapped_key.replace("*", layer_index)
+ if "weight_g" in name:
+ weight_type = "weight_g"
+ elif "weight_v" in name:
+ weight_type = "weight_v"
+ elif "bias" in name:
+ weight_type = "bias"
+ elif "weight" in name:
+ # TODO: don't match quantizer.weight_proj
+ weight_type = "weight"
+ else:
+ weight_type = None
+ set_recursively(hf_model, mapped_key, value, name, weight_type)
+ continue
+ if not is_used:
+ unused_weights.append(name)
+
+ logger.warning(f"Unused weights: {unused_weights}")
+
+
+def access_by_string(module, path):
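+ # Resolves dotted attribute paths via getattr, e.g. (illustrative)
+ # access_by_string(model, "conv_layers.0.conv.weight") -> model.conv_layers[0].conv.weight;
+ # nn.ModuleList children are reachable by their string index.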
+ names = path.split(".")
+ return reduce(getattr, names, module)
+
+
+def set_weights(full_name, module, fsq_value, hf_weight_path):
+ hf_weight = access_by_string(module, hf_weight_path)
+ hf_value = hf_weight.data
+
+ if fsq_value.shape != hf_value.shape:
+ raise ValueError(f"{full_name} has size {fsq_value.shape}, but {hf_value.shape} was found.")
+ hf_weight.data = fsq_value
+ logger.info(f"{full_name} was correctly initialized from {hf_weight_path}.")
+
+
+def load_conv_layer(full_name, value, feature_extractor, unused_weights):
+ name = full_name.split("conv_layers.")[-1]
+ items = name.split(".")
+ layer_id = int(items[0])
+ type_id = int(items[1])
+
+ weight_type = name.split(".")[-1]
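+ # In the fairseq feature-extractor blocks, index 0 is the convolution and index 2 the layer
+ # norm; the sub-modules in between (dropout/activation) carry no weights, so any other
+ # index is reported as unused below.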
+ if type_id == 0:
+ layer_type = "conv"
+ elif type_id == 2:
+ layer_type = "layer_norm"
+ else:
+ unused_weights.append(full_name)
+ return
+
+ set_weights(full_name, feature_extractor, value, f"conv_layers.{layer_id}.{layer_type}.{weight_type}")
+
+
+def load_pos_conv_layer(full_name, value, pos_conv_embeddings, unused_weights):
+ name = full_name.split("pos_conv.")[-1]
+ items = name.split(".")
+ layer_id = int(items[0])
+ type_id = int(items[1])
+
+ weight_type = name.split(".")[-1]
+ if type_id != 0:
+ unused_weights.append(full_name)
+ return
+ else:
+ layer_type = "conv"
+
+ set_weights(full_name, pos_conv_embeddings, value, f"layers.{layer_id}.{layer_type}.{weight_type}")
+
+
+@torch.no_grad()
+def convert_wav2vec2_checkpoint(
+ checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
+):
+ """
+ Copy/paste/tweak model's weights to transformers design.
+ """
+ if config_path is not None:
+ config = Data2VecAudioConfig.from_pretrained(config_path)
+ else:
+ config = Data2VecAudioConfig()
+
+ if not is_finetuned:
+ # Modify final_proj layer name
+ hf_wav2vec = Data2VecAudioModel(config)
+ data2vec_checkpoint_dir = os.path.dirname(checkpoint_path)
+
+ state_dict = torch.load(checkpoint_path, weights_only=True)
+ state_dict["model"]["final_proj.weight"] = state_dict["model"].pop("final_proj.0.weight")
+ state_dict["model"]["final_proj.bias"] = state_dict["model"].pop("final_proj.0.bias")
+ converted_ckpt = os.path.join(data2vec_checkpoint_dir, "converted.pt")
+ torch.save(state_dict, converted_ckpt)
+ else:
+ hf_wav2vec = Data2VecAudioForCTC(config)
+ converted_ckpt = checkpoint_path
+
+ def load_data2vec(path):
+ model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([path])
+ return model[0].eval()
+
+ model = load_data2vec(converted_ckpt)
+
+ recursively_load_weights(model, hf_wav2vec, not is_finetuned)
+
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
+
+ ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
+ input_audio = [x["array"] for x in ds[:4]["audio"]]
+
+ inputs = processor(input_audio, return_tensors="pt", padding=True)
+
+ input_values = inputs.input_values
+ attention_mask = inputs.attention_mask
+ # input_values = inputs.input_values[:, :-1]
+ # attention_mask = inputs.attention_mask[:, :-1]
+
+ hf_wav2vec.eval()
+ model.eval()
+ if is_finetuned:
+ their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)[
+ "encoder_out"
+ ].transpose(0, 1)
+ our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["logits"]
+
+ pred_ids = torch.argmax(our_output, dim=-1)
+ output_string = processor.batch_decode(pred_ids)
+
+ print(f"Expected Output: {ds[:4]['text']}, Pred: {output_string}")
+ else:
+ their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)[
+ "layer_results"
+ ][-1][0].transpose(0, 1)
+ our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["last_hidden_state"]
+
+ print(our_output.shape, their_output.shape)
+ max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
+ print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7
+ success = torch.allclose(our_output, their_output, atol=1e-3)
+ print("Do both models output the same tensors?", "🔥" if success else "💩")
+ if not success:
+ raise Exception("Something went wRoNg")
+
+ hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
+
+ if is_finetuned:
+ processor.save_pretrained(pytorch_dump_folder_path)
+ else:
+ processor.feature_extractor.save_pretrained(pytorch_dump_folder_path)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
+ parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
+ parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
+ parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
+ parser.add_argument(
+ "--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
+ )
+ args = parser.parse_args()
+ convert_wav2vec2_checkpoint(
+ args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
+ )
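+ # Illustrative invocation (paths are placeholders):
+ # python convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py \
+ #     --checkpoint_path ./audio_base_ls.pt \
+ #     --pytorch_dump_folder_path ./data2vec-audio-base \
+ #     --not_finetuned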
diff --git a/docs/transformers/build/lib/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..93c9afe9f65bd914a725110f34650fb26b462694
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py
@@ -0,0 +1,207 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert data2vec checkpoint."""
+
+import argparse
+import os
+import pathlib
+
+import fairseq
+import torch
+from fairseq.modules import TransformerSentenceEncoderLayer
+from packaging import version
+
+from transformers import (
+ Data2VecTextConfig,
+ Data2VecTextForMaskedLM,
+ Data2VecTextForSequenceClassification,
+ Data2VecTextModel,
+)
+from transformers.models.bert.modeling_bert import (
+ BertIntermediate,
+ BertLayer,
+ BertOutput,
+ BertSelfAttention,
+ BertSelfOutput,
+)
+
+# IMPORTANT: In order for this script to run, please make sure to download the dictionary `dict.txt` from https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz (e.g. via wget)
+# File copied from https://github.com/pytorch/fairseq/blob/main/examples/data2vec/models/data2vec_text.py
+from transformers.utils import logging
+
+
+if version.parse(fairseq.__version__) < version.parse("0.9.0"):
+ raise Exception("requires fairseq >= 0.9.0")
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+SAMPLE_TEXT = "Hello world! cécé herlolip"
+
+
+def convert_data2vec_checkpoint_to_pytorch(
+ data2vec_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
+):
+ """
+ Copy/paste/tweak data2vec's weights to our BERT structure.
+ """
+ data2vec_checkpoint_dir, data2vec_checkpoint_file_name = os.path.split(data2vec_checkpoint_path)
+ data2vec = Data2VecTextModel.from_pretrained(
+ data2vec_checkpoint_dir, checkpoint_file=data2vec_checkpoint_file_name
+ )
+ data2vec.eval() # disable dropout
+ data2vec_model = data2vec.models[0]
+ data2vec_sent_encoder = data2vec_model.encoder.sentence_encoder
+ config = Data2VecTextConfig(
+ vocab_size=data2vec_sent_encoder.embed_tokens.num_embeddings,
+ hidden_size=data2vec_model.args.encoder_embed_dim,
+ num_hidden_layers=data2vec_model.args.encoder_layers,
+ num_attention_heads=data2vec_model.args.encoder_attention_heads,
+ intermediate_size=data2vec_model.args.encoder_ffn_embed_dim,
+ max_position_embeddings=514,
+ type_vocab_size=1,
+ layer_norm_eps=1e-5, # PyTorch default used in fairseq
+ )
+ if classification_head:
+ config.num_labels = data2vec.model.classification_heads["mnli"].out_proj.weight.shape[0]
+ print("Our BERT config:", config)
+
+ model = Data2VecTextForSequenceClassification(config) if classification_head else Data2VecTextForMaskedLM(config)
+ model.eval()
+
+ # Now let's copy all the weights.
+ # Embeddings
+ model.data2vec_text.embeddings.word_embeddings.weight = data2vec_sent_encoder.embed_tokens.weight
+ model.data2vec_text.embeddings.position_embeddings.weight = data2vec_sent_encoder.embed_positions.weight
+ model.data2vec_text.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
+ model.data2vec_text.embeddings.token_type_embeddings.weight
+ ) # just zero them out b/c data2vec doesn't use them.
+ model.data2vec_text.embeddings.LayerNorm.weight = data2vec_sent_encoder.layernorm_embedding.weight
+ model.data2vec_text.embeddings.LayerNorm.bias = data2vec_sent_encoder.layernorm_embedding.bias
+
+ for i in range(config.num_hidden_layers):
+ # Encoder: start of layer
+ layer: BertLayer = model.data2vec_text.encoder.layer[i]
+ data2vec_layer: TransformerSentenceEncoderLayer = data2vec_sent_encoder.layers[i]
+
+ # self attention
+ self_attn: BertSelfAttention = layer.attention.self
+ assert data2vec_layer.self_attn.k_proj.weight.data.shape == torch.Size(
+ (config.hidden_size, config.hidden_size)
+ ), (
+ "Shape for data2vec_layer.self_attn.k_proj.weight.data should be"
+ f" {torch.Size((config.hidden_size, config.hidden_size))}"
+ )
+ assert data2vec_layer.self_attn.q_proj.weight.data.shape == torch.Size(
+ (config.hidden_size, config.hidden_size)
+ ), (
+ "Shape for data2vec_layer.self_attn.q_proj.weight.data should be"
+ f" {torch.Size((config.hidden_size, config.hidden_size))}"
+ )
+ assert data2vec_layer.self_attn.v_proj.weight.data.shape == torch.Size(
+ (config.hidden_size, config.hidden_size)
+ ), (
+ "Shape for data2vec_layer.self_attn.v_proj.weight.data should be"
+ f" {torch.Size((config.hidden_size, config.hidden_size))}"
+ )
+
+ self_attn.query.weight.data = data2vec_layer.self_attn.q_proj.weight
+ self_attn.query.bias.data = data2vec_layer.self_attn.q_proj.bias
+ self_attn.key.weight.data = data2vec_layer.self_attn.k_proj.weight
+ self_attn.key.bias.data = data2vec_layer.self_attn.k_proj.bias
+ self_attn.value.weight.data = data2vec_layer.self_attn.v_proj.weight
+ self_attn.value.bias.data = data2vec_layer.self_attn.v_proj.bias
+
+ # self-attention output
+ self_output: BertSelfOutput = layer.attention.output
+ assert self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape, (
+ f"Shape for self_output.dense.weight should be {data2vec_layer.self_attn.out_proj.weight.shape}"
+ )
+ self_output.dense.weight = data2vec_layer.self_attn.out_proj.weight
+ self_output.dense.bias = data2vec_layer.self_attn.out_proj.bias
+ self_output.LayerNorm.weight = data2vec_layer.self_attn_layer_norm.weight
+ self_output.LayerNorm.bias = data2vec_layer.self_attn_layer_norm.bias
+
+ # intermediate
+ intermediate: BertIntermediate = layer.intermediate
+ assert intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape, (
+ f"Shape for intermediate.dense.weight should be {data2vec_layer.fc1.weight.shape}"
+ )
+ intermediate.dense.weight = data2vec_layer.fc1.weight
+ intermediate.dense.bias = data2vec_layer.fc1.bias
+
+ # output
+ bert_output: BertOutput = layer.output
+ assert bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape, (
+ f"Shape for bert_output.dense.weight should be {data2vec_layer.fc2.weight.shape}"
+ )
+ bert_output.dense.weight = data2vec_layer.fc2.weight
+ bert_output.dense.bias = data2vec_layer.fc2.bias
+ bert_output.LayerNorm.weight = data2vec_layer.final_layer_norm.weight
+ bert_output.LayerNorm.bias = data2vec_layer.final_layer_norm.bias
+ # end of layer
+
+ if classification_head:
+ model.classifier.dense.weight = data2vec.model.classification_heads["mnli"].dense.weight
+ model.classifier.dense.bias = data2vec.model.classification_heads["mnli"].dense.bias
+ model.classifier.out_proj.weight = data2vec.model.classification_heads["mnli"].out_proj.weight
+ model.classifier.out_proj.bias = data2vec.model.classification_heads["mnli"].out_proj.bias
+ else:
+ # LM Head
+ model.lm_head.dense.weight = data2vec_model.encoder.lm_head.dense.weight
+ model.lm_head.dense.bias = data2vec_model.encoder.lm_head.dense.bias
+ model.lm_head.layer_norm.weight = data2vec_model.encoder.lm_head.layer_norm.weight
+ model.lm_head.layer_norm.bias = data2vec_model.encoder.lm_head.layer_norm.bias
+ model.lm_head.decoder.weight = data2vec_model.encoder.lm_head.weight
+ model.lm_head.decoder.bias = data2vec_model.encoder.lm_head.bias
+
+ # Let's check that we get the same results.
+ input_ids: torch.Tensor = data2vec.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1
+
+ our_output = model(input_ids)[0]
+ if classification_head:
+ their_output = data2vec.model.classification_heads["mnli"](data2vec.extract_features(input_ids))
+ else:
+ their_output = data2vec_model(input_ids)[0]
+ print(our_output.shape, their_output.shape)
+ max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
+ print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7
+ success = torch.allclose(our_output, their_output, atol=1e-3)
+ print("Do both models output the same tensors?", "🔥" if success else "💩")
+ if not success:
+ raise Exception("Something went wRoNg")
+
+ pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
+ print(f"Saving model to {pytorch_dump_folder_path}")
+ model.save_pretrained(pytorch_dump_folder_path)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ # Required parameters
+ parser.add_argument(
+ "--checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
+ )
+ parser.add_argument(
+ "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+ )
+ parser.add_argument(
+ "--classification_head", action="store_true", help="Whether to convert a final classification head."
+ )
+ args = parser.parse_args()
+ convert_data2vec_checkpoint_to_pytorch(
+ args.checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
+ )
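+ # Illustrative invocation (paths are placeholders; see the dictionary note near the imports):
+ # python convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py \
+ #     --checkpoint_path ./nlp_base.pt \
+ #     --pytorch_dump_folder_path ./data2vec-text-base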
diff --git a/docs/transformers/build/lib/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f9d7773516aa68c4d0b789062a3e9bd6abd411a
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py
@@ -0,0 +1,374 @@
+#!/usr/bin/env python3
+import argparse
+import json
+
+import torch
+from huggingface_hub import hf_hub_download
+from PIL import Image
+from timm.models import create_model
+
+from transformers import (
+ BeitImageProcessor,
+ Data2VecVisionConfig,
+ Data2VecVisionForImageClassification,
+ Data2VecVisionModel,
+)
+
+
+def create_rename_keys(config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec."):
+ prefix = "backbone." if is_semantic else ""
+
+ rename_keys = []
+ for i in range(config.num_hidden_layers):
+ # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms
+ rename_keys.append(
+ (f"{prefix}blocks.{i}.norm1.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_before.weight")
+ )
+ rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_before.bias"))
+ rename_keys.append(
+ (f"{prefix}blocks.{i}.attn.proj.weight", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.weight")
+ )
+ rename_keys.append(
+ (f"{prefix}blocks.{i}.attn.proj.bias", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.bias")
+ )
+ rename_keys.append(
+ (f"{prefix}blocks.{i}.norm2.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_after.weight")
+ )
+ rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_after.bias"))
+ rename_keys.append(
+ (f"{prefix}blocks.{i}.mlp.fc1.weight", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.weight")
+ )
+ rename_keys.append(
+ (f"{prefix}blocks.{i}.mlp.fc1.bias", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.bias")
+ )
+ rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"{hf_prefix}encoder.layer.{i}.output.dense.weight"))
+ rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"{hf_prefix}encoder.layer.{i}.output.dense.bias"))
+
+ # projection layer + position embeddings
+ rename_keys.extend(
+ [
+ (f"{prefix}cls_token", f"{hf_prefix}embeddings.cls_token"),
+ (f"{prefix}patch_embed.proj.weight", f"{hf_prefix}embeddings.patch_embeddings.projection.weight"),
+ (f"{prefix}patch_embed.proj.bias", f"{hf_prefix}embeddings.patch_embeddings.projection.bias"),
+ ]
+ )
+
+ if has_lm_head:
+ # mask token + shared relative position bias + layernorm
+ rename_keys.extend(
+ [
+ ("mask_token", f"{hf_prefix}embeddings.mask_token"),
+ (
+ "rel_pos_bias.relative_position_bias_table",
+ f"{hf_prefix}encoder.relative_position_bias.relative_position_bias_table",
+ ),
+ (
+ "rel_pos_bias.relative_position_index",
+ f"{hf_prefix}encoder.relative_position_bias.relative_position_index",
+ ),
+ ("norm.weight", "layernorm.weight"),
+ ("norm.bias", "layernorm.bias"),
+ ]
+ )
+ elif is_semantic:
+ # semantic segmentation classification heads
+ rename_keys.extend(
+ [
+ ("decode_head.conv_seg.weight", "decode_head.classifier.weight"),
+ ("decode_head.conv_seg.bias", "decode_head.classifier.bias"),
+ ("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"),
+ ("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"),
+ ]
+ )
+ else:
+ # layernorm + classification head
+ rename_keys.extend(
+ [
+ ("fc_norm.weight", f"{hf_prefix}pooler.layernorm.weight"),
+ ("fc_norm.bias", f"{hf_prefix}pooler.layernorm.bias"),
+ ("head.weight", "classifier.weight"),
+ ("head.bias", "classifier.bias"),
+ ]
+ )
+
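+ # Illustrative pairs produced above (pretraining case, i.e. has_lm_head=True and hf_prefix=""):
+ #   ("blocks.0.attn.proj.weight", "encoder.layer.0.attention.output.dense.weight")
+ #   ("mask_token", "embeddings.mask_token")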
+ return rename_keys
+
+
+def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec_vision."):
+ for i in range(config.num_hidden_layers):
+ prefix = "backbone." if is_semantic else ""
+ # queries, keys and values
+ in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight")
+ q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias")
+ v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias")
+
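+ # The fused qkv weight is split row-wise: rows [0, hidden) -> query, [hidden, 2*hidden) -> key,
+ # and [2*hidden, 3*hidden) -> value. The original checkpoint stores only q_bias and v_bias,
+ # so no key bias is read in.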
+ state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[
+ : config.hidden_size, :
+ ]
+ state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.bias"] = q_bias
+ state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
+ config.hidden_size : config.hidden_size * 2, :
+ ]
+ state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
+ -config.hidden_size :, :
+ ]
+ state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.bias"] = v_bias
+
+ # gamma_1 and gamma_2
+ # we call them lambda because otherwise they are renamed when using .from_pretrained
+ gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1")
+ gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2")
+
+ state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_1"] = gamma_1
+ state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_2"] = gamma_2
+
+ # relative_position bias table + index
+ if not has_lm_head:
+ # each layer has its own relative position bias
+ table = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_bias_table")
+ index = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_index")
+
+ state_dict[
+ f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table"
+ ] = table
+ state_dict[
+ f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index"
+ ] = index
+
+
+def get_args():
+ parser = argparse.ArgumentParser(
+ "Convert Data2VecVision to HF for image classification and pretraining", add_help=False
+ )
+ parser.add_argument("--hf_checkpoint_name", type=str)
+ parser.add_argument("--input_size", default=224, type=int, help="images input size")
+ parser.add_argument("--beit_checkpoint", default="", help="beit checkpoint")
+
+ return parser.parse_args()
+
+
+def load_beit_model(args, is_finetuned, is_large):
+ def load_state_dict(model, state_dict, prefix="", ignore_missing="relative_position_index"):
+ missing_keys = []
+ unexpected_keys = []
+ error_msgs = []
+ # copy state_dict so _load_from_state_dict can modify it
+ metadata = getattr(state_dict, "_metadata", None)
+ state_dict = state_dict.copy()
+ if metadata is not None:
+ state_dict._metadata = metadata
+
+ def load(module, prefix=""):
+ local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+ module._load_from_state_dict(
+ state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
+ )
+ for name, child in module._modules.items():
+ if child is not None:
+ load(child, prefix + name + ".")
+
+ load(model, prefix=prefix)
+
+ warn_missing_keys = []
+ ignore_missing_keys = []
+ for key in missing_keys:
+ keep_flag = True
+ for ignore_key in ignore_missing.split("|"):
+ if ignore_key in key:
+ keep_flag = False
+ break
+ if keep_flag:
+ warn_missing_keys.append(key)
+ else:
+ ignore_missing_keys.append(key)
+
+ missing_keys = warn_missing_keys
+
+ if len(missing_keys) > 0:
+ print(
+ "Weights of {} not initialized from pretrained model: {}".format(
+ model.__class__.__name__, missing_keys
+ )
+ )
+ if len(unexpected_keys) > 0:
+ print("Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys))
+ if len(ignore_missing_keys) > 0:
+ print(
+ "Ignored weights of {} not initialized from pretrained model: {}".format(
+ model.__class__.__name__, ignore_missing_keys
+ )
+ )
+ if len(error_msgs) > 0:
+ print("\n".join(error_msgs))
+
+ model_kwargs = {
+ "pretrained": False,
+ "use_shared_rel_pos_bias": True,
+ "use_abs_pos_emb": False,
+ "init_values": 0.1,
+ }
+
+ if is_finetuned:
+ model_kwargs.update(
+ {
+ "num_classes": 1000,
+ "use_mean_pooling": True,
+ "init_scale": 0.001,
+ "use_rel_pos_bias": True,
+ }
+ )
+
+ model = create_model(
+ "beit_large_patch16_224" if is_large else "beit_base_patch16_224",
+ **model_kwargs,
+ )
+ patch_size = model.patch_embed.patch_size
+ args.window_size = (args.input_size // patch_size[0], args.input_size // patch_size[1])
+ checkpoint = torch.load(args.beit_checkpoint, map_location="cpu", weights_only=True)
+
+ print(f"Load ckpt from {args.beit_checkpoint}")
+ checkpoint_model = None
+ for model_key in ("model", "module"):
+ if model_key in checkpoint:
+ checkpoint_model = checkpoint[model_key]
+ print(f"Load state_dict by model_key = {model_key}")
+ break
+
+ all_keys = list(checkpoint_model.keys())
+ for key in all_keys:
+ if "relative_position_index" in key:
+ checkpoint_model.pop(key)
+
+ if "relative_position_bias_table" in key:
+ rel_pos_bias = checkpoint_model[key]
+ src_num_pos, num_attn_heads = rel_pos_bias.size()
+ dst_num_pos, _ = model.state_dict()[key].size()
+ dst_patch_shape = model.patch_embed.patch_shape
+ if dst_patch_shape[0] != dst_patch_shape[1]:
+ raise NotImplementedError()
+
+ load_state_dict(model, checkpoint_model, prefix="")
+
+ return model
+
+
+def main():
+ args = get_args()
+
+ is_finetuned = "ft1k" in args.hf_checkpoint_name
+ is_large = "large" in args.hf_checkpoint_name
+
+ if is_finetuned:
+ # To convert Beit's data2vec_vision to HF you need to copy
+ # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_finetune.py
+ # into this folder.
+ import modeling_finetune # noqa: F401
+ else:
+ # To convert Beit's data2vec_vision to HF you need to copy
+ # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_cyclical.py
+ # into this folder
+ # IMPORTANT: Note that for now we've only converted the down-stream
+ # model and not the full pretrained model. This means for the integration
+ # test you need to add a `return x` after the following line:
+ # https://github.com/facebookresearch/data2vec_vision/blob/af9a36349aaed59ae66e69b5dabeef2d62fdc5da/beit/modeling_cyclical.py#L197
+ # to make the integration test pass.
+ import modeling_cyclical # noqa: F401
+
+ # 1. Create model config
+ config = Data2VecVisionConfig()
+ if is_finetuned:
+ config.use_relative_position_bias = True
+ config.use_shared_relative_position_bias = False
+ config.use_mean_pooling = True
+ config.num_labels = 1000
+
+ repo_id = "huggingface/label-files"
+ filename = "imagenet-1k-id2label.json"
+ id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
+ id2label = {int(k): v for k, v in id2label.items()}
+ config.id2label = id2label
+ config.label2id = {v: k for k, v in id2label.items()}
+ else:
+ config.use_relative_position_bias = False
+ config.use_shared_relative_position_bias = True
+ config.use_mean_pooling = False
+
+ if is_large:
+ config.hidden_size = 1024
+ config.intermediate_size = 4096
+ config.num_hidden_layers = 24
+ config.num_attention_heads = 16
+
+ # 2. Load Beit model
+ orig_model = load_beit_model(args, is_finetuned, is_large)
+ orig_model.eval()
+
+ # 3. Forward Beit model
+ image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
+ image = Image.open("../../../../tests/fixtures/tests_samples/COCO/000000039769.png")
+ encoding = image_processor(images=image, return_tensors="pt")
+ pixel_values = encoding["pixel_values"]
+
+ orig_args = (pixel_values,) if is_finetuned else (pixel_values, None)
+ with torch.no_grad():
+ orig_model_output = orig_model(*orig_args)
+
+ # 4. Load HF Data2VecVision model
+ if is_finetuned:
+ hf_model = Data2VecVisionForImageClassification(config)
+ hf_model.eval()
+ has_lm_head = False
+ hf_prefix = "data2vec_vision."
+ else:
+ hf_model = Data2VecVisionModel(config)
+ hf_model.eval()
+ has_lm_head = True
+ hf_prefix = ""
+
+ rename_keys = create_rename_keys(config, hf_prefix=hf_prefix, has_lm_head=has_lm_head)
+ state_dict = orig_model.state_dict()
+ for src, dest in rename_keys:
+ val = state_dict.pop(src)
+ state_dict[dest] = val
+
+ read_in_q_k_v(state_dict, config, hf_prefix=hf_prefix, has_lm_head=has_lm_head)
+ missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False)
+ print("HF missing", missing_keys)
+ print("HF unexpected_keys", unexpected_keys)
+
+ # 5. Forward HF Data2VecVision model
+ with torch.no_grad():
+ hf_model_output = hf_model(pixel_values)
+
+ hf_output = hf_model_output.logits if is_finetuned else hf_model_output.last_hidden_state
+
+ # 6. Compare
+ max_absolute_diff = torch.max(torch.abs(hf_output - orig_model_output)).item()
+
+ print(f"max_absolute_diff = {max_absolute_diff}")
+ success = torch.allclose(hf_output, orig_model_output, atol=1e-3)
+ print("Do both models output the same tensors?", "🔥" if success else "💩")
+ if not success:
+ raise Exception("Something went wRoNg")
+
+ # 7. Save
+ print(f"Saving to {args.hf_checkpoint_name}")
+ hf_model.save_pretrained(args.hf_checkpoint_name)
+ image_processor.save_pretrained(args.hf_checkpoint_name)
+
+
+if __name__ == "__main__":
+ main()
+ # Run the following to convert checkpoints
+ # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \
+ # --beit_checkpoint ./pretrained_base.pt \
+ # --hf_checkpoint_name "./data2vec-vision-base"
+ # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \
+ # --beit_checkpoint ./finetuned_base.pt \
+ # --hf_checkpoint_name "./data2vec-vision-base-ft1k"
+ # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \
+ # --beit_checkpoint ./pretrained_large.pt \
+ # --hf_checkpoint_name "./data2vec-vision-large"
+ # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \
+ # --beit_checkpoint ./finetuned_large.pt \
+ # --hf_checkpoint_name "./data2vec-vision-large-ft1k"
diff --git a/docs/transformers/build/lib/transformers/models/data2vec/modeling_data2vec_audio.py b/docs/transformers/build/lib/transformers/models/data2vec/modeling_data2vec_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f799e812970335de33110a9be8a648db8303b04
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/data2vec/modeling_data2vec_audio.py
@@ -0,0 +1,1746 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/data2vec/modular_data2vec_audio.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_data2vec_audio.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import math
+import warnings
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
+from ...modeling_outputs import (
+ BaseModelOutput,
+ CausalLMOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+ Wav2Vec2BaseModelOutput,
+ XVectorOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ is_peft_available,
+ logging,
+)
+from .configuration_data2vec_audio import Data2VecAudioConfig
+
+
+if is_flash_attn_available():
+ from ...modeling_flash_attention_utils import _flash_attention_forward
+
+
+logger = logging.get_logger(__name__)
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "facebook/data2vec-audio-base-960h"
+
+# General docstring
+_CONFIG_FOR_DOC = "Data2VecAudioConfig"
+
+
+class Data2VecAudioConvLayer(nn.Module):
+ def __init__(self, config, layer_id=0):
+ super().__init__()
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+ self.out_conv_dim = config.conv_dim[layer_id]
+
+ self.conv = nn.Conv1d(
+ self.in_conv_dim,
+ self.out_conv_dim,
+ kernel_size=config.conv_kernel[layer_id],
+ stride=config.conv_stride[layer_id],
+ bias=config.conv_bias,
+ )
+ self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
+ self.activation = ACT2FN[config.feat_extract_activation]
+
+ def forward(self, hidden_states):
+ hidden_states = self.conv(hidden_states)
+
+ hidden_states = hidden_states.transpose(-2, -1)
+ hidden_states = self.layer_norm(hidden_states)
+ hidden_states = hidden_states.transpose(-2, -1)
+
+ hidden_states = self.activation(hidden_states)
+ return hidden_states
+
+
+class Data2VecAudioPadLayer(nn.Module):
+ def __init__(self, num_conv_pos_embeddings):
+ super().__init__()
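+ # With an even positional-conv kernel, the symmetric padding yields one frame too many,
+ # which the forward pass trims from the end.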
+ self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
+
+ def forward(self, hidden_states):
+ if self.num_pad_remove > 0:
+ hidden_states = hidden_states[:, :, : -self.num_pad_remove]
+ return hidden_states
+
+
+class Data2VecAudioPositionalConvLayer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.conv = nn.Conv1d(
+ config.hidden_size,
+ config.hidden_size,
+ kernel_size=config.conv_pos_kernel_size,
+ padding=config.conv_pos_kernel_size // 2,
+ groups=config.num_conv_pos_embedding_groups,
+ )
+
+ self.padding = Data2VecAudioPadLayer(config.conv_pos_kernel_size)
+ self.activation = ACT2FN[config.feat_extract_activation]
+ # no learnable parameters
+ self.layer_norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False)
+
+ def forward(self, hidden_states):
+ hidden_states = self.conv(hidden_states)
+ hidden_states = self.padding(hidden_states)
+
+ hidden_states = hidden_states.transpose(1, 2)
+ hidden_states = self.layer_norm(hidden_states)
+ hidden_states = hidden_states.transpose(1, 2)
+ hidden_states = self.activation(hidden_states)
+ return hidden_states
+
+
+class Data2VecAudioPositionalConvEmbedding(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.layers = nn.ModuleList(
+ [Data2VecAudioPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)]
+ )
+
+ def forward(self, hidden_states):
+ hidden_states = hidden_states.transpose(1, 2)
+ for layer in self.layers:
+ hidden_states = layer(hidden_states)
+ hidden_states = hidden_states.transpose(1, 2)
+ return hidden_states
+
+
+class Data2VecAudioFeatureEncoder(nn.Module):
+ """Construct the features from raw audio waveform"""
+
+ def __init__(self, config):
+ super().__init__()
+ self.conv_layers = nn.ModuleList(
+ [Data2VecAudioConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
+ )
+ self.gradient_checkpointing = False
+ self._requires_grad = True
+
+ def _freeze_parameters(self):
+ for param in self.parameters():
+ param.requires_grad = False
+ self._requires_grad = False
+
+ def forward(self, input_values):
+ hidden_states = input_values[:, None]
+
+ # make sure hidden_states require grad for gradient_checkpointing
+ if self._requires_grad and self.training:
+ hidden_states.requires_grad = True
+
+ for conv_layer in self.conv_layers:
+ if self._requires_grad and self.gradient_checkpointing and self.training:
+ hidden_states = self._gradient_checkpointing_func(
+ conv_layer.__call__,
+ hidden_states,
+ )
+ else:
+ hidden_states = conv_layer(hidden_states)
+
+ return hidden_states
+
+
+class Data2VecAudioFeatureProjection(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
+ self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
+ self.dropout = nn.Dropout(config.feat_proj_dropout)
+
+ def forward(self, hidden_states):
+ # non-projected hidden states are needed for quantization
+ norm_hidden_states = self.layer_norm(hidden_states)
+ hidden_states = self.projection(norm_hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ return hidden_states, norm_hidden_states
+
+
+class Data2VecAudioAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(
+ self,
+ embed_dim: int,
+ num_heads: int,
+ dropout: float = 0.0,
+ is_decoder: bool = False,
+ bias: bool = True,
+ is_causal: bool = False,
+ config: Optional[Data2VecAudioConfig] = None,
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = embed_dim // num_heads
+ self.config = config
+
+ if (self.head_dim * num_heads) != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+ f" and `num_heads`: {num_heads})."
+ )
+ self.scaling = self.head_dim**-0.5
+ self.is_decoder = is_decoder
+ self.is_causal = is_causal
+
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
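+ # (bsz, seq_len, embed_dim) -> (bsz, num_heads, seq_len, head_dim)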
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ key_value_states: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ layer_head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+
+ # if key_value_states are provided this layer is used as a cross-attention layer
+ # for the decoder
+ is_cross_attention = key_value_states is not None
+
+ bsz, tgt_len, _ = hidden_states.size()
+
+ # get query proj
+ query_states = self.q_proj(hidden_states) * self.scaling
+ # get key, value proj
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
+ # the provided `key_value_states` to support prefix tuning
+ if (
+ is_cross_attention
+ and past_key_value is not None
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
+ ):
+ # reuse k,v, cross_attentions
+ key_states = past_key_value[0]
+ value_states = past_key_value[1]
+ elif is_cross_attention:
+ # cross_attentions
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+ elif past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+ else:
+ # self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
+ past_key_value = (key_states, value_states)
+
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+ key_states = key_states.reshape(*proj_shape)
+ value_states = value_states.reshape(*proj_shape)
+
+ src_len = key_states.size(1)
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+ if layer_head_mask is not None:
+ if layer_head_mask.size() != (self.num_heads,):
+ raise ValueError(
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+ f" {layer_head_mask.size()}"
+ )
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ if output_attentions:
+ # this operation is a bit awkward, but it's required to
+ # make sure that attn_weights keeps its gradient.
+ # In order to do so, attn_weights have to be reshaped
+ # twice and have to be reused in the following
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+ else:
+ attn_weights_reshaped = None
+
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+ attn_output = torch.bmm(attn_probs, value_states)
+
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+ attn_output = attn_output.transpose(1, 2)
+
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+ # partitioned across GPUs when using tensor-parallelism.
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+ attn_output = self.out_proj(attn_output)
+
+ return attn_output, attn_weights_reshaped, past_key_value
+
+
+class Data2VecAudioFlashAttention2(Data2VecAudioAttention):
+ """
+ Data2VecAudio flash attention module. This module inherits from `Data2VecAudioAttention`, as the weights of the module stay
+ untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
+ flash attention and deal with padding tokens in case the input contains any of them.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+ self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
+
+ def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ key_value_states: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ layer_head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ # Data2VecAudioFlashAttention2 attention does not support output_attentions
+ if output_attentions:
+ raise ValueError("Data2VecAudioFlashAttention2 attention does not support output_attentions")
+
+ # if key_value_states are provided this layer is used as a cross-attention layer
+ # for the decoder
+ is_cross_attention = key_value_states is not None
+
+ bsz, q_len, _ = hidden_states.size()
+
+ # get query proj
+ query_states = self._reshape(self.q_proj(hidden_states), -1, bsz)
+ # get key, value proj
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
+ # the provided `key_value_states` to support prefix tuning
+ if (
+ is_cross_attention
+ and past_key_value is not None
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
+ ):
+ # reuse k,v, cross_attentions
+ key_states = past_key_value[0].transpose(1, 2)
+ value_states = past_key_value[1].transpose(1, 2)
+ elif is_cross_attention:
+ # cross_attentions
+ key_states = self._reshape(self.k_proj(key_value_states), -1, bsz)
+ value_states = self._reshape(self.v_proj(key_value_states), -1, bsz)
+ elif past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
+ key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1)
+ value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1)
+ else:
+ # self_attention
+ key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
+ past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+
+ # In PEFT, the layer norms are usually cast to float32 for training stability reasons,
+ # so the input hidden states get silently cast to float32. Hence, we need to
+ # cast them back to the correct dtype just to be sure everything works as expected.
+ # This might slow down training & inference, so it is recommended not to cast the LayerNorms
+ # to fp32. (LlamaRMSNorm handles it correctly.)
+
+ input_dtype = query_states.dtype
+ if input_dtype == torch.float32:
+ if torch.is_autocast_enabled():
+ target_dtype = torch.get_autocast_gpu_dtype()
+ # Handle the case where the model is quantized
+ elif hasattr(self.config, "_pre_quantization_dtype"):
+ target_dtype = self.config._pre_quantization_dtype
+ else:
+ target_dtype = self.q_proj.weight.dtype
+
+ logger.warning_once(
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+ f" {target_dtype}."
+ )
+
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+
+ attn_output = _flash_attention_forward(
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ q_len,
+ dropout=self.dropout if self.training else 0.0,
+ is_causal=self.is_causal,
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
+ )
+
+ attn_output = attn_output.reshape(bsz, q_len, -1)
+ attn_output = self.out_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+class Data2VecAudioSdpaAttention(Data2VecAudioAttention):
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ key_value_states: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ layer_head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+ if output_attentions or layer_head_mask is not None:
+ # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
+ logger.warning_once(
+ "Data2VecAudioModel is using Data2VecAudioSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
+ ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ return super().forward(
+ hidden_states,
+ key_value_states=key_value_states,
+ past_key_value=past_key_value,
+ attention_mask=attention_mask,
+ layer_head_mask=layer_head_mask,
+ output_attentions=output_attentions,
+ )
+
+ # if key_value_states are provided this layer is used as a cross-attention layer
+ # for the decoder
+ is_cross_attention = key_value_states is not None
+
+ bsz, tgt_len, _ = hidden_states.size()
+
+ # get query proj
+ query_states = self.q_proj(hidden_states)
+ # get key, value proj
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
+ # the provided `key_value_states` to support prefix tuning
+ if (
+ is_cross_attention
+ and past_key_value is not None
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
+ ):
+ # reuse k,v, cross_attentions
+ key_states = past_key_value[0]
+ value_states = past_key_value[1]
+ elif is_cross_attention:
+ # cross_attentions
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+ elif past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+ else:
+ # self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
+ past_key_value = (key_states, value_states)
+
+ query_states = self._shape(query_states, tgt_len, bsz)
+
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+ is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
+
+ # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
+ # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
+ query_states,
+ key_states,
+ value_states,
+ attn_mask=attention_mask,
+ dropout_p=self.dropout if self.training else 0.0,
+ is_causal=is_causal,
+ )
+
+ if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2)
+
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+ # partitioned across GPUs when using tensor-parallelism.
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+ attn_output = self.out_proj(attn_output)
+
+ return attn_output, None, past_key_value
+
+
+class Data2VecAudioFeedForward(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.intermediate_dropout = nn.Dropout(config.activation_dropout)
+
+ self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.output_dropout = nn.Dropout(config.hidden_dropout)
+
+ def forward(self, hidden_states):
+ hidden_states = self.intermediate_dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ hidden_states = self.intermediate_dropout(hidden_states)
+
+ hidden_states = self.output_dense(hidden_states)
+ hidden_states = self.output_dropout(hidden_states)
+ return hidden_states
+
+
+DATA2VEC_AUDIO_ATTENTION_CLASSES = {
+ "eager": Data2VecAudioAttention,
+ "sdpa": Data2VecAudioSdpaAttention,
+ "flash_attention_2": Data2VecAudioFlashAttention2,
+}
+
+
+class Data2VecAudioEncoderLayer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.attention = DATA2VEC_AUDIO_ATTENTION_CLASSES[config._attn_implementation](
+ embed_dim=config.hidden_size,
+ num_heads=config.num_attention_heads,
+ dropout=config.attention_dropout,
+ is_decoder=False,
+ )
+
+ self.dropout = nn.Dropout(config.hidden_dropout)
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.feed_forward = Data2VecAudioFeedForward(config)
+ self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ def forward(self, hidden_states, attention_mask=None, output_attentions=False):
+ attn_residual = hidden_states
+ hidden_states, attn_weights, _ = self.attention(
+ hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+ )
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = attn_residual + hidden_states
+
+ hidden_states = self.layer_norm(hidden_states)
+ hidden_states = hidden_states + self.feed_forward(hidden_states)
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
+
+class Data2VecAudioEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.pos_conv_embed = Data2VecAudioPositionalConvEmbedding(config)
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout)
+ self.layers = nn.ModuleList([Data2VecAudioEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+
+ def forward(
+ self,
+ hidden_states: torch.tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ output_hidden_states: bool = False,
+ return_dict: bool = True,
+ ):
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ if attention_mask is not None:
+ # make sure padded tokens output 0
+ expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+ hidden_states[~expand_attention_mask] = 0
+ if self._use_flash_attention_2:
+ # 2d mask is passed through the layers
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+ else:
+ # extend attention_mask
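+ # padded positions receive a large negative additive bias so softmax assigns them ~0 weight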
+ attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
+ attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
+ attention_mask = attention_mask.expand(
+ attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
+ )
+
+ position_embeddings = self.pos_conv_embed(hidden_states)
+ hidden_states = hidden_states + position_embeddings
+ hidden_states = self.layer_norm(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+ for layer in self.layers:
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+ dropout_probability = torch.rand([])
+
+ skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
+ if not skip_the_layer or synced_gpus:
+ # under fsdp or deepspeed zero3 all gpus must run in sync
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ layer.__call__,
+ hidden_states,
+ attention_mask,
+ output_attentions,
+ )
+ else:
+ layer_outputs = layer(
+ hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+ )
+ hidden_states = layer_outputs[0]
+
+ if skip_the_layer:
+ layer_outputs = (None, None)
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+class Data2VecAudioAdapterLayer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.conv = nn.Conv1d(
+ config.output_hidden_size,
+ 2 * config.output_hidden_size,
+ config.adapter_kernel_size,
+ stride=config.adapter_stride,
+ padding=1,
+ )
+
+ def forward(self, hidden_states):
+ hidden_states = self.conv(hidden_states)
+ hidden_states = nn.functional.glu(hidden_states, dim=1)
+
+ return hidden_states
+
+
+class Data2VecAudioAdapter(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+
+ # feature dim might need to be down-projected
+ if config.output_hidden_size != config.hidden_size:
+ self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
+ self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size)
+ else:
+ self.proj = self.proj_layer_norm = None
+
+ self.layers = nn.ModuleList(Data2VecAudioAdapterLayer(config) for _ in range(config.num_adapter_layers))
+ self.layerdrop = config.layerdrop
+
+ def forward(self, hidden_states):
+ # down project hidden_states if necessary
+ if self.proj is not None and self.proj_layer_norm is not None:
+ hidden_states = self.proj(hidden_states)
+ hidden_states = self.proj_layer_norm(hidden_states)
+
+ hidden_states = hidden_states.transpose(1, 2)
+
+ for layer in self.layers:
+ layerdrop_prob = np.random.random()
+ if not self.training or (layerdrop_prob > self.layerdrop):
+ hidden_states = layer(hidden_states)
+
+ hidden_states = hidden_states.transpose(1, 2)
+ return hidden_states
+
+
+class Data2VecAudioPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = Data2VecAudioConfig
+ base_model_prefix = "data2vec_audio"
+ main_input_name = "input_values"
+ supports_gradient_checkpointing = True
+ _supports_flash_attn_2 = True
+ _supports_sdpa = True
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, Data2VecAudioFeatureProjection):
+ k = math.sqrt(1 / module.projection.in_features)
+ nn.init.uniform_(module.projection.weight, a=-k, b=k)
+ nn.init.uniform_(module.projection.bias, a=-k, b=k)
+ elif isinstance(module, Data2VecAudioPositionalConvLayer):
+ nn.init.constant_(module.conv.bias, 0)
+ elif isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+ if module.bias is not None:
+ module.bias.data.zero_()
+ if module.weight is not None:
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, nn.Conv1d):
+ nn.init.kaiming_normal_(module.weight)
+
+ if module.bias is not None:
+ k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+ nn.init.uniform_(module.bias, a=-k, b=k)
+
+ def _get_feat_extract_output_lengths(
+ self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
+ ):
+ """
+ Computes the output length of the convolutional layers
+ """
+
+ add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
+
+ def _conv_out_length(input_length, kernel_size, stride):
+ # 1D convolutional layer output length formula taken
+ # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+ return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
+
+ for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
+ input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+
+ if add_adapter:
+ for _ in range(self.config.num_adapter_layers):
+ input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
+
+ return input_lengths
+
+ def _get_feature_vector_attention_mask(
+ self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
+ ):
+ # Effectively attention_mask.sum(-1), but not inplace to be able to run
+ # in inference mode.
+ non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+
+ output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
+ output_lengths = output_lengths.to(torch.long)
+
+ batch_size = attention_mask.shape[0]
+
+ attention_mask = torch.zeros(
+ (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+ )
+ # these two operations make sure that all values before the output length indices are attended to
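+ # a 1 is scattered at the last valid frame of each sequence; the reversed cumulative sum then
+ # propagates it backwards so every position up to and including that frame becomes True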
+ attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
+ attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+ return attention_mask
+
+
+def _compute_mask_indices(
+ shape: Tuple[int, int],
+ mask_prob: float,
+ mask_length: int,
+ attention_mask: Optional[torch.LongTensor] = None,
+ min_masks: int = 0,
+) -> np.ndarray:
+ """
+ Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+ ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
+ CPU as part of the preprocessing during training.
+
+ Args:
+ shape: The shape for which to compute masks. This should be a tuple of size 2 where
+ the first element is the batch size and the second element is the length of the axis to span.
+ mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
+ independently generated mask spans of length `mask_length` is computed by
+ `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
+ actual percentage will be smaller.
+ mask_length: size of the mask
+ min_masks: minimum number of masked spans
+ attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
+ each batch dimension.
+ """
+ batch_size, sequence_length = shape
+
+ if mask_length < 1:
+ raise ValueError("`mask_length` has to be bigger than 0.")
+
+ if mask_length > sequence_length:
+ raise ValueError(
+ f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
+ f" and `sequence_length`: {sequence_length}`"
+ )
+
+ # epsilon is used for probabilistic rounding
+ epsilon = np.random.rand(1).item()
+
+ def compute_num_masked_span(input_length):
+ """Given input length, compute how many spans should be masked"""
+ num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
+ num_masked_span = max(num_masked_span, min_masks)
+
+ # make sure num masked span <= sequence_length
+ if num_masked_span * mask_length > sequence_length:
+ num_masked_span = sequence_length // mask_length
+
+ # make sure num_masked span is also <= input_length - (mask_length - 1)
+ if input_length - (mask_length - 1) < num_masked_span:
+ num_masked_span = max(input_length - (mask_length - 1), 0)
+
+ return num_masked_span
+
+ # compute number of masked spans in batch
+ input_lengths = (
+ attention_mask.detach().sum(-1).tolist()
+ if attention_mask is not None
+ else [sequence_length for _ in range(batch_size)]
+ )
+
+ # SpecAugment mask to fill
+ spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
+ spec_aug_mask_idxs = []
+
+ max_num_masked_span = compute_num_masked_span(sequence_length)
+
+ if max_num_masked_span == 0:
+ return spec_aug_mask
+
+ for input_length in input_lengths:
+ # compute num of masked spans for this input
+ num_masked_span = compute_num_masked_span(input_length)
+
+ # get random indices to mask
+ spec_aug_mask_idx = np.random.choice(
+ np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
+ )
+
+ # pick first sampled index that will serve as a dummy index to pad vector
+ # to ensure same dimension for all batches due to probabilistic rounding
+ # Picking first sample just pads those vectors twice.
+ if len(spec_aug_mask_idx) == 0:
+ # this case can only happen if `input_length` is strictly smaller than
+ # `sequence_length` in which case the last token has to be a padding
+ # token which we can use as a dummy mask id
+ dummy_mask_idx = sequence_length - 1
+ else:
+ dummy_mask_idx = spec_aug_mask_idx[0]
+
+ spec_aug_mask_idx = np.concatenate(
+ [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
+ )
+ spec_aug_mask_idxs.append(spec_aug_mask_idx)
+
+ spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
+
+ # expand masked indices to masked spans
+ spec_aug_mask_idxs = np.broadcast_to(
+ spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
+ )
+ spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
+
+ # add offset to the starting indexes so that indexes now create a span
+ offsets = np.arange(mask_length)[None, None, :]
+ offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
+ batch_size, max_num_masked_span * mask_length
+ )
+ spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
+
+ # ensure that we cannot have indices larger than sequence_length
+ if spec_aug_mask_idxs.max() > sequence_length - 1:
+ spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
+
+ # scatter indices to mask
+ np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
+
+ return spec_aug_mask
+
+
+_EXPECTED_OUTPUT_SHAPE = [1, 292, 768]
+
+
+DATA2VEC_AUDIO_START_DOCSTRING = r"""
+ Data2VecAudio was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and
+ Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and
+ Michael Auli.
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, etc.).
+
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`Data2VecAudioConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DATA2VEC_AUDIO_INPUTS_DOCSTRING = r"""
+ Args:
+ input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+ Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
+ into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile library (*pip install
+ soundfile*). To prepare the array into *input_values*, the [`AutoProcessor`] should be used for padding and
+ conversion into a tensor of type *torch.FloatTensor*. See [`Wav2Vec2Processor.__call__`] for details.
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
+ 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+
+
+ `attention_mask` should be passed if the corresponding processor has `config.return_attention_mask ==
+ True`, which is the case for all pre-trained Data2Vec Audio models. Be aware that, even with
+ `attention_mask`, zero-padded inputs will have slightly different outputs compared to non-padded inputs
+ because there is more than one convolutional layer in the positional encodings. For a more detailed
+ explanation, see [here](https://github.com/huggingface/transformers/issues/25621#issuecomment-1713759349).
+
+
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+Data2VecAudioBaseModelOutput = Wav2Vec2BaseModelOutput
+
+
+@add_start_docstrings(
+ "The bare Data2VecAudio Model transformer outputting raw hidden-states without any specific head on top.",
+ DATA2VEC_AUDIO_START_DOCSTRING,
+)
+class Data2VecAudioModel(Data2VecAudioPreTrainedModel):
+ def __init__(self, config: Data2VecAudioConfig):
+ super().__init__(config)
+ self.config = config
+ self.feature_extractor = Data2VecAudioFeatureEncoder(config)
+ self.feature_projection = Data2VecAudioFeatureProjection(config)
+
+ # model only needs masking vector if mask prob is > 0.0
+ if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
+ self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
+
+ self.encoder = Data2VecAudioEncoder(config)
+
+ self.adapter = Data2VecAudioAdapter(config) if config.add_adapter else None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def freeze_feature_encoder(self):
+ """
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+ not be updated during training.
+ """
+ self.feature_extractor._freeze_parameters()
+
+ def _mask_hidden_states(
+ self,
+ hidden_states: torch.FloatTensor,
+ mask_time_indices: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.LongTensor] = None,
+ ):
+ """
+ Masks extracted features along time axis and/or along feature axis according to
+ [SpecAugment](https://arxiv.org/abs/1904.08779).
+ """
+
+ # `config.apply_spec_augment` can set masking to False
+ if not getattr(self.config, "apply_spec_augment", True):
+ return hidden_states
+
+ # generate indices & apply SpecAugment along time axis
+ batch_size, sequence_length, hidden_size = hidden_states.size()
+
+ if mask_time_indices is not None:
+ # apply SpecAugment along time axis with given mask_time_indices
+ hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
+ elif self.config.mask_time_prob > 0 and self.training:
+ mask_time_indices = _compute_mask_indices(
+ (batch_size, sequence_length),
+ mask_prob=self.config.mask_time_prob,
+ mask_length=self.config.mask_time_length,
+ attention_mask=attention_mask,
+ min_masks=self.config.mask_time_min_masks,
+ )
+ mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
+ hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
+
+ if self.config.mask_feature_prob > 0 and self.training:
+ # generate indices & apply SpecAugment along feature axis
+ mask_feature_indices = _compute_mask_indices(
+ (batch_size, hidden_size),
+ mask_prob=self.config.mask_feature_prob,
+ mask_length=self.config.mask_feature_length,
+ min_masks=self.config.mask_feature_min_masks,
+ )
+ mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
+ mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
+ hidden_states[mask_feature_indices] = 0
+
+ return hidden_states
+
+ @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=Data2VecAudioBaseModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ modality="audio",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(
+ self,
+ input_values: Optional[torch.Tensor],
+ attention_mask: Optional[torch.Tensor] = None,
+ mask_time_indices: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, Data2VecAudioBaseModelOutput]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ extract_features = self.feature_extractor(input_values)
+ extract_features = extract_features.transpose(1, 2)
+
+ if attention_mask is not None:
+ # compute reduced attention_mask corresponding to feature vectors
+ attention_mask = self._get_feature_vector_attention_mask(
+ extract_features.shape[1], attention_mask, add_adapter=False
+ )
+
+ hidden_states, extract_features = self.feature_projection(extract_features)
+ hidden_states = self._mask_hidden_states(
+ hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+ )
+
+ encoder_outputs = self.encoder(
+ hidden_states,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = encoder_outputs[0]
+
+ if self.adapter is not None:
+ hidden_states = self.adapter(hidden_states)
+
+ if not return_dict:
+ return (hidden_states, extract_features) + encoder_outputs[1:]
+
+ return Data2VecAudioBaseModelOutput(
+ last_hidden_state=hidden_states,
+ extract_features=extract_features,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+_HIDDEN_STATES_START_POSITION = 2
+
+# CTC docstring
+_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
+_CTC_EXPECTED_LOSS = 66.95
+
+
+@add_start_docstrings(
+ """Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
+ DATA2VEC_AUDIO_START_DOCSTRING,
+)
+class Data2VecAudioForCTC(Data2VecAudioPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.data2vec_audio = Data2VecAudioModel(config)
+ self.dropout = nn.Dropout(config.final_dropout)
+
+ if config.vocab_size is None:
+ raise ValueError(
+ f"You are trying to instantiate {self.__class__} with a configuration that "
+ "does not define the vocabulary size of the language model head. Please "
+ "instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
+ "or define `vocab_size` of your model's configuration."
+ )
+ output_hidden_size = (
+ config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
+ )
+ self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def freeze_feature_extractor(self):
+ """
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+ not be updated during training.
+ """
+ warnings.warn(
+ "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+ "Please use the equivalent `freeze_feature_encoder` method instead.",
+ FutureWarning,
+ )
+ self.freeze_feature_encoder()
+
+ def freeze_feature_encoder(self):
+ """
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+ not be updated during training.
+ """
+ self.data2vec_audio.feature_extractor._freeze_parameters()
+
+ @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=CausalLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_CTC_EXPECTED_OUTPUT,
+ expected_loss=_CTC_EXPECTED_LOSS,
+ )
+ def forward(
+ self,
+ input_values: Optional[torch.Tensor],
+ attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: Optional[torch.Tensor] = None,
+ ) -> Union[Tuple, CausalLMOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+ Labels for connectionist temporal classification. Note that `target_length` has to be smaller than or equal to
+ the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
+ All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
+ config.vocab_size - 1]`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if labels is not None and labels.max() >= self.config.vocab_size:
+ raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
+ outputs = self.data2vec_audio(
+ input_values,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+ hidden_states = self.dropout(hidden_states)
+
+ logits = self.lm_head(hidden_states)
+
+ loss = None
+ if labels is not None:
+ # retrieve loss input_lengths from attention_mask
+ attention_mask = (
+ attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
+ )
+ input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+
+ # assuming that padded tokens are filled with -100
+ # when not being attended to
+ labels_mask = labels >= 0
+ target_lengths = labels_mask.sum(-1)
+ flattened_targets = labels.masked_select(labels_mask)
+
+ # ctc_loss doesn't support fp16
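+ # and expects log_probs of shape (input_length, batch_size, vocab_size), hence the transpose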
+ log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
+
+ with torch.backends.cudnn.flags(enabled=False):
+ loss = nn.functional.ctc_loss(
+ log_probs,
+ flattened_targets,
+ input_lengths,
+ target_lengths,
+ blank=self.config.pad_token_id,
+ reduction=self.config.ctc_loss_reduction,
+ zero_infinity=self.config.ctc_zero_infinity,
+ )
+
+ if not return_dict:
+ output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+ return ((loss,) + output) if loss is not None else output
+
+ return CausalLMOutput(
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+ )
+
+
+@add_start_docstrings(
+ """
+ Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks
+ like SUPERB Keyword Spotting.
+ """,
+ DATA2VEC_AUDIO_START_DOCSTRING,
+)
+class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ if hasattr(config, "add_adapter") and config.add_adapter:
+ raise ValueError(
+ "Sequence classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)"
+ )
+ self.data2vec_audio = Data2VecAudioModel(config)
+ num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
+ if config.use_weighted_layer_sum:
+ self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+ self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
+ self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def freeze_feature_extractor(self):
+ """
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+ not be updated during training.
+ """
+ warnings.warn(
+ "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+ "Please use the equivalent `freeze_feature_encoder` method instead.",
+ FutureWarning,
+ )
+ self.freeze_feature_encoder()
+
+ def freeze_feature_encoder(self):
+ """
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+ not be updated during training.
+ """
+ self.data2vec_audio.feature_extractor._freeze_parameters()
+
+ def freeze_base_model(self):
+ """
+ Calling this function will disable the gradient computation for the base model so that its parameters will not
+ be updated during training. Only the classification head will be updated.
+ """
+ for param in self.data2vec_audio.parameters():
+ param.requires_grad = False
+
+ @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=SequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ modality="audio",
+ )
+ def forward(
+ self,
+ input_values: Optional[torch.Tensor],
+ attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: Optional[torch.Tensor] = None,
+ ) -> Union[Tuple, SequenceClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1`, a regression loss is computed (Mean-Square loss); if
+ `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
+ """
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+ outputs = self.data2vec_audio(
+ input_values,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ if self.config.use_weighted_layer_sum:
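+ # learned, softmax-normalized weights over all hidden states (input embeddings plus every
+ # transformer layer), summed into a single sequence of features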
+ hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+ hidden_states = torch.stack(hidden_states, dim=1)
+ norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+ hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+ else:
+ hidden_states = outputs[0]
+
+ hidden_states = self.projector(hidden_states)
+ if attention_mask is None:
+ pooled_output = hidden_states.mean(dim=1)
+ else:
+ padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
+ expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+ hidden_states[~expand_padding_mask] = 0.0
+ pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
+
+ logits = self.classifier(pooled_output)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+
+ if not return_dict:
+ output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ Data2VecAudio Model with a frame classification head on top for tasks like Speaker Diarization.
+ """,
+ DATA2VEC_AUDIO_START_DOCSTRING,
+)
+class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ if hasattr(config, "add_adapter") and config.add_adapter:
+ raise ValueError(
+ "Audio frame classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)"
+ )
+ self.data2vec_audio = Data2VecAudioModel(config)
+ num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
+ if config.use_weighted_layer_sum:
+ self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+ self.num_labels = config.num_labels
+
+ self.init_weights()
+
+ def freeze_feature_extractor(self):
+ """
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+ not be updated during training.
+ """
+ warnings.warn(
+ "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+ "Please use the equivalent `freeze_feature_encoder` method instead.",
+ FutureWarning,
+ )
+ self.freeze_feature_encoder()
+
+ def freeze_feature_encoder(self):
+ """
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+ not be updated during training.
+ """
+ self.data2vec_audio.feature_extractor._freeze_parameters()
+
+ def freeze_base_model(self):
+ """
+ Calling this function will disable the gradient computation for the base model so that its parameters will not
+ be updated during training. Only the classification head will be updated.
+ """
+ for param in self.data2vec_audio.parameters():
+ param.requires_grad = False
+
+ @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ modality="audio",
+ )
+ def forward(
+ self,
+ input_values: Optional[torch.Tensor],
+ attention_mask: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, TokenClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1`, a regression loss is computed (Mean-Square loss); if
+ `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
+ """
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+ outputs = self.data2vec_audio(
+ input_values,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ if self.config.use_weighted_layer_sum:
+ hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+ hidden_states = torch.stack(hidden_states, dim=1)
+ norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+ hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+ else:
+ hidden_states = outputs[0]
+
+ logits = self.classifier(hidden_states)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))
+
+ if not return_dict:
+ output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+ return output
+
+ return TokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+class AMSoftmaxLoss(nn.Module):
+ def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
+ super().__init__()
+ self.scale = scale
+ self.margin = margin
+ self.num_labels = num_labels
+ self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
+ self.loss = nn.CrossEntropyLoss()
+
+ def forward(self, hidden_states, labels):
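+ # Additive-margin softmax: L2-normalize the class weights and the embeddings so that the logits
+ # are cosine similarities, subtract `margin` from the target-class cosine only, scale the result,
+ # and apply a standard cross-entropy loss.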
+ labels = labels.flatten()
+ weight = nn.functional.normalize(self.weight, dim=0)
+ hidden_states = nn.functional.normalize(hidden_states, dim=1)
+ cos_theta = torch.mm(hidden_states, weight)
+ psi = cos_theta - self.margin
+
+ onehot = nn.functional.one_hot(labels, self.num_labels)
+ logits = self.scale * torch.where(onehot.bool(), psi, cos_theta)
+ loss = self.loss(logits, labels)
+
+ return loss
+
+
+class TDNNLayer(nn.Module):
+ def __init__(self, config, layer_id=0):
+ super().__init__()
+ self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id]
+ self.out_conv_dim = config.tdnn_dim[layer_id]
+ self.kernel_size = config.tdnn_kernel[layer_id]
+ self.dilation = config.tdnn_dilation[layer_id]
+
+ self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
+ self.activation = nn.ReLU()
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ if is_peft_available():
+ from peft.tuners.lora import LoraLayer
+
+ if is_peft_available():
+ if isinstance(self.kernel, LoraLayer):
+ warnings.warn(
+ "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
+ "You should exclude TDNNLayer from LoRA's target modules.",
+ )
+
+ # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up
+ hidden_states = hidden_states.transpose(1, 2)
+ weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)
+ hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation)
+ hidden_states = hidden_states.transpose(1, 2)
+
+ hidden_states = self.activation(hidden_states)
+ return hidden_states
+
+
+@add_start_docstrings(
+ """
+ Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification.
+ """,
+ DATA2VEC_AUDIO_START_DOCSTRING,
+)
+class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.data2vec_audio = Data2VecAudioModel(config)
+ num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
+ if config.use_weighted_layer_sum:
+ self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+ self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])
+
+ tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))]
+ self.tdnn = nn.ModuleList(tdnn_layers)
+
+ self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
+ self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)
+
+ self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)
+
+ self.init_weights()
+
+ def freeze_feature_extractor(self):
+ """
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+ not be updated during training.
+ """
+ warnings.warn(
+ "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+ "Please use the equivalent `freeze_feature_encoder` method instead.",
+ FutureWarning,
+ )
+ self.freeze_feature_encoder()
+
+ def freeze_feature_encoder(self):
+ """
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+ not be updated during training.
+ """
+ self.data2vec_audio.feature_extractor._freeze_parameters()
+
+ def freeze_base_model(self):
+ """
+ Calling this function will disable the gradient computation for the base model so that its parameters will not
+ be updated during training. Only the classification head will be updated.
+ """
+ for param in self.data2vec_audio.parameters():
+ param.requires_grad = False
+
+ def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+ """
+ Computes the output length of the TDNN layers
+ """
+
+ def _conv_out_length(input_length, kernel_size, stride):
+ # 1D convolutional layer output length formula taken
+ # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+ return (input_length - kernel_size) // stride + 1
+
+ for kernel_size in self.config.tdnn_kernel:
+ input_lengths = _conv_out_length(input_lengths, kernel_size, 1)
+
+ return input_lengths
+
+ @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=XVectorOutput,
+ config_class=_CONFIG_FOR_DOC,
+ modality="audio",
+ )
+ def forward(
+ self,
+ input_values: Optional[torch.Tensor],
+ attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: Optional[torch.Tensor] = None,
+ ) -> Union[Tuple, XVectorOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1`, a regression loss is computed (Mean-Square loss); if
+ `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
+ """
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+ outputs = self.data2vec_audio(
+ input_values,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ if self.config.use_weighted_layer_sum:
+ hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+ hidden_states = torch.stack(hidden_states, dim=1)
+ norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+ hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+ else:
+ hidden_states = outputs[0]
+
+ hidden_states = self.projector(hidden_states)
+
+ for tdnn_layer in self.tdnn:
+ hidden_states = tdnn_layer(hidden_states)
+
+ # Statistic Pooling
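+ # concatenate the mean and standard deviation over the time axis so the utterance-level
+ # representation has a fixed size regardless of input duration; with an attention_mask the
+ # statistics are computed over the valid (non-padded) frames only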
+ if attention_mask is None:
+ mean_features = hidden_states.mean(dim=1)
+ std_features = hidden_states.std(dim=1)
+ else:
+ feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
+ tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
+ mean_features = []
+ std_features = []
+ for i, length in enumerate(tdnn_output_lengths):
+ mean_features.append(hidden_states[i, :length].mean(dim=0))
+ std_features.append(hidden_states[i, :length].std(dim=0))
+ mean_features = torch.stack(mean_features)
+ std_features = torch.stack(std_features)
+ statistic_pooling = torch.cat([mean_features, std_features], dim=-1)
+
+ output_embeddings = self.feature_extractor(statistic_pooling)
+ logits = self.classifier(output_embeddings)
+
+ loss = None
+ if labels is not None:
+ loss = self.objective(logits, labels)
+
+ if not return_dict:
+ output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
+ return ((loss,) + output) if loss is not None else output
+
+ return XVectorOutput(
+ loss=loss,
+ logits=logits,
+ embeddings=output_embeddings,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = [
+ "Data2VecAudioForAudioFrameClassification",
+ "Data2VecAudioForCTC",
+ "Data2VecAudioForSequenceClassification",
+ "Data2VecAudioForXVector",
+ "Data2VecAudioModel",
+ "Data2VecAudioPreTrainedModel",
+]
diff --git a/docs/transformers/build/lib/transformers/models/data2vec/modeling_data2vec_text.py b/docs/transformers/build/lib/transformers/models/data2vec/modeling_data2vec_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f84eca754e85b1c458b2d581d87021d62d964e3
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/data2vec/modeling_data2vec_text.py
@@ -0,0 +1,1553 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Data2VecText model."""
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN, gelu
+from ...generation import GenerationMixin
+from ...modeling_outputs import (
+ BaseModelOutputWithPastAndCrossAttentions,
+ BaseModelOutputWithPoolingAndCrossAttentions,
+ CausalLMOutputWithCrossAttentions,
+ MaskedLMOutput,
+ MultipleChoiceModelOutput,
+ QuestionAnsweringModelOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_data2vec_text import Data2VecTextConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+_HIDDEN_STATES_START_POSITION = 2
+
+# General docstring
+_CHECKPOINT_FOR_DOC = "facebook/data2vec-text-base"
+_CONFIG_FOR_DOC = "Data2VecTextConfig"
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Data2VecText
+class Data2VecTextForTextEmbeddings(nn.Module):
+ """
+ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
+ """
+
+ # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
+ def __init__(self, config):
+ super().__init__()
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+ # any TensorFlow checkpoint file
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+ self.register_buffer(
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+ )
+ self.register_buffer(
+ "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+ )
+
+ # End copy
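+ # Like RoBERTa, Data2VecText reserves position ids up to `padding_idx` for padding tokens, so the
+ # position embedding table is re-created with `padding_idx` and real positions start at
+ # `padding_idx + 1` (see `create_position_ids_from_inputs_embeds` below).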
+ self.padding_idx = config.pad_token_id
+ self.position_embeddings = nn.Embedding(
+ config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+ )
+
+ def forward(
+ self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
+ ):
+ if position_ids is None:
+ if input_ids is not None:
+ # Create the position ids from the input token ids. Any padded tokens remain padded.
+ position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
+ else:
+ position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
+
+ if input_ids is not None:
+ input_shape = input_ids.size()
+ else:
+ input_shape = inputs_embeds.size()[:-1]
+
+ seq_length = input_shape[1]
+
+ # If token_type_ids are not passed, use the all-zeros buffer registered in the constructor. The
+ # registered buffer helps users trace the model without passing token_type_ids and solves
+ # issue #5664
+ if token_type_ids is None:
+ if hasattr(self, "token_type_ids"):
+ buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+ token_type_ids = buffered_token_type_ids_expanded
+ else:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+ embeddings = inputs_embeds + token_type_embeddings
+ if self.position_embedding_type == "absolute":
+ position_embeddings = self.position_embeddings(position_ids)
+ embeddings += position_embeddings
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+ def create_position_ids_from_inputs_embeds(self, inputs_embeds):
+ """
+ We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+ Args:
+ inputs_embeds: torch.Tensor
+
+ Returns: torch.Tensor
+ """
+ input_shape = inputs_embeds.size()[:-1]
+ sequence_length = input_shape[1]
+
+ position_ids = torch.arange(
+ self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+ )
+ return position_ids.unsqueeze(0).expand(input_shape)
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->Data2VecText
+class Data2VecTextSelfAttention(nn.Module):
+ def __init__(self, config, position_embedding_type=None):
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+ f"heads ({config.num_attention_heads})"
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+ self.position_embedding_type = position_embedding_type or getattr(
+ config, "position_embedding_type", "absolute"
+ )
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+ self.max_position_embeddings = config.max_position_embeddings
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+ self.is_decoder = config.is_decoder
+
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.Tensor]:
+ mixed_query_layer = self.query(hidden_states)
+
+ # If this is instantiated as a cross-attention module, the keys
+ # and values come from an encoder; the attention mask needs to be
+ # such that the encoder's padding tokens are not attended to.
+ is_cross_attention = encoder_hidden_states is not None
+
+ if is_cross_attention and past_key_value is not None:
+ # reuse k,v, cross_attentions
+ key_layer = past_key_value[0]
+ value_layer = past_key_value[1]
+ attention_mask = encoder_attention_mask
+ elif is_cross_attention:
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+ attention_mask = encoder_attention_mask
+ elif past_key_value is not None:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+ else:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ use_cache = past_key_value is not None
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
+ past_key_value = (key_layer, value_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
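+ # "relative_key" / "relative_key_query": add learned, distance-dependent position biases to the
+ # raw attention scores before scaling and softmax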
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+ query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+ if use_cache:
+ position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+ -1, 1
+ )
+ else:
+ position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+ position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+ distance = position_ids_l - position_ids_r
+
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
+
+ if self.position_embedding_type == "relative_key":
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+ attention_scores = attention_scores + relative_position_scores
+ elif self.position_embedding_type == "relative_key_query":
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ if attention_mask is not None:
+ # Apply the attention mask (precomputed for all layers in the Data2VecTextModel forward() function)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ if self.is_decoder:
+ outputs = outputs + (past_key_value,)
+ return outputs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
+class Data2VecTextSelfOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+DATA2VEC_TEXT_SELF_ATTENTION_CLASSES = {
+ "eager": Data2VecTextSelfAttention,
+}
+
+
+# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Data2VecText,BERT->DATA2VEC_TEXT
+class Data2VecTextAttention(nn.Module):
+ def __init__(self, config, position_embedding_type=None):
+ super().__init__()
+ self.self = DATA2VEC_TEXT_SELF_ATTENTION_CLASSES[config._attn_implementation](
+ config, position_embedding_type=position_embedding_type
+ )
+ self.output = Data2VecTextSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.self.query = prune_linear_layer(self.self.query, index)
+ self.self.key = prune_linear_layer(self.self.key, index)
+ self.self.value = prune_linear_layer(self.self.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.Tensor]:
+ self_outputs = self.self(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions,
+ )
+ attention_output = self.output(self_outputs[0], hidden_states)
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate
+class Data2VecTextIntermediate(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput
+class Data2VecTextOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Data2VecText
+class Data2VecTextLayer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.seq_len_dim = 1
+ self.attention = Data2VecTextAttention(config)
+ self.is_decoder = config.is_decoder
+ self.add_cross_attention = config.add_cross_attention
+ if self.add_cross_attention:
+ if not self.is_decoder:
+ raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+ self.crossattention = Data2VecTextAttention(config, position_embedding_type="absolute")
+ self.intermediate = Data2VecTextIntermediate(config)
+ self.output = Data2VecTextOutput(config)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.Tensor]:
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+ self_attention_outputs = self.attention(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ output_attentions=output_attentions,
+ past_key_value=self_attn_past_key_value,
+ )
+ attention_output = self_attention_outputs[0]
+
+ # if decoder, the last output is tuple of self-attn cache
+ if self.is_decoder:
+ outputs = self_attention_outputs[1:-1]
+ present_key_value = self_attention_outputs[-1]
+ else:
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ cross_attn_present_key_value = None
+ if self.is_decoder and encoder_hidden_states is not None:
+ if not hasattr(self, "crossattention"):
+ raise ValueError(
+ f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+ " by setting `config.add_cross_attention=True`"
+ )
+
+ # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
+ cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+ cross_attention_outputs = self.crossattention(
+ attention_output,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ cross_attn_past_key_value,
+ output_attentions,
+ )
+ attention_output = cross_attention_outputs[0]
+ outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
+
+ # add cross-attn cache to positions 3,4 of present_key_value tuple
+ cross_attn_present_key_value = cross_attention_outputs[-1]
+ present_key_value = present_key_value + cross_attn_present_key_value
+
+ layer_output = apply_chunking_to_forward(
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+ )
+ outputs = (layer_output,) + outputs
+
+ # if decoder, return the attn key/values as the last output
+ if self.is_decoder:
+ outputs = outputs + (present_key_value,)
+
+ return outputs
+
+ def feed_forward_chunk(self, attention_output):
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ return layer_output
+
+
+# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Data2VecText
+class Data2VecTextEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList([Data2VecTextLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = False,
+ output_hidden_states: Optional[bool] = False,
+ return_dict: Optional[bool] = True,
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ next_decoder_cache = () if use_cache else None
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+ past_key_value = past_key_values[i] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ layer_module.__call__,
+ hidden_states,
+ attention_mask,
+ layer_head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions,
+ )
+ else:
+ layer_outputs = layer_module(
+ hidden_states,
+ attention_mask,
+ layer_head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
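+ # when caching is enabled the per-layer key/value cache is returned as the last element; attention weights
+ # (if requested) are at index 1 (self-attention) and index 2 (cross-attention)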
+ if use_cache:
+ next_decoder_cache += (layer_outputs[-1],)
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+ if self.config.add_cross_attention:
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ next_decoder_cache,
+ all_hidden_states,
+ all_self_attentions,
+ all_cross_attentions,
+ ]
+ if v is not None
+ )
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=next_decoder_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPooler
+class Data2VecTextPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+class Data2VecTextPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = Data2VecTextConfig
+ base_model_prefix = "data2vec_text"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["Data2VecTextForTextEmbeddings", "Data2VecTextLayer"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, nn.Linear):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ if hasattr(module, "bias") and module.bias is not None:
+ module.bias.data.zero_()
+ if hasattr(module, "weight") and module.weight is not None:
+ module.weight.data.fill_(1.0)
+
+
+DATA2VECTEXT_START_DOCSTRING = r"""
+ Data2VecText was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and
+ Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and
+ Michael Auli.
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
+ etc.).
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`Data2VecTextConfig`]): Model configuration class with all the parameters of the
+ model. Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DATA2VECTEXT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Data2VecText Model for text transformer outputting raw hidden-states without any specific head on top.",
+ DATA2VECTEXT_START_DOCSTRING,
+)
+class Data2VecTextModel(Data2VecTextPreTrainedModel):
+ """
+
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+ cross-attention is added between the self-attention layers, following the architecture described in *Attention is
+ all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
+ Kaiser and Illia Polosukhin.
+
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+ to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
+ `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+
+ .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
+
+ """
+
+ def __init__(self, config, add_pooling_layer=True):
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = Data2VecTextForTextEmbeddings(config)
+ self.encoder = Data2VecTextEncoder(config)
+
+ self.pooler = Data2VecTextPooler(config) if add_pooling_layer else None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embeddings.word_embeddings = value
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPoolingAndCrossAttentions,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+ r"""
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if self.config.is_decoder:
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ else:
+ use_cache = False
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+ input_shape = input_ids.size()
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ batch_size, seq_length = input_shape
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+ # past_key_values_length
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+
+ if attention_mask is None:
+ attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
+
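+ # when token_type_ids are not provided, reuse the buffer registered on the embeddings module (sliced and
+ # expanded to the current batch); otherwise default to all zeros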
+ if token_type_ids is None:
+ if hasattr(self.embeddings, "token_type_ids"):
+ buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+ token_type_ids = buffered_token_type_ids_expanded
+ else:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+
+ # If a 2D or 3D attention mask is provided for the cross-attention
+ # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if self.config.is_decoder and encoder_hidden_states is not None:
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+ if encoder_attention_mask is None:
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+ else:
+ encoder_extended_attention_mask = None
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ token_type_ids=token_type_ids,
+ inputs_embeds=inputs_embeds,
+ past_key_values_length=past_key_values_length,
+ )
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask=extended_attention_mask,
+ head_mask=head_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_extended_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = encoder_outputs[0]
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+ if not return_dict:
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPoolingAndCrossAttentions(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ past_key_values=encoder_outputs.past_key_values,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ cross_attentions=encoder_outputs.cross_attentions,
+ )
+
+
+@add_start_docstrings(
+ """Data2VecText Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VECTEXT_START_DOCSTRING
+)
+class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel, GenerationMixin):
+ _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ if not config.is_decoder:
+ logger.warning("If you want to use `Data2VecTextLMHeadModel` as a standalone, add `is_decoder=True.`")
+
+ self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
+ self.lm_head = Data2VecTextLMHead(config)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_output_embeddings(self):
+ return self.lm_head.decoder
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head.decoder = new_embeddings
+
+ @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
+ r"""
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+ `[-100, 0, ..., config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, Data2VecTextForCausalLM, Data2VecTextConfig
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
+ >>> config = Data2VecTextConfig.from_pretrained("facebook/data2vec-text-base")
+ >>> config.is_decoder = True
+ >>> model = Data2VecTextForCausalLM.from_pretrained("facebook/data2vec-text-base", config=config)
+
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> prediction_logits = outputs.logits
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ if labels is not None:
+ use_cache = False
+
+ outputs = self.data2vec_text(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+ prediction_scores = self.lm_head(sequence_output)
+
+ lm_loss = None
+ if labels is not None:
+ lm_loss = self.loss_function(
+ prediction_scores,
+ labels,
+ vocab_size=self.config.vocab_size,
+ **kwargs,
+ )
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return ((lm_loss,) + output) if lm_loss is not None else output
+
+ return CausalLMOutputWithCrossAttentions(
+ loss=lm_loss,
+ logits=prediction_scores,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ cross_attentions=outputs.cross_attentions,
+ )
+
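+ # reorders the cached key/value states so they follow the selected beam indices during beam search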
+ def _reorder_cache(self, past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+
+@add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VECTEXT_START_DOCSTRING)
+class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel):
+ _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ if config.is_decoder:
+ logger.warning(
+ "If you want to use `Data2VecTextForMaskedLM` make sure `config.is_decoder=False` for "
+ "bi-directional self-attention."
+ )
+
+ self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
+ self.lm_head = Data2VecTextLMHead(config)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_output_embeddings(self):
+ return self.lm_head.decoder
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head.decoder = new_embeddings
+
+ @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=MaskedLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ mask="",
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, MaskedLMOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
+ the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ kwargs (`Dict[str, any]`, *optional*, defaults to *{}*):
+ Used to hide legacy arguments that have been deprecated.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.data2vec_text(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = outputs[0]
+ prediction_scores = self.lm_head(sequence_output)
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+
+ labels = labels.to(prediction_scores.device)
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return MaskedLMOutput(
+ loss=masked_lm_loss,
+ logits=prediction_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
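+# A minimal, untested usage sketch for `Data2VecTextForMaskedLM` (assuming the public
+# "facebook/data2vec-text-base" checkpoint and its RoBERTa-style "<mask>" token):
+#   from transformers import AutoTokenizer
+#   tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
+#   model = Data2VecTextForMaskedLM.from_pretrained("facebook/data2vec-text-base")
+#   inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
+#   mask_logits = model(**inputs).logits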
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->Data2VecText
+class Data2VecTextLMHead(nn.Module):
+ """Data2VecText Head for masked language modeling."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+ self.decoder.bias = self.bias
+
+ def forward(self, features, **kwargs):
+ x = self.dense(features)
+ x = gelu(x)
+ x = self.layer_norm(x)
+
+ # project back to size of vocabulary with bias
+ x = self.decoder(x)
+
+ return x
+
+ def _tie_weights(self):
+ # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
+ # For accelerate compatibility and to not break backward compatibility
+ if self.decoder.bias.device.type == "meta":
+ self.decoder.bias = self.bias
+ else:
+ self.bias = self.decoder.bias
+
+
+@add_start_docstrings(
+ """
+ Data2VecText Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+ pooled output) e.g. for GLUE tasks.
+ """,
+ DATA2VECTEXT_START_DOCSTRING,
+)
+class Data2VecTextForSequenceClassification(Data2VecTextPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.config = config
+
+ self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
+ self.classifier = Data2VecTextClassificationHead(config)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=SequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.data2vec_text(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = outputs[0]
+ logits = self.classifier(sequence_output)
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
+
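+ # infer `problem_type` when it is not set: `num_labels == 1` -> regression, integer labels -> single-label
+ # classification, floating-point labels -> multi-label classification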
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ Data2VecText Model with a multiple choice classification head on top (a linear layer on top of the pooled output
+ and a softmax) e.g. for RocStories/SWAG tasks.
+ """,
+ DATA2VECTEXT_START_DOCSTRING,
+)
+class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.data2vec_text = Data2VecTextModel(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, 1)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(
+ DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
+ )
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=MultipleChoiceModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, MultipleChoiceModelOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
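+ # collapse the choices dimension: (batch_size, num_choices, seq_len) -> (batch_size * num_choices, seq_len)
+ # so each candidate is encoded independently; the logits are reshaped back to (batch_size, num_choices) below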
+ flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+ flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+ flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+ flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+ flat_inputs_embeds = (
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+ if inputs_embeds is not None
+ else None
+ )
+
+ outputs = self.data2vec_text(
+ flat_input_ids,
+ position_ids=flat_position_ids,
+ token_type_ids=flat_token_type_ids,
+ attention_mask=flat_attention_mask,
+ head_mask=head_mask,
+ inputs_embeds=flat_inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ pooled_output = outputs[1]
+
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+ reshaped_logits = logits.view(-1, num_choices)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+
+ labels = labels.to(reshaped_logits.device)
+ loss = loss_fct(reshaped_logits, labels)
+
+ if not return_dict:
+ output = (reshaped_logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return MultipleChoiceModelOutput(
+ loss=loss,
+ logits=reshaped_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ Data2VecText Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
+ for Named-Entity-Recognition (NER) tasks.
+ """,
+ DATA2VECTEXT_START_DOCSTRING,
+)
+class Data2VecTextForTokenClassification(Data2VecTextPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
+ classifier_dropout = (
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+ )
+ self.dropout = nn.Dropout(classifier_dropout)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, TokenClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.data2vec_text(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ sequence_output = self.dropout(sequence_output)
+ logits = self.classifier(sequence_output)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+
+ labels = labels.to(logits.device)
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Data2VecText
+class Data2VecTextClassificationHead(nn.Module):
+ """Head for sentence-level classification tasks."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ classifier_dropout = (
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+ )
+ self.dropout = nn.Dropout(classifier_dropout)
+ self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+ def forward(self, features, **kwargs):
+ x = features[:, 0, :]  # take the <s> token (equivalent to [CLS])
+ x = self.dropout(x)
+ x = self.dense(x)
+ x = torch.tanh(x)
+ x = self.dropout(x)
+ x = self.out_proj(x)
+ return x
+
+
+@add_start_docstrings(
+ """
+ Data2VecText Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
+ linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+ """,
+ DATA2VECTEXT_START_DOCSTRING,
+)
+class Data2VecTextForQuestionAnswering(Data2VecTextPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=QuestionAnsweringModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ start_positions: Optional[torch.LongTensor] = None,
+ end_positions: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
+ r"""
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.data2vec_text(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
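+ # `qa_outputs` produces two scores per token; they are split into separate start- and end-position logits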
+ logits = self.qa_outputs(sequence_output)
+ start_logits, end_logits = logits.split(1, dim=-1)
+ start_logits = start_logits.squeeze(-1).contiguous()
+ end_logits = end_logits.squeeze(-1).contiguous()
+
+ total_loss = None
+ if start_positions is not None and end_positions is not None:
+ # If we are on multi-GPU, split/gather can add an extra dimension; squeeze it away
+ if len(start_positions.size()) > 1:
+ start_positions = start_positions.squeeze(-1)
+ if len(end_positions.size()) > 1:
+ end_positions = end_positions.squeeze(-1)
+ # sometimes the start/end positions are outside our model inputs; we ignore these terms
+ ignored_index = start_logits.size(1)
+ start_positions = start_positions.clamp(0, ignored_index)
+ end_positions = end_positions.clamp(0, ignored_index)
+
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+ start_loss = loss_fct(start_logits, start_positions)
+ end_loss = loss_fct(end_logits, end_positions)
+ total_loss = (start_loss + end_loss) / 2
+
+ if not return_dict:
+ output = (start_logits, end_logits) + outputs[2:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return QuestionAnsweringModelOutput(
+ loss=total_loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+ """
+ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+ are ignored. This is modified from fairseq's `utils.make_positions`.
+
+ Args:
+ input_ids (`torch.Tensor`): Tensor of input token ids.
+ padding_idx (`int`): Index of the padding token in the vocabulary.
+ past_key_values_length (`int`, *optional*, defaults to 0): Length of the previously cached key/value states.
+
+ Returns: torch.Tensor
+ """
+ # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+ mask = input_ids.ne(padding_idx).int()
+ incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+ return incremental_indices.long() + padding_idx
+
+
+__all__ = [
+ "Data2VecTextForCausalLM",
+ "Data2VecTextForMaskedLM",
+ "Data2VecTextForMultipleChoice",
+ "Data2VecTextForQuestionAnswering",
+ "Data2VecTextForSequenceClassification",
+ "Data2VecTextForTokenClassification",
+ "Data2VecTextModel",
+ "Data2VecTextPreTrainedModel",
+]
diff --git a/docs/transformers/build/lib/transformers/models/data2vec/modeling_data2vec_vision.py b/docs/transformers/build/lib/transformers/models/data2vec/modeling_data2vec_vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ea0ffd39d47f75e3b761db62120af79c121a3a1
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/data2vec/modeling_data2vec_vision.py
@@ -0,0 +1,1449 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Data2VecVision model."""
+
+import collections.abc
+import math
+import warnings
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_outputs import (
+ BaseModelOutput,
+ BaseModelOutputWithPooling,
+ ImageClassifierOutput,
+ SemanticSegmenterOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import compile_compatible_method_lru_cache, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+ torch_int,
+)
+from .configuration_data2vec_vision import Data2VecVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "Data2VecVisionConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "facebook/data2vec-vision-base"
+_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "facebook/data2vec-vision-base-ft1k"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "remote control, remote"
+
+
+@dataclass
+# Copied from transformers.models.beit.modeling_beit.BeitModelOutputWithPooling with Beit->Data2VecVision
+class Data2VecVisionModelOutputWithPooling(BaseModelOutputWithPooling):
+ """
+ Class for outputs of [`Data2VecVisionModel`].
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
+ Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
+ *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
+ will be returned.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+ """
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+ argument.
+ """
+ if drop_prob == 0.0 or not training:
+ return input
+ keep_prob = 1 - drop_prob
+ shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+ random_tensor.floor_() # binarize
+ output = input.div(keep_prob) * random_tensor
+ return output
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Data2VecVision
+class Data2VecVisionDropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
+ super().__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ return drop_path(hidden_states, self.drop_prob, self.training)
+
+ def extra_repr(self) -> str:
+ return "p={}".format(self.drop_prob)
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitEmbeddings with Beit->Data2VecVision
+class Data2VecVisionEmbeddings(nn.Module):
+ """
+ Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
+
+ """
+
+ def __init__(self, config: Data2VecVisionConfig) -> None:
+ super().__init__()
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+ if config.use_mask_token:
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+ else:
+ self.mask_token = None
+ self.patch_embeddings = Data2VecVisionPatchEmbeddings(config)
+ self.patch_size = config.patch_size
+ self.image_size = (
+ config.image_size
+ if isinstance(config.image_size, collections.abc.Iterable)
+ else (config.image_size, config.image_size)
+ )
+ num_patches = self.patch_embeddings.num_patches
+ if config.use_absolute_position_embeddings:
+ self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
+ else:
+ self.position_embeddings = None
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+ """
+ This method interpolates the pre-trained position encodings so that the model can be used on higher-resolution
+ images. It is also adapted to support torch.jit tracing.
+
+ Adapted from:
+ - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+ - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+ """
+
+ num_patches = embeddings.shape[1] - 1
+ num_positions = self.position_embeddings.shape[1] - 1
+
+ # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+ if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+ return self.position_embeddings
+
+ class_pos_embed = self.position_embeddings[:, :1]
+ patch_pos_embed = self.position_embeddings[:, 1:]
+
+ dim = embeddings.shape[-1]
+
+ new_height = height // self.patch_size
+ new_width = width // self.patch_size
+
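+ # the stored patch position embeddings form a square grid; reshape them to 2D, resize bicubically to the
+ # new (new_height, new_width) grid, then flatten back into a sequence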
+ sqrt_num_positions = torch_int(num_positions**0.5)
+ patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed,
+ size=(new_height, new_width),
+ mode="bicubic",
+ align_corners=False,
+ )
+
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+ return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+ def forward(
+ self,
+ pixel_values: torch.Tensor,
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
+ interpolate_pos_encoding: Optional[bool] = None,
+ ) -> torch.Tensor:
+ if self.position_embeddings is not None and interpolate_pos_encoding is not None:
+ warnings.warn(
+ "`interpolate_pos_encoding` argument has no effect for BEiTEmbeddings, embeddings are always "
+ "interpolated to the input image size. The argument will be removed in transformers v4.51.0."
+ )
+
+ _, _, height, width = pixel_values.shape
+ embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
+ batch_size, seq_len, _ = embeddings.size()
+
+ if bool_masked_pos is not None:
+ mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
+ # replace the masked visual tokens by mask_tokens
+ w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+ embeddings = embeddings * (1 - w) + mask_tokens * w
+
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+ embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+ if self.position_embeddings is not None:
+ embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+ embeddings = self.dropout(embeddings)
+
+ return embeddings, (patch_height, patch_width)
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitPatchEmbeddings with Beit->Data2VecVision
+class Data2VecVisionPatchEmbeddings(nn.Module):
+ """
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+ Transformer.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ image_size, patch_size = config.image_size, config.patch_size
+ num_channels, hidden_size = config.num_channels, config.hidden_size
+
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+ patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.num_patches = num_patches
+ self.patch_shape = patch_shape
+
+ self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+ batch_size, num_channels, height, width = pixel_values.shape
+ if num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+ )
+
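+ # a Conv2d with stride equal to the patch size cuts the image into non-overlapping patches and projects
+ # each patch to `hidden_size`; the spatial grid is then flattened into a token sequence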
+ embeddings = self.projection(pixel_values)
+ patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
+ embeddings = embeddings.flatten(2).transpose(1, 2)
+
+ return embeddings, (patch_height, patch_width)
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitSelfAttention with Beit->Data2VecVision
+class Data2VecVisionSelfAttention(nn.Module):
+ def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
+ super().__init__()
+ self.config = config
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+ f"heads {config.num_attention_heads}."
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ self.has_relative_position_bias = bool(window_size)
+ if self.has_relative_position_bias:
+ self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size)
+
+ def transpose_for_scores(self, x):
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(*new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ relative_position_bias: Optional[torch.Tensor] = None,
+ interpolate_pos_encoding: bool = False,
+ resolution: Optional[Tuple[int]] = None,
+ ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
+ mixed_query_layer = self.query(hidden_states)
+
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+ # Add relative position bias if present.
+ if self.has_relative_position_bias:
+ height, width = resolution
+ window_size = (height // self.config.patch_size, width // self.config.patch_size)
+ attention_scores = attention_scores + self.relative_position_bias(
+ window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1]
+ )
+
+ # Add shared relative position bias if provided.
+ if relative_position_bias is not None:
+ attention_scores = attention_scores + relative_position_bias
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(*new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ return outputs
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitSdpaSelfAttention with Beit->Data2VecVision
+class Data2VecVisionSdpaSelfAttention(Data2VecVisionSelfAttention):
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ relative_position_bias: Optional[torch.Tensor] = None,
+ interpolate_pos_encoding: bool = False,
+ resolution: Optional[Tuple[int]] = None,
+ ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
+ if output_attentions or head_mask is not None:
+ logger.warning_once(
+ "`Data2VecVisionSdpaSelfAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not "
+ "support `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, "
+ "but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. "
+ 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ return super().forward(
+ hidden_states=hidden_states,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ relative_position_bias=relative_position_bias,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ resolution=resolution,
+ )
+
+ mixed_query_layer = self.query(hidden_states)
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ attn_bias = None
+ if self.has_relative_position_bias:
+ height, width = resolution
+ window_size = (height // self.config.patch_size, width // self.config.patch_size)
+ attn_bias = self.relative_position_bias(
+ window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1]
+ )
+
+ # Add shared relative position bias if provided.
+ if relative_position_bias is not None:
+ if attn_bias is None:
+ attn_bias = relative_position_bias
+ else:
+ attn_bias += relative_position_bias
+
+ scaling = 1 / math.sqrt(self.attention_head_size)
+ context_layer = torch.nn.functional.scaled_dot_product_attention(
+ query_layer,
+ key_layer,
+ value_layer,
+ attn_mask=attn_bias,
+ dropout_p=self.config.attention_probs_dropout_prob if self.training else 0.0,
+ is_causal=False,
+ scale=scaling,
+ )
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(*new_context_layer_shape)
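+ # scaled_dot_product_attention does not expose attention probabilities, so None is returned in their place.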
+ return context_layer, None
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitSelfOutput with Beit->Data2VecVision
+class Data2VecVisionSelfOutput(nn.Module):
+ """
+ The residual connection is defined in Data2VecVisionLayer instead of here (as is the case with other models), due to the
+ layernorm applied before each block.
+ """
+
+ def __init__(self, config: Data2VecVisionConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, gamma=None) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ return hidden_states
+
+
+DATA2VEC_VISION_SELF_ATTENTION_CLASSES = {
+ "eager": Data2VecVisionSelfAttention,
+ "sdpa": Data2VecVisionSdpaSelfAttention,
+}
+
+
+ # Copied from transformers.models.beit.modeling_beit.BeitAttention with Beit->Data2VecVision, BEIT->DATA2VEC_VISION
+class Data2VecVisionAttention(nn.Module):
+ def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
+ super().__init__()
+ self.attention = DATA2VEC_VISION_SELF_ATTENTION_CLASSES[config._attn_implementation](
+ config, window_size=window_size
+ )
+ self.output = Data2VecVisionSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.attention.query = prune_linear_layer(self.attention.query, index)
+ self.attention.key = prune_linear_layer(self.attention.key, index)
+ self.attention.value = prune_linear_layer(self.attention.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+ self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None,
+ interpolate_pos_encoding: bool = False,
+ resolution: Optional[Tuple[int]] = None,
+ ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
+ self_outputs = self.attention(
+ hidden_states, head_mask, output_attentions, relative_position_bias, interpolate_pos_encoding, resolution
+ )
+
+ attention_output = self.output(self_outputs[0], hidden_states)
+
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitIntermediate with Beit->Data2VecVision
+class Data2VecVisionIntermediate(nn.Module):
+ def __init__(self, config: Data2VecVisionConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+
+ return hidden_states
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitOutput with Beit->Data2VecVision
+class Data2VecVisionOutput(nn.Module):
+ def __init__(self, config: Data2VecVisionConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ return hidden_states
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitLayer with Beit->Data2VecVision,BEiT->Data2VecVision
+class Data2VecVisionLayer(nn.Module):
+ """This corresponds to the Block class in the timm implementation."""
+
+ def __init__(
+ self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0
+ ) -> None:
+ super().__init__()
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.seq_len_dim = 1
+ self.attention = Data2VecVisionAttention(config, window_size=window_size)
+ self.intermediate = Data2VecVisionIntermediate(config)
+ self.output = Data2VecVisionOutput(config)
+ self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.drop_path = Data2VecVisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+ self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ init_values = config.layer_scale_init_value
+ if init_values > 0:
+ self.lambda_1 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
+ self.lambda_2 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
+ else:
+ self.lambda_1, self.lambda_2 = None, None
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ relative_position_bias: Optional[torch.Tensor] = None,
+ interpolate_pos_encoding: bool = False,
+ resolution: Optional[Tuple[int]] = None,
+ ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
+ self_attention_outputs = self.attention(
+ self.layernorm_before(hidden_states), # in Data2VecVision, layernorm is applied before self-attention
+ head_mask,
+ output_attentions=output_attentions,
+ relative_position_bias=relative_position_bias,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ resolution=resolution,
+ )
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ # apply lambda_1 if present
+ if self.lambda_1 is not None:
+ attention_output = self.lambda_1 * attention_output
+
+ # first residual connection
+ hidden_states = self.drop_path(attention_output) + hidden_states
+
+ # in Data2VecVision, layernorm is also applied after self-attention
+ layer_output = self.layernorm_after(hidden_states)
+
+ layer_output = self.intermediate(layer_output)
+ layer_output = self.output(layer_output)
+
+ if self.lambda_2 is not None:
+ layer_output = self.lambda_2 * layer_output
+
+ # second residual connection
+ layer_output = self.drop_path(layer_output) + hidden_states
+
+ outputs = (layer_output,) + outputs
+
+ return outputs
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitRelativePositionBias with Beit->Data2VecVision
+class Data2VecVisionRelativePositionBias(nn.Module):
+ def __init__(self, config: Data2VecVisionConfig, window_size: tuple) -> None:
+ super().__init__()
+ self.window_size = window_size
+ self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
+ self.relative_position_bias_table = nn.Parameter(
+ torch.zeros(self.num_relative_distance, config.num_attention_heads)
+ ) # 2*Wh-1 * 2*Ww-1, nH
+ # cls to token & token 2 cls & cls to cls
+
+ @compile_compatible_method_lru_cache(maxsize=10)
+ def generate_relative_position_index(self, window_size: Tuple[int, int]) -> torch.Tensor:
+ """
+ This method creates the relative position index, modified to support arbitrary window sizes,
+ as introduced in [MiDaS v3.1](https://arxiv.org/abs/2307.14460).
+ """
+ num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
+ # cls to token & token 2 cls & cls to cls
+ # get pair-wise relative position index for each token inside the window
+ window_area = window_size[0] * window_size[1]
+ grid = torch.meshgrid(torch.arange(window_size[0]), torch.arange(window_size[1]), indexing="ij")
+ coords = torch.stack(grid) # 2, Wh, Ww
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
+ relative_coords[:, :, 1] += window_size[1] - 1
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+ relative_position_index = torch.zeros(size=(window_area + 1,) * 2, dtype=relative_coords.dtype)
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
+ relative_position_index[0, 0:] = num_relative_distance - 3
+ relative_position_index[0:, 0] = num_relative_distance - 2
+ relative_position_index[0, 0] = num_relative_distance - 1
+ return relative_position_index
+
+ def forward(self, window_size, interpolate_pos_encoding: bool = False, dim_size=None) -> torch.Tensor:
+ """
+ Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
+ """
+ old_height = 2 * self.window_size[0] - 1
+ old_width = 2 * self.window_size[1] - 1
+
+ new_height = 2 * window_size[0] - 1
+ new_width = 2 * window_size[1] - 1
+
+ old_relative_position_bias_table = self.relative_position_bias_table
+
+ old_num_relative_distance = self.num_relative_distance
+ new_num_relative_distance = new_height * new_width + 3
+
+ old_sub_table = old_relative_position_bias_table[: old_num_relative_distance - 3]
+
+ old_sub_table = old_sub_table.reshape(1, old_width, old_height, -1).permute(0, 3, 1, 2)
+ new_sub_table = nn.functional.interpolate(
+ old_sub_table, size=(torch_int(new_height), torch_int(new_width)), mode="bilinear"
+ )
+ new_sub_table = new_sub_table.permute(0, 2, 3, 1).reshape(new_num_relative_distance - 3, -1)
+
+ new_relative_position_bias_table = torch.cat(
+ [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3 :]]
+ )
+
+ relative_position_index = self.generate_relative_position_index(window_size)
+ relative_position_bias = new_relative_position_bias_table[relative_position_index.view(-1)]
+
+ # patch_size*num_patches_height, patch_size*num_patches_width, num_attention_heads
+ relative_position_bias = relative_position_bias.view(
+ window_size[0] * window_size[1] + 1, window_size[0] * window_size[1] + 1, -1
+ )
+ # num_attention_heads, patch_size*num_patches_width, patch_size*num_patches_height
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
+
+ if interpolate_pos_encoding:
+ relative_position_bias = nn.functional.interpolate(
+ relative_position_bias.unsqueeze(1),
+ size=(dim_size, dim_size),
+ mode="bilinear",
+ align_corners=False,
+ ).squeeze(1)
+
+ return relative_position_bias.unsqueeze(0)
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitEncoder with Beit->Data2VecVision
+class Data2VecVisionEncoder(nn.Module):
+ def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
+ super().__init__()
+ self.config = config
+ self.has_relative_position_bias = config.use_shared_relative_position_bias
+ if self.has_relative_position_bias:
+ self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size)
+
+ # stochastic depth decay rule
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers, device="cpu")]
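+ # e.g. a drop_path_rate of 0.1 over 12 hidden layers ramps the per-layer rate linearly from 0.0 up to
+ # 0.1, so deeper blocks are skipped more often during training (illustrative values).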
+ self.layer = nn.ModuleList(
+ [
+ Data2VecVisionLayer(
+ config,
+ window_size=window_size if config.use_relative_position_bias else None,
+ drop_path_rate=dpr[i],
+ )
+ for i in range(config.num_hidden_layers)
+ ]
+ )
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ output_hidden_states: bool = False,
+ interpolate_pos_encoding: bool = False,
+ resolution: Optional[Tuple[int, int]] = None,
+ return_dict: bool = True,
+ ) -> Union[tuple, BaseModelOutput]:
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if self.has_relative_position_bias:
+ height, width = resolution
+ window_size = (height // self.config.patch_size, width // self.config.patch_size)
+ relative_position_bias = self.relative_position_bias(
+ window_size, interpolate_pos_encoding=interpolate_pos_encoding, dim_size=hidden_states.shape[1]
+ )
+ else:
+ relative_position_bias = None
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ layer_module.__call__,
+ hidden_states,
+ layer_head_mask,
+ output_attentions,
+ relative_position_bias,
+ interpolate_pos_encoding,
+ resolution,
+ )
+ else:
+ layer_outputs = layer_module(
+ hidden_states,
+ layer_head_mask,
+ output_attentions,
+ relative_position_bias,
+ interpolate_pos_encoding,
+ resolution,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitPreTrainedModel with Beit->Data2VecVision,beit->data2vec_vision
+class Data2VecVisionPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = Data2VecVisionConfig
+ base_model_prefix = "data2vec_vision"
+ main_input_name = "pixel_values"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["Data2VecVisionLayer"]
+ _keys_to_ignore_on_load_unexpected = [r".*relative_position_index.*"]
+ _supports_sdpa = True
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, Data2VecVisionEmbeddings):
+ module.cls_token.data.zero_()
+ if module.mask_token is not None:
+ module.mask_token.data.zero_()
+ if module.position_embeddings is not None:
+ module.position_embeddings.data.zero_()
+ elif isinstance(module, Data2VecVisionRelativePositionBias):
+ module.relative_position_bias_table.data.zero_()
+ elif isinstance(module, Data2VecVisionLayer):
+ if module.lambda_1 is not None:
+ module.lambda_1.data.fill_(self.config.layer_scale_init_value)
+ module.lambda_2.data.fill_(self.config.layer_scale_init_value)
+
+
+DATA2VEC_VISION_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`Data2VecVisionConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DATA2VEC_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`BeitImageProcessor.__call__`] for details.
+
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Data2VecVision Model transformer outputting raw hidden-states without any specific head on top.",
+ DATA2VEC_VISION_START_DOCSTRING,
+)
+# Copied from transformers.models.beit.modeling_beit.BeitModel with BEIT->DATA2VEC_VISION,Beit->Data2VecVision,True->False
+class Data2VecVisionModel(Data2VecVisionPreTrainedModel):
+ def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = False) -> None:
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = Data2VecVisionEmbeddings(config)
+ self.encoder = Data2VecVisionEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape)
+
+ self.layernorm = (
+ nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ )
+ self.pooler = Data2VecVisionPooler(config) if add_pooling_layer else None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.patch_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=Data2VecVisionModelOutputWithPooling,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(
+ self,
+ pixel_values: torch.Tensor,
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, Data2VecVisionModelOutputWithPooling]:
+ r"""
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+ resolution = pixel_values.shape[2:]
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ resolution=resolution,
+ return_dict=return_dict,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ )
+ sequence_output = encoder_outputs[0]
+ sequence_output = self.layernorm(sequence_output)
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+ if not return_dict:
+ head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
+ return head_outputs + encoder_outputs[1:]
+
+ return Data2VecVisionModelOutputWithPooling(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitPooler with Beit->Data2VecVision
+class Data2VecVisionPooler(nn.Module):
+ def __init__(self, config: Data2VecVisionConfig) -> None:
+ super().__init__()
+ self.layernorm = (
+ nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.use_mean_pooling else None
+ )
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ if self.layernorm is not None:
+ # Mean pool the final hidden states of the patch tokens
+ patch_tokens = hidden_states[:, 1:, :]
+ pooled_output = self.layernorm(patch_tokens.mean(1))
+ else:
+ # Pool by simply taking the final hidden state of the [CLS] token
+ pooled_output = hidden_states[:, 0]
+
+ return pooled_output
+
+
+@add_start_docstrings(
+ """
+ Data2VecVision Model transformer with an image classification head on top (a linear layer on top of the average of
+ the final hidden states of the patch tokens) e.g. for ImageNet.
+ """,
+ DATA2VEC_VISION_START_DOCSTRING,
+)
+# Copied from transformers.models.beit.modeling_beit.BeitForImageClassification with BEIT->DATA2VEC_VISION,Beit->Data2VecVision,beit->data2vec_vision
+class Data2VecVisionForImageClassification(Data2VecVisionPreTrainedModel):
+ def __init__(self, config: Data2VecVisionConfig) -> None:
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.data2vec_vision = Data2VecVisionModel(config, add_pooling_layer=True)
+
+ # Classifier head
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=ImageClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, ImageClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ outputs = self.data2vec_vision(
+ pixel_values,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ return_dict=return_dict,
+ )
+
+ pooled_output = outputs.pooler_output if return_dict else outputs[1]
+
+ logits = self.classifier(pooled_output)
+
+ loss = None
+ if labels is not None:
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return ImageClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitConvModule with Beit->Data2VecVision
+class Data2VecVisionConvModule(nn.Module):
+ """
+ A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
+ layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
+
+ Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ padding: Union[int, Tuple[int, int], str] = 0,
+ bias: bool = False,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ ) -> None:
+ super().__init__()
+ self.conv = nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ padding=padding,
+ bias=bias,
+ dilation=dilation,
+ )
+ self.bn = nn.BatchNorm2d(out_channels)
+ self.activation = nn.ReLU()
+
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ output = self.conv(input)
+ output = self.bn(output)
+ output = self.activation(output)
+
+ return output
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitPyramidPoolingBlock with Beit->Data2VecVision
+class Data2VecVisionPyramidPoolingBlock(nn.Module):
+ def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
+ super().__init__()
+ self.layers = [
+ nn.AdaptiveAvgPool2d(pool_scale),
+ Data2VecVisionConvModule(in_channels, channels, kernel_size=1),
+ ]
+ for i, layer in enumerate(self.layers):
+ self.add_module(str(i), layer)
+
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ hidden_state = input
+ for layer in self.layers:
+ hidden_state = layer(hidden_state)
+ return hidden_state
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitPyramidPoolingModule with Beit->Data2VecVision
+class Data2VecVisionPyramidPoolingModule(nn.Module):
+ """
+ Pyramid Pooling Module (PPM) used in PSPNet.
+
+ Args:
+ pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
+ Module.
+ in_channels (int): Input channels.
+ channels (int): Channels after modules, before conv_seg.
+ align_corners (bool): align_corners argument of F.interpolate.
+
+ Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
+ """
+
+ def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
+ super().__init__()
+ self.pool_scales = pool_scales
+ self.align_corners = align_corners
+ self.in_channels = in_channels
+ self.channels = channels
+ self.blocks = []
+ for i, pool_scale in enumerate(pool_scales):
+ block = Data2VecVisionPyramidPoolingBlock(
+ pool_scale=pool_scale, in_channels=in_channels, channels=channels
+ )
+ self.blocks.append(block)
+ self.add_module(str(i), block)
+
+ def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+ ppm_outs = []
+ for ppm in self.blocks:
+ ppm_out = ppm(x)
+ upsampled_ppm_out = nn.functional.interpolate(
+ ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners
+ )
+ ppm_outs.append(upsampled_ppm_out)
+ return ppm_outs
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitUperHead with Beit->Data2VecVision
+class Data2VecVisionUperHead(nn.Module):
+ """
+ Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
+ [UPerNet](https://arxiv.org/abs/1807.10221).
+
+ Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
+ """
+
+ def __init__(self, config: Data2VecVisionConfig) -> None:
+ super().__init__()
+
+ self.pool_scales = config.pool_scales # e.g. (1, 2, 3, 6)
+ self.in_channels = [config.hidden_size] * 4 # e.g. [768, 768, 768, 768]
+ self.channels = config.hidden_size
+ self.align_corners = False
+ self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)
+
+ # PSP Module
+ self.psp_modules = Data2VecVisionPyramidPoolingModule(
+ self.pool_scales,
+ self.in_channels[-1],
+ self.channels,
+ align_corners=self.align_corners,
+ )
+ self.bottleneck = Data2VecVisionConvModule(
+ self.in_channels[-1] + len(self.pool_scales) * self.channels,
+ self.channels,
+ kernel_size=3,
+ padding=1,
+ )
+ # FPN Module
+ self.lateral_convs = nn.ModuleList()
+ self.fpn_convs = nn.ModuleList()
+ for in_channels in self.in_channels[:-1]: # skip the top layer
+ l_conv = Data2VecVisionConvModule(in_channels, self.channels, kernel_size=1)
+ fpn_conv = Data2VecVisionConvModule(self.channels, self.channels, kernel_size=3, padding=1)
+ self.lateral_convs.append(l_conv)
+ self.fpn_convs.append(fpn_conv)
+
+ self.fpn_bottleneck = Data2VecVisionConvModule(
+ len(self.in_channels) * self.channels,
+ self.channels,
+ kernel_size=3,
+ padding=1,
+ )
+
+ def psp_forward(self, inputs):
+ x = inputs[-1]
+ psp_outs = [x]
+ psp_outs.extend(self.psp_modules(x))
+ psp_outs = torch.cat(psp_outs, dim=1)
+ output = self.bottleneck(psp_outs)
+
+ return output
+
+ def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
+ # build laterals
+ laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]
+
+ laterals.append(self.psp_forward(encoder_hidden_states))
+
+ # build top-down path
+ used_backbone_levels = len(laterals)
+ for i in range(used_backbone_levels - 1, 0, -1):
+ prev_shape = laterals[i - 1].shape[2:]
+ laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
+ laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
+ )
+
+ # build outputs
+ fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
+ # append psp feature
+ fpn_outs.append(laterals[-1])
+
+ for i in range(used_backbone_levels - 1, 0, -1):
+ fpn_outs[i] = nn.functional.interpolate(
+ fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
+ )
+ fpn_outs = torch.cat(fpn_outs, dim=1)
+ output = self.fpn_bottleneck(fpn_outs)
+ output = self.classifier(output)
+
+ return output
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitFCNHead with Beit->Data2VecVision
+class Data2VecVisionFCNHead(nn.Module):
+ """
+ Fully Convolution Networks for Semantic Segmentation. This head is the implementation of
+ [FCNNet](https://arxiv.org/abs/1411.4038).
+
+ Args:
+ config (Data2VecVisionConfig): Configuration.
+ in_index (int): Index of the encoder hidden state to use as input. Default: 2.
+ kernel_size (int): The kernel size for convs in the head. Default: 3.
+ dilation (int): The dilation rate for convs in the head. Default: 1.
+
+
+ Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
+ """
+
+ def __init__(
+ self,
+ config: Data2VecVisionConfig,
+ in_index: int = 2,
+ kernel_size: int = 3,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ ) -> None:
+ super().__init__()
+ self.in_channels = config.hidden_size
+ self.channels = config.auxiliary_channels
+ self.num_convs = config.auxiliary_num_convs
+ self.concat_input = config.auxiliary_concat_input
+ self.in_index = in_index
+
+ conv_padding = (kernel_size // 2) * dilation
+ convs = []
+ convs.append(
+ Data2VecVisionConvModule(
+ self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
+ )
+ )
+ for i in range(self.num_convs - 1):
+ convs.append(
+ Data2VecVisionConvModule(
+ self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
+ )
+ )
+ if self.num_convs == 0:
+ self.convs = nn.Identity()
+ else:
+ self.convs = nn.Sequential(*convs)
+ if self.concat_input:
+ self.conv_cat = Data2VecVisionConvModule(
+ self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
+ )
+
+ self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)
+
+ def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
+ # just take the relevant feature maps
+ hidden_states = encoder_hidden_states[self.in_index]
+ output = self.convs(hidden_states)
+ if self.concat_input:
+ output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
+ output = self.classifier(output)
+ return output
+
+
+@add_start_docstrings(
+ """
+ Data2VecVision Model transformer with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
+ """,
+ DATA2VEC_VISION_START_DOCSTRING,
+)
+# Copied from transformers.models.beit.modeling_beit.BeitForSemanticSegmentation with BEIT->DATA2VEC_VISION,Beit->Data2VecVision,microsoft/beit-base-finetuned-ade-640-640->facebook/data2vec-vision-base,beit->data2vec_vision
+class Data2VecVisionForSemanticSegmentation(Data2VecVisionPreTrainedModel):
+ def __init__(self, config: Data2VecVisionConfig) -> None:
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.data2vec_vision = Data2VecVisionModel(config, add_pooling_layer=False)
+
+ # FPNs
+ if len(self.config.out_indices) != 4:
+ raise ValueError(
+ "Data2VecVisionForSemanticSegmentation requires config.out_indices to be a list of 4 integers, "
+ "specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of "
+ "a base-sized architecture."
+ )
+ self.fpn1 = nn.Sequential(
+ nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
+ nn.BatchNorm2d(config.hidden_size),
+ nn.GELU(),
+ nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
+ )
+ self.fpn2 = nn.Sequential(
+ nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
+ )
+ self.fpn3 = nn.Identity()
+ self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
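+ # Together these branches turn the single-stride ViT feature map (stride 16 for 16x16 patches) into a
+ # four-level pyramid at strides 4, 8, 16 and 32, matching what the UPerNet decode head expects.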
+
+ # Semantic segmentation head(s)
+ self.decode_head = Data2VecVisionUperHead(config)
+ self.auxiliary_head = Data2VecVisionFCNHead(config) if config.use_auxiliary_head else None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def compute_loss(self, logits, auxiliary_logits, labels):
+ # upsample logits to the images' original size
+ upsampled_logits = nn.functional.interpolate(
+ logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+ )
+ if auxiliary_logits is not None:
+ upsampled_auxiliary_logits = nn.functional.interpolate(
+ auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+ )
+ # compute weighted loss
+ loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
+ main_loss = loss_fct(upsampled_logits, labels)
+ loss = main_loss
+ if auxiliary_logits is not None:
+ auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
+ loss += self.config.auxiliary_loss_weight * auxiliary_loss
+
+ return loss
+
+ @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, SemanticSegmenterOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+ Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, Data2VecVisionForSemanticSegmentation
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
+ >>> model = Data2VecVisionForSemanticSegmentation.from_pretrained("facebook/data2vec-vision-base")
+
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> # logits are of shape (batch_size, num_labels, height, width)
+ >>> logits = outputs.logits
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+
+ if labels is not None and self.config.num_labels == 1:
+ raise ValueError("The number of labels should be greater than one")
+
+ outputs = self.data2vec_vision(
+ pixel_values,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=True, # we need the intermediate hidden states
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ return_dict=return_dict,
+ )
+
+ encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]
+
+ # only keep certain features, and reshape
+ # note that we do +1 as the encoder_hidden_states also includes the initial embeddings
+ features = [feature for idx, feature in enumerate(encoder_hidden_states) if idx + 1 in self.config.out_indices]
+ batch_size = pixel_values.shape[0]
+ patch_resolution = self.config.image_size // self.config.patch_size
+ features = [
+ x[:, 1:, :].permute(0, 2, 1).reshape(batch_size, -1, patch_resolution, patch_resolution) for x in features
+ ]
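+ # e.g. for a 224x224 input with 16x16 patches, each selected hidden state goes from (batch, 197, hidden)
+ # to a (batch, hidden, 14, 14) feature map after the [CLS] token is dropped (illustrative default sizes).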
+
+ # apply FPNs
+ ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
+ for i in range(len(features)):
+ features[i] = ops[i](features[i])
+
+ logits = self.decode_head(features)
+
+ auxiliary_logits = None
+ if self.auxiliary_head is not None:
+ auxiliary_logits = self.auxiliary_head(features)
+
+ loss = None
+ if labels is not None:
+ loss = self.compute_loss(logits, auxiliary_logits, labels)
+
+ if not return_dict:
+ if output_hidden_states:
+ output = (logits,) + outputs[1:]
+ else:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SemanticSegmenterOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states if output_hidden_states else None,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = [
+ "Data2VecVisionForImageClassification",
+ "Data2VecVisionForSemanticSegmentation",
+ "Data2VecVisionModel",
+ "Data2VecVisionPreTrainedModel",
+]
diff --git a/docs/transformers/build/lib/transformers/models/data2vec/modeling_tf_data2vec_vision.py b/docs/transformers/build/lib/transformers/models/data2vec/modeling_tf_data2vec_vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..813fad89dc5412c46bcdf21d11391670a3acbe45
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/data2vec/modeling_tf_data2vec_vision.py
@@ -0,0 +1,1724 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 Data2Vec Vision model."""
+
+from __future__ import annotations
+
+import collections.abc
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+ TFBaseModelOutput,
+ TFBaseModelOutputWithPooling,
+ TFSemanticSegmenterOutput,
+ TFSequenceClassifierOutput,
+)
+from ...modeling_tf_utils import (
+ TFModelInputType,
+ TFPreTrainedModel,
+ TFSequenceClassificationLoss,
+ get_initializer,
+ keras,
+ keras_serializable,
+ unpack_inputs,
+)
+from ...tf_utils import shape_list, stable_softmax
+from ...utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_data2vec_vision import Data2VecVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "Data2VecVisionConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "facebook/data2vec-vision-base"
+_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "facebook/data2vec-vision-base-ft1k"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "remote control, remote"
+
+
+@dataclass
+class TFData2VecVisionModelOutputWithPooling(TFBaseModelOutputWithPooling):
+ """
+ Class for outputs of [`TFData2VecVisionModel`].
+
+ Args:
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
+ Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
+ *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
+ will be returned.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+ `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ last_hidden_state: Optional[tf.Tensor] = None
+ pooler_output: Optional[tf.Tensor] = None
+ hidden_states: Tuple[tf.Tensor] | None = None
+ attentions: Tuple[tf.Tensor] | None = None
+
+
+class TFData2VecVisionDropPath(keras.layers.Layer):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ References:
+ (1) https://github.com/rwightman/pytorch-image-models
+ """
+
+ def __init__(self, drop_path, **kwargs):
+ super().__init__(**kwargs)
+ self.drop_path = drop_path
+
+ def call(self, x, training=None):
+ if training:
+ keep_prob = 1 - self.drop_path
+ shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
+ random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
+ random_tensor = tf.floor(random_tensor)
+ return (x / keep_prob) * random_tensor
+ return x
+
+
+class TFData2VecVisionEmbeddings(keras.layers.Layer):
+ """
+ Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
+
+ """
+
+ def __init__(self, config: Data2VecVisionConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+
+ self.patch_embeddings = TFData2VecVisionPatchEmbeddings(config, name="patch_embeddings")
+ self.num_patches = self.patch_embeddings.num_patches
+ self.config = config
+
+ self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
+
+ def build(self, input_shape=None):
+ self.cls_token = self.add_weight(
+ shape=(1, 1, self.config.hidden_size),
+ initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
+ trainable=True,
+ name="cls_token",
+ )
+ if self.config.use_mask_token:
+ self.mask_token = self.add_weight(
+ shape=(1, 1, self.config.hidden_size),
+ initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
+ trainable=True,
+ name="mask_token",
+ )
+ else:
+ self.mask_token = None
+
+ if self.config.use_absolute_position_embeddings:
+ self.position_embeddings = self.add_weight(
+ shape=(1, self.num_patches + 1, self.config.hidden_size),
+ initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
+ trainable=True,
+ name="position_embeddings",
+ )
+ else:
+ self.position_embeddings = None
+
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "patch_embeddings", None) is not None:
+ with tf.name_scope(self.patch_embeddings.name):
+ self.patch_embeddings.build(None)
+
+ def call(self, pixel_values: tf.Tensor, bool_masked_pos: tf.Tensor | None = None) -> tf.Tensor:
+ embeddings = self.patch_embeddings(pixel_values)
+ batch_size, seq_len, projection_dim = shape_list(embeddings)
+
+ cls_tokens = tf.tile(self.cls_token, (batch_size, 1, 1))
+
+ if bool_masked_pos is not None:
+ mask_tokens = tf.broadcast_to(self.mask_token, (batch_size, seq_len, projection_dim))
+ # replace the masked visual tokens by mask_tokens
+ w = bool_masked_pos[..., None]
+ w = tf.cast(w, mask_tokens.dtype)
+ # since TF doesn't support eager tensor assignment
+ embeddings = embeddings * (1 - w) + mask_tokens * w
+
+ embeddings = tf.concat([cls_tokens, embeddings], axis=1)
+ if self.position_embeddings is not None:
+ embeddings = embeddings + self.position_embeddings
+ embeddings = self.dropout(embeddings)
+
+ return embeddings
+
+
+class TFData2VecVisionPatchEmbeddings(keras.layers.Layer):
+ """
+ Image to Patch Embedding.
+ """
+
+ def __init__(self, config: Data2VecVisionConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+
+ image_size, patch_size = config.image_size, config.patch_size
+ num_channels, hidden_size = config.num_channels, config.hidden_size
+
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+ patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_patches = num_patches
+ self.patch_shape = patch_shape
+ self.num_channels = num_channels
+
+ self.projection = keras.layers.Conv2D(
+ filters=hidden_size,
+ kernel_size=patch_size,
+ strides=patch_size,
+ padding="valid",
+ data_format="channels_last",
+ kernel_initializer="glorot_uniform", # following torch.nn.Linear
+ bias_initializer="zeros",
+ name="projection",
+ )
+
+ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
+ batch_size, num_channels, height, width = shape_list(pixel_values)
+ if tf.executing_eagerly():
+ if num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values match with the one set in the"
+ " configuration."
+ )
+ if height != self.image_size[0] or width != self.image_size[1]:
+ raise ValueError(
+ f"Input image size ({height}*{width}) doesn't match model"
+ f" ({self.image_size[0]}*{self.image_size[1]})."
+ )
+
+ # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
+ # So change the input format from `NCHW` to `NHWC`.
+ # shape = (batch_size, in_height, in_width, in_channels=num_channels)
+ pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+
+ projection = self.projection(pixel_values)
+
+ # Change the 2D spatial dimensions to a single temporal dimension.
+ # shape = (batch_size, num_patches, out_channels=embed_dim)
+ num_patches = (width // self.patch_size[1]) * (height // self.patch_size[0])
+
+ return tf.reshape(tensor=projection, shape=(batch_size, num_patches, -1))
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "projection", None) is not None:
+ with tf.name_scope(self.projection.name):
+ self.projection.build([None, None, None, self.num_channels])
+
+
+class TFData2VecVisionSelfAttention(keras.layers.Layer):
+ def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
+ super().__init__(**kwargs)
+
+ if config.hidden_size % config.num_attention_heads != 0:
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+ f"of attention heads ({config.num_attention_heads})"
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+ self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
+
+ self.query = keras.layers.Dense(
+ units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+ )
+ self.key = keras.layers.Dense(
+ units=self.all_head_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="key",
+ use_bias=False,
+ )
+ self.value = keras.layers.Dense(
+ units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+ )
+ self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
+
+ if window_size:
+ self.relative_position_bias = TFData2VecVisionRelativePositionBias(
+ config, window_size=window_size, name="relative_position_bias"
+ )
+ else:
+ self.relative_position_bias = None
+ self.config = config
+
+ def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
+ # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
+ tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
+
+ # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
+ return tf.transpose(tensor, perm=[0, 2, 1, 3])
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ head_mask: tf.Tensor,
+ output_attentions: bool,
+ relative_position_bias: Optional["TFData2VecVisionRelativePositionBias"] = None,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ batch_size = shape_list(hidden_states)[0]
+ mixed_query_layer = self.query(inputs=hidden_states)
+ mixed_key_layer = self.key(inputs=hidden_states)
+ mixed_value_layer = self.value(inputs=hidden_states)
+ query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
+ key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
+ value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ # (batch size, num_heads, seq_len_q, seq_len_k)
+ attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+ attention_scores = attention_scores / self.sqrt_att_head_size
+
+ # Add relative position bias if present.
+ if self.relative_position_bias is not None:
+ # Passing `0.0` to the `relative_position_bias()` layer because otherwise Keras
+ # might complain about `Layer.call()` not being invoked properly. In this case this input
+ # i.e., 0.0 is not going to be used in any calculations so we're safe.
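+            # The bias returned here has shape (num_heads, seq_len, seq_len); `[None, ...]` adds a leading
+            # batch axis so it broadcasts over the batch dimension of `attention_scores`.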
+ attention_scores = attention_scores + self.relative_position_bias(0.0)[None, ...]
+
+ # Add shared relative position bias if provided.
+ if relative_position_bias is not None:
+ attention_scores = attention_scores + relative_position_bias
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = stable_softmax(logits=attention_scores, axis=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(inputs=attention_probs, training=training)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = tf.multiply(attention_probs, head_mask)
+
+ attention_output = tf.matmul(attention_probs, value_layer)
+ attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
+
+ # (batch_size, seq_len_q, all_head_size)
+ attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
+ outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
+
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "query", None) is not None:
+ with tf.name_scope(self.query.name):
+ self.query.build([None, None, self.config.hidden_size])
+ if getattr(self, "key", None) is not None:
+ with tf.name_scope(self.key.name):
+ self.key.build([None, None, self.config.hidden_size])
+ if getattr(self, "value", None) is not None:
+ with tf.name_scope(self.value.name):
+ self.value.build([None, None, self.config.hidden_size])
+ if getattr(self, "relative_position_bias", None) is not None:
+ with tf.name_scope(self.relative_position_bias.name):
+ self.relative_position_bias.build(None)
+
+
+class TFData2VecVisionSelfOutput(keras.layers.Layer):
+ """
+ The residual connection is defined in TFData2VecVisionLayer instead of here (as is the case with other models), due
+ to the layernorm applied before each block.
+ """
+
+ def __init__(self, config: Data2VecVisionConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = keras.layers.Dense(
+ units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+ self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, gamma=None, training: bool = False) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.dropout(inputs=hidden_states, training=training)
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.hidden_size])
+
+
+class TFData2VecVisionAttention(keras.layers.Layer):
+ def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
+ super().__init__(**kwargs)
+
+ self.attention = TFData2VecVisionSelfAttention(config, window_size=window_size, name="attention")
+ self.dense_output = TFData2VecVisionSelfOutput(config, name="output")
+
+ def prune_heads(self, heads):
+ raise NotImplementedError
+
+ def call(
+ self,
+ input_tensor: tf.Tensor,
+ head_mask: tf.Tensor,
+ output_attentions: bool,
+ relative_position_bias: Optional["TFData2VecVisionRelativePositionBias"] = None,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ self_outputs = self.attention(
+ hidden_states=input_tensor,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ relative_position_bias=relative_position_bias,
+ training=training,
+ )
+ attention_output = self.dense_output(
+ hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
+ )
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "attention", None) is not None:
+ with tf.name_scope(self.attention.name):
+ self.attention.build(None)
+ if getattr(self, "dense_output", None) is not None:
+ with tf.name_scope(self.dense_output.name):
+ self.dense_output.build(None)
+
+
+# Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->Data2VecVision
+class TFData2VecVisionIntermediate(keras.layers.Layer):
+ def __init__(self, config: Data2VecVisionConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = keras.layers.Dense(
+ units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+ else:
+ self.intermediate_act_fn = config.hidden_act
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.hidden_size])
+
+
+class TFData2VecVisionOutput(keras.layers.Layer):
+ def __init__(self, config: Data2VecVisionConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = keras.layers.Dense(
+ units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+ self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.dropout(inputs=hidden_states, training=training)
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.intermediate_size])
+
+
+class TFData2VecVisionLayer(keras.layers.Layer):
+ """This corresponds to the Block class in the timm implementation."""
+
+ def __init__(
+ self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0, **kwargs
+ ):
+ super().__init__(**kwargs)
+ self.config = config
+
+ self.attention = TFData2VecVisionAttention(config, window_size=window_size, name="attention")
+ self.intermediate = TFData2VecVisionIntermediate(config, name="intermediate")
+ self.data2vec_output = TFData2VecVisionOutput(config, name="output")
+
+ self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before")
+ self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after")
+ # Using `layers.Activation` instead of `tf.identity` to better control `training`
+ # behaviour.
+ self.drop_path = (
+ TFData2VecVisionDropPath(drop_path_rate, name="drop_path")
+ if drop_path_rate > 0.0
+ else keras.layers.Activation("linear", name="drop_path")
+ )
+ self.init_values = config.layer_scale_init_value
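+        # A `layer_scale_init_value` > 0 enables LayerScale: lambda_1/lambda_2 (created in `build`) are learnable
+        # per-channel scales applied to the attention and MLP branches before their residual additions.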
+
+ def build(self, input_shape: tf.TensorShape = None):
+ if self.init_values > 0:
+ self.lambda_1 = self.add_weight(
+ shape=(self.config.hidden_size),
+ initializer="ones",
+ trainable=True,
+ name="lambda_1",
+ )
+ self.lambda_2 = self.add_weight(
+ shape=(self.config.hidden_size),
+ initializer="ones",
+ trainable=True,
+ name="lambda_2",
+ )
+ self.lambda_1.assign(self.init_values * tf.ones((self.config.hidden_size)))
+ self.lambda_2.assign(self.init_values * tf.ones((self.config.hidden_size)))
+ else:
+ self.lambda_1, self.lambda_2 = None, None
+
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "attention", None) is not None:
+ with tf.name_scope(self.attention.name):
+ self.attention.build(None)
+ if getattr(self, "intermediate", None) is not None:
+ with tf.name_scope(self.intermediate.name):
+ self.intermediate.build(None)
+ if getattr(self, "data2vec_output", None) is not None:
+ with tf.name_scope(self.data2vec_output.name):
+ self.data2vec_output.build(None)
+ if getattr(self, "layernorm_before", None) is not None:
+ with tf.name_scope(self.layernorm_before.name):
+ self.layernorm_before.build([None, None, self.config.hidden_size])
+ if getattr(self, "layernorm_after", None) is not None:
+ with tf.name_scope(self.layernorm_after.name):
+ self.layernorm_after.build([None, None, self.config.hidden_size])
+ if getattr(self, "drop_path", None) is not None:
+ with tf.name_scope(self.drop_path.name):
+ self.drop_path.build(None)
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ head_mask: tf.Tensor,
+ output_attentions: bool,
+ relative_position_bias: Optional["TFData2VecVisionRelativePositionBias"] = None,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ self_attention_outputs = self.attention(
+ # in Data2VecVision, layernorm is applied before self-attention
+ input_tensor=self.layernorm_before(inputs=hidden_states),
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ relative_position_bias=relative_position_bias,
+ training=training,
+ )
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ # apply lambda_1 if present
+ if self.lambda_1 is not None:
+ attention_output = self.lambda_1 * attention_output
+
+ # first residual connection
+ hidden_states = self.drop_path(attention_output) + hidden_states
+
+ # in Data2VecVision, layernorm is also applied after self-attention
+ layer_output = self.layernorm_after(hidden_states)
+
+ layer_output = self.intermediate(layer_output)
+ layer_output = self.data2vec_output(layer_output)
+
+ if self.lambda_2 is not None:
+ layer_output = self.lambda_2 * layer_output
+
+ # second residual connection
+ layer_output = self.drop_path(layer_output) + hidden_states
+
+ outputs = (layer_output,) + outputs
+
+ return outputs
+
+
+# Taken and modified from here:
+# https://github.com/leondgarse/keras_cv_attention_models/blob/main/keras_cv_attention_models/beit/beit.py#L28
+class TFData2VecVisionRelativePositionBias(keras.layers.Layer):
+ def __init__(self, config: Data2VecVisionConfig, window_size: tuple, **kwargs) -> None:
+ super().__init__(**kwargs)
+ self.config = config
+
+ self.window_size = window_size
+        # window_size can be something like (14, 14)
+        # (2 * Wh - 1) * (2 * Ww - 1) pairwise relative distances inside the window,
+        # plus 3 extra entries for cls-to-token, token-to-cls and cls-to-cls
+ self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
+
+ self.relative_position_index = self.get_position_index()
+
+ def build(self, input_shape):
+ self.relative_position_bias_table = self.add_weight(
+ shape=(self.num_relative_distance, self.config.num_attention_heads),
+ initializer="zeros",
+ trainable=True,
+ name="relative_position_bias_table",
+ ) # [2*Wh-1 * 2*Ww-1, nH]
+        # cls to token & token to cls & cls to cls
+
+ super().build(input_shape)
+
+ def get_position_index(self):
+ # get pair-wise relative position index for each token inside the window
+ xx, yy = tf.meshgrid(range(self.window_size[0]), range(self.window_size[1]))
+ coords = tf.stack([yy, xx], axis=0) # [2, Wh, Ww]
+ coords_flatten = tf.reshape(coords, [2, -1]) # [2, Wh*Ww]
+
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # [2, Wh*Ww, Wh*Ww]
+ relative_coords = tf.transpose(relative_coords, perm=[1, 2, 0]) # [Wh*Ww, Wh*Ww, 2]
+
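+        # shift both offsets to be non-negative and scale the first one so that summing the pair below
+        # produces a unique flat index for every relative position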
+ xx = (relative_coords[:, :, 0] + self.window_size[0] - 1) * (2 * self.window_size[1] - 1)
+ yy = relative_coords[:, :, 1] + self.window_size[1] - 1
+ relative_coords = tf.stack([xx, yy], axis=-1)
+
+ relative_position_index = tf.reduce_sum(relative_coords, axis=-1) # [Wh*Ww, Wh*Ww]
+
+ top = tf.ones((1, relative_position_index.shape[1]), dtype=relative_position_index.dtype) * (
+ self.num_relative_distance - 3
+ )
+ left = tf.ones((relative_position_index.shape[0], 1), dtype=relative_position_index.dtype) * (
+ self.num_relative_distance - 2
+ )
+ corner = tf.ones((1, 1), dtype=relative_position_index.dtype) * (self.num_relative_distance - 1)
+
+ left_corner = tf.concat([corner, left], axis=0)
+ relative_position_index = tf.concat([top, relative_position_index], axis=0)
+ relative_position_index = tf.concat([left_corner, relative_position_index], axis=1) # [Wh*Ww + 1, Wh*Ww + 1]
+ return relative_position_index
+
+ def call(self, inputs=None) -> tf.Tensor:
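+        # look up the bias for every (query, key) pair and put heads first:
+        # (Wh*Ww + 1, Wh*Ww + 1, num_heads) -> (num_heads, Wh*Ww + 1, Wh*Ww + 1)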
+ relative_position_bias = tf.gather(self.relative_position_bias_table, self.relative_position_index, axis=0)
+ return tf.transpose(relative_position_bias, [2, 0, 1])
+
+
+class TFData2VecVisionEncoder(keras.layers.Layer):
+ def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ if config.use_shared_relative_position_bias:
+ self.relative_position_bias = TFData2VecVisionRelativePositionBias(
+ config, window_size=window_size, name="relative_position_bias"
+ )
+ else:
+ self.relative_position_bias = None
+
+ # stochastic depth decay rule
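+        # drop-path probability grows linearly from 0.0 in the first layer to `drop_path_rate` in the last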
+ dpr = list(tf.linspace(0.0, config.drop_path_rate, config.num_hidden_layers))
+ self.layer = [
+ TFData2VecVisionLayer(
+ config,
+ window_size=window_size if config.use_relative_position_bias else None,
+ drop_path_rate=dpr[i],
+ name=f"layer_._{i}",
+ )
+ for i in range(config.num_hidden_layers)
+ ]
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ head_mask: tf.Tensor | None = None,
+ output_attentions: bool = False,
+ output_hidden_states: bool = False,
+ return_dict: bool = True,
+ ) -> Union[tuple, TFBaseModelOutput]:
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+ # Passing `0.0` to the `relative_position_bias()` layer because otherwise Keras
+ # might complain about `Layer.call()` not being invoked properly. In this case this input
+ # i.e., 0.0 is not going to be used in any calculations so we're safe.
+ relative_position_bias = (
+ self.relative_position_bias(0.0) if self.relative_position_bias is not None else None
+ )
+ layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, relative_position_bias)
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+
+ return TFBaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "relative_position_bias", None) is not None:
+ with tf.name_scope(self.relative_position_bias.name):
+ self.relative_position_bias.build(None)
+ if getattr(self, "layer", None) is not None:
+ for layer in self.layer:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+
+@keras_serializable
+class TFData2VecVisionMainLayer(keras.layers.Layer):
+ config_class = Data2VecVisionConfig
+
+ def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = True, **kwargs):
+ super().__init__(**kwargs)
+
+ self.config = config
+ self.add_pooling_layer = add_pooling_layer
+
+ self.embeddings = TFData2VecVisionEmbeddings(config, name="embeddings")
+ self.encoder = TFData2VecVisionEncoder(
+ config, window_size=self.embeddings.patch_embeddings.patch_shape, name="encoder"
+ )
+ self.layernorm = (
+ tf.identity
+ if config.use_mean_pooling
+ else keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+ )
+
+ # We are setting the `data_format` like so because from here on we will revert to the
+ # NCHW output format
+ self.pooler = TFData2VecVisionPooler(config, name="pooler") if add_pooling_layer else None
+
+ def get_input_embeddings(self) -> keras.layers.Layer:
+ return self.embeddings.patch_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ raise NotImplementedError
+
+ @unpack_inputs
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ bool_masked_pos: tf.Tensor | None = None,
+ head_mask: tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[tuple, TFData2VecVisionModelOutputWithPooling]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ if head_mask is not None:
+ raise NotImplementedError
+ else:
+ head_mask = [None] * self.config.num_hidden_layers
+
+ embedding_output = self.embeddings(pixel_values, bool_masked_pos, training=training)
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ sequence_output = encoder_outputs[0]
+ sequence_output = self.layernorm(sequence_output)
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+ if not return_dict:
+ head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
+ return head_outputs + encoder_outputs[1:]
+
+ return TFData2VecVisionModelOutputWithPooling(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "embeddings", None) is not None:
+ with tf.name_scope(self.embeddings.name):
+ self.embeddings.build(None)
+ if getattr(self, "encoder", None) is not None:
+ with tf.name_scope(self.encoder.name):
+ self.encoder.build(None)
+ if getattr(self, "layernorm", None) is not None:
+ if hasattr(self.layernorm, "name"):
+ with tf.name_scope(self.layernorm.name):
+ self.layernorm.build((None, self.config.hidden_size))
+ if getattr(self, "pooler", None) is not None:
+ with tf.name_scope(self.pooler.name):
+ self.pooler.build(None)
+
+
+class TFData2VecVisionPooler(keras.layers.Layer):
+ def __init__(self, config: Data2VecVisionConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.layernorm = (
+ keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+ if config.use_mean_pooling
+ else None
+ )
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+ if self.layernorm is not None:
+ # Mean pool the final hidden states of the patch tokens
+ patch_tokens = hidden_states[:, 1:, :]
+ pooled_output = self.layernorm(tf.reduce_mean(patch_tokens, axis=1))
+ else:
+ # Pool by simply taking the final hidden state of the [CLS] token
+ pooled_output = hidden_states[:, 0]
+
+ return pooled_output
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "layernorm", None) is not None:
+ if hasattr(self.layernorm, "name"):
+ with tf.name_scope(self.layernorm.name):
+ self.layernorm.build((None, self.config.hidden_size))
+
+
+class TFData2VecVisionPreTrainedModel(TFPreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = Data2VecVisionConfig
+ base_model_prefix = "data2vec_vision"
+ main_input_name = "pixel_values"
+ _keys_to_ignore_on_load_unexpected = [r"relative_position_index"]
+
+
+DATA2VEC_VISION_START_DOCSTRING = r"""
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.).
+
+ This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+ as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+ behavior.
+
+
+
+ TensorFlow models and layers in `transformers` accept two formats as input:
+
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional argument.
+
+ The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+ and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+ pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+ format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+ the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+ positional argument:
+
+ - a single Tensor with `pixel_values` only and nothing else: `model(pixel_values)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([pixel_values, attention_mask])` or `model([pixel_values, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})`
+
+ Note that when creating models and layers with
+ [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+ about any of this, as you can just pass inputs like you would to any other Python function!
+
+
+
+ Args:
+ config ([`Data2VecVisionConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DATA2VEC_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]`; each example must have the shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`BeitImageProcessor.__call__`] for details.
+
+ head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used
+            in eager mode; in graph mode the value will always be set to `True`.
+
+        training (`bool`, *optional*, defaults to `False`):
+ Whether or not to use the model in training mode (some modules like dropout modules have different
+ behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+ "The bare Data2VecVision Model transformer outputting raw hidden-states without any specific head on top.",
+ DATA2VEC_VISION_START_DOCSTRING,
+)
+class TFData2VecVisionModel(TFData2VecVisionPreTrainedModel):
+ def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = False, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+ self.config = config
+
+ self.data2vec_vision = TFData2VecVisionMainLayer(
+ config, add_pooling_layer=add_pooling_layer, name="data2vec_vision"
+ )
+
+ def get_input_embeddings(self):
+ return self.data2vec_vision.get_input_embeddings()
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFData2VecVisionModelOutputWithPooling,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def call(
+ self,
+ pixel_values: TFModelInputType | None = None,
+ bool_masked_pos: tf.Tensor | None = None,
+ head_mask: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[tuple, TFData2VecVisionModelOutputWithPooling]:
+ r"""
+ bool_masked_pos (`tf.Tensor` of shape `(batch_size, num_patches)`, *optional*):
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+ """
+ outputs = self.data2vec_vision(
+ pixel_values=pixel_values,
+ bool_masked_pos=bool_masked_pos,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "data2vec_vision", None) is not None:
+ with tf.name_scope(self.data2vec_vision.name):
+ self.data2vec_vision.build(None)
+
+
+@add_start_docstrings(
+ """
+ Data2VecVision Model transformer with an image classification head on top (a linear layer on top of the average of
+ the final hidden states of the patch tokens) e.g. for ImageNet.
+ """,
+ DATA2VEC_VISION_START_DOCSTRING,
+)
+class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TFSequenceClassificationLoss):
+ def __init__(self, config: Data2VecVisionConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.num_labels = config.num_labels
+ self.data2vec_vision = TFData2VecVisionMainLayer(config, add_pooling_layer=True, name="data2vec_vision")
+
+ # Classifier head
+ self.classifier = keras.layers.Dense(
+ units=config.num_labels,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="classifier",
+ )
+ self.config = config
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=TFSequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def call(
+ self,
+ pixel_values: TFModelInputType | None = None,
+ head_mask: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFSequenceClassifierOutput, tuple]:
+ r"""
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.data2vec_vision(
+ pixel_values=pixel_values,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ pooled_output = outputs.pooler_output if return_dict else outputs[1]
+ logits = self.classifier(pooled_output)
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFSequenceClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "data2vec_vision", None) is not None:
+ with tf.name_scope(self.data2vec_vision.name):
+ self.data2vec_vision.build(None)
+ if getattr(self, "classifier", None) is not None:
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, self.config.hidden_size])
+
+
+class TFData2VecVisionConvModule(keras.layers.Layer):
+ """
+ A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
+ layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
+
+ Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ padding: str = "valid",
+ bias: bool = False,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+ self.conv = keras.layers.Conv2D(
+ filters=out_channels,
+ kernel_size=kernel_size,
+ padding=padding,
+ use_bias=bias,
+ dilation_rate=dilation,
+ name="conv",
+ )
+ self.bn = keras.layers.BatchNormalization(name="bn", momentum=0.9, epsilon=1e-5)
+ self.activation = tf.nn.relu
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+
+ def call(self, input: tf.Tensor) -> tf.Tensor:
+ output = self.conv(input)
+ output = self.bn(output)
+ output = self.activation(output)
+ return output
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "conv", None) is not None:
+ with tf.name_scope(self.conv.name):
+ self.conv.build([None, None, None, self.in_channels])
+ if getattr(self, "bn", None) is not None:
+ with tf.name_scope(self.bn.name):
+ self.bn.build((None, None, None, self.out_channels))
+
+
+class TFAdaptiveAvgPool2D(keras.layers.Layer):
+ def __init__(self, output_dims: Tuple[int, int], input_ordering: str = "NHWC", **kwargs):
+ super().__init__(**kwargs)
+ self.output_dims = output_dims
+ self.input_ordering = input_ordering
+ if input_ordering not in ("NCHW", "NHWC"):
+ raise ValueError("Unrecognized input_ordering, should be 'NCHW' or 'NHWC'!")
+ self.h_axis = input_ordering.index("H")
+ self.w_axis = input_ordering.index("W")
+
+ def pseudo_1d_pool(self, inputs: tf.Tensor, h_pooling: bool):
+ # Figure out which axis we're pooling on
+ if h_pooling:
+ axis = self.h_axis
+ output_dim = self.output_dims[0]
+ else:
+ axis = self.w_axis
+ output_dim = self.output_dims[1]
+ input_dim = inputs.shape[axis]
+
+ # Figure out the potential pooling windows
+ # This is the key idea - the torch op always uses only two
+ # consecutive pooling window sizes, like 3 and 4. Therefore,
+ # if we pool with both possible sizes, we simply need to gather
+ # the 'correct' pool at each position to reimplement the torch op.
+ small_window = math.ceil(input_dim / output_dim)
+ big_window = small_window + 1
+ if h_pooling:
+ output_dim = self.output_dims[0]
+ small_window_shape = (small_window, 1)
+ big_window_shape = (big_window, 1)
+ else:
+ output_dim = self.output_dims[1]
+ small_window_shape = (1, small_window)
+ big_window_shape = (1, big_window)
+
+ # For resizes to 1, or integer resizes, we can take quick shortcuts
+ if output_dim == input_dim:
+ return inputs
+ elif output_dim == 1:
+ return tf.reduce_mean(inputs, axis=axis, keepdims=True)
+ elif input_dim % output_dim == 0:
+ return tf.nn.avg_pool2d(
+ inputs,
+ ksize=small_window_shape,
+ strides=small_window_shape,
+ padding="VALID",
+ data_format=self.input_ordering,
+ )
+ # When upscaling by an integer factor we can also take a quick shortcut
+ elif output_dim > input_dim and output_dim % input_dim == 0:
+ return tf.repeat(inputs, repeats=output_dim // input_dim, axis=axis)
+
+ # For non-integer resizes, we pool with both possible window sizes and concatenate them
+ if output_dim < input_dim:
+ small_pool = tf.nn.avg_pool2d(
+ inputs, ksize=small_window_shape, strides=1, padding="VALID", data_format=self.input_ordering
+ )
+ big_pool = tf.nn.avg_pool2d(
+ inputs, ksize=big_window_shape, strides=1, padding="VALID", data_format=self.input_ordering
+ )
+ both_pool = tf.concat([small_pool, big_pool], axis=axis)
+ else:
+ # When we're actually upscaling instead, then we build the pools a bit differently
+ small_pool = inputs
+ big_pool = tf.nn.avg_pool2d(
+ inputs, ksize=big_window_shape, strides=1, padding="VALID", data_format=self.input_ordering
+ )
+ both_pool = tf.concat([small_pool, big_pool], axis=axis)
+
+ # We compute vectors of the start and end positions for each pooling window
+ # Each (start, end) pair here corresponds to a single output position
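+        # start_i = floor(i * input_dim / output_dim), end_i = ceil((i + 1) * input_dim / output_dim),
+        # matching the windows used by torch.nn.AdaptiveAvgPool2d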
+ window_starts = tf.math.floor((tf.range(output_dim, dtype=tf.float32) * input_dim) / output_dim)
+ window_starts = tf.cast(window_starts, tf.int64)
+ window_ends = tf.math.ceil((tf.range(1, output_dim + 1, dtype=tf.float32) * input_dim) / output_dim)
+ window_ends = tf.cast(window_ends, tf.int64)
+
+ # pool_selector is a boolean array of shape (output_dim,) where 1 indicates that output position
+        # has a big receptive field and 0 indicates that the output position has a small receptive field
+ pool_selector = tf.cast(window_ends - window_starts - small_window, tf.bool)
+
+ # Since we concatenated the small and big pools, we need to do a bit of
+ # pointer arithmetic to get the indices of the big pools
+ small_indices = window_starts
+ big_indices = window_starts + small_pool.shape[axis]
+
+ # Finally, we use the pool_selector to generate a list of indices, one per output position
+ gather_indices = tf.where(pool_selector, big_indices, small_indices)
+
+ # Gathering from those indices yields the final, correct pooling
+ return tf.gather(both_pool, gather_indices, axis=axis)
+
+ def call(self, inputs: tf.Tensor):
+ if self.input_ordering == "NHWC":
+ input_shape = inputs.shape[1:3]
+ else:
+ input_shape = inputs.shape[2:]
+
+ # We break the task down into each possible case
+ # Firstly, if we're resizing down to 1, it's just tf.reduce_mean
+ if self.output_dims[0] == self.output_dims[1] == 1:
+ if self.input_ordering == "NHWC":
+ reduce_dims = [1, 2]
+ else:
+ reduce_dims = [2, 3]
+ return tf.reduce_mean(inputs, axis=reduce_dims, keepdims=True)
+ # Secondly, if we're resizing by an integer factor on both dimensions, we can take a quick shortcut
+ elif input_shape[0] % self.output_dims[0] == 0 and input_shape[1] % self.output_dims[1] == 0:
+ h_resize = int(input_shape[0] // self.output_dims[0])
+ w_resize = int(input_shape[1] // self.output_dims[1])
+ return tf.nn.avg_pool2d(
+ inputs,
+ ksize=(h_resize, w_resize),
+ strides=(h_resize, w_resize),
+ padding="VALID",
+ data_format=self.input_ordering,
+ )
+ else:
+ # Finally, if we can't take the shortcut, we do a 1D pool on each axis. pseudo_1d_pool will take a shortcut
+ # for dimensions where an integer resize is possible. It can also handle upscaling.
+ h_pooled = self.pseudo_1d_pool(inputs, h_pooling=True)
+ return self.pseudo_1d_pool(h_pooled, h_pooling=False)
+
+
+class TFData2VecVisionPyramidPoolingModule(keras.layers.Layer):
+ """
+ Pyramid Pooling Module (PPM) used in PSPNet.
+
+ Args:
+        pool_scales (tuple[int]): Pooling scales used in the Pyramid Pooling Module.
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of channels after the modules, before conv_seg.
+
+ Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
+ """
+
+ def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, out_channels: int, **kwargs) -> None:
+ super().__init__(**kwargs)
+ self.pool_scales = pool_scales
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+
+ self.layer_list = []
+ for idx, pool_scale in enumerate(pool_scales):
+ pool_scale = pool_scale if isinstance(pool_scale, collections.abc.Iterable) else (pool_scale, pool_scale)
+ self.layer_list.append(
+ [
+ TFAdaptiveAvgPool2D(output_dims=pool_scale),
+ TFData2VecVisionConvModule(
+ in_channels=in_channels, out_channels=self.out_channels, kernel_size=1, name=f"{idx}.1"
+ ),
+ ]
+ )
+
+ def call(self, x: tf.Tensor) -> List[tf.Tensor]:
+ ppm_outs = []
+ inputs = x
+
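+        # Each branch: adaptive-average-pool to a fixed scale, project with a 1x1 conv,
+        # then resize back to the input's spatial size.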
+ for ppm in self.layer_list:
+ for layer_module in ppm:
+ ppm_out = layer_module(x)
+ x = ppm_out
+
+ upsampled_ppm_out = tf.image.resize(ppm_out, size=shape_list(inputs)[1:-1], method="bilinear")
+ ppm_outs.append(upsampled_ppm_out)
+ return ppm_outs
+
+ def build(self, input_shape=None):
+ for layer in self.layer_list:
+ for layer_module in layer:
+ with tf.name_scope(layer_module.name):
+ layer_module.build(None)
+
+
+class TFData2VecVisionUperHead(keras.layers.Layer):
+ """
+ Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
+ [UPerNet](https://arxiv.org/abs/1807.10221).
+
+ Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
+ """
+
+ def __init__(self, config: Data2VecVisionConfig, **kwargs) -> None:
+ super().__init__(**kwargs)
+
+ self.pool_scales = config.pool_scales # e.g. (1, 2, 3, 6)
+ self.in_channels = [config.hidden_size] * 4 # e.g. [768, 768, 768, 768]
+ self.channels = config.hidden_size
+ self.classifier = keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier")
+
+ # PSP Module
+ self.psp_modules = TFData2VecVisionPyramidPoolingModule(
+ self.pool_scales, self.in_channels[-1], self.channels, name="psp_modules"
+ )
+ self.bottleneck = TFData2VecVisionConvModule(
+ self.in_channels[-1] + len(self.pool_scales) * self.channels,
+ self.channels,
+ kernel_size=3,
+ padding="same",
+ name="bottleneck",
+ )
+ # FPN Module
+ self.lateral_convs = []
+ self.fpn_convs = []
+ for idx, in_channels in enumerate(self.in_channels[:-1]): # skip the top layer
+ l_conv = TFData2VecVisionConvModule(
+ in_channels, out_channels=self.channels, kernel_size=1, name=f"lateral_convs.{idx}"
+ )
+ fpn_conv = TFData2VecVisionConvModule(
+ in_channels=self.channels,
+ out_channels=self.channels,
+ kernel_size=3,
+ padding="same",
+ name=f"fpn_convs.{idx}",
+ )
+ self.lateral_convs.append(l_conv)
+ self.fpn_convs.append(fpn_conv)
+
+ self.fpn_bottleneck = TFData2VecVisionConvModule(
+ in_channels=len(self.in_channels) * self.channels,
+ out_channels=self.channels,
+ kernel_size=3,
+ padding="same",
+ name="fpn_bottleneck",
+ )
+
+ def psp_forward(self, inputs):
+ x = inputs[-1]
+ psp_outs = [x]
+ psp_outs.extend(self.psp_modules(x))
+ psp_outs = tf.concat(psp_outs, axis=-1)
+ output = self.bottleneck(psp_outs)
+
+ return output
+
+ def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor:
+ # build laterals
+ laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]
+
+ laterals.append(self.psp_forward(encoder_hidden_states))
+
+ # build top-down path
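+        # resize each deeper lateral map to the size of the one above it and add them, as in a standard FPN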
+ used_backbone_levels = len(laterals)
+ for i in range(used_backbone_levels - 1, 0, -1):
+ prev_shape = shape_list(laterals[i - 1])[1:-1]
+ laterals[i - 1] = laterals[i - 1] + tf.image.resize(laterals[i], size=prev_shape, method="bilinear")
+
+ # build outputs
+ fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
+ # append psp feature
+ fpn_outs.append(laterals[-1])
+
+ for i in range(used_backbone_levels - 1, 0, -1):
+ fpn_outs[i] = tf.image.resize(fpn_outs[i], size=shape_list(fpn_outs[0])[1:-1], method="bilinear")
+ fpn_outs = tf.concat(fpn_outs, axis=-1)
+ output = self.fpn_bottleneck(fpn_outs)
+ output = self.classifier(output)
+
+ return output
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "classifier", None) is not None:
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, None, self.channels])
+ if getattr(self, "psp_modules", None) is not None:
+ with tf.name_scope(self.psp_modules.name):
+ self.psp_modules.build(None)
+ if getattr(self, "bottleneck", None) is not None:
+ with tf.name_scope(self.bottleneck.name):
+ self.bottleneck.build(None)
+ if getattr(self, "fpn_bottleneck", None) is not None:
+ with tf.name_scope(self.fpn_bottleneck.name):
+ self.fpn_bottleneck.build(None)
+ for layer in self.lateral_convs:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+ for layer in self.fpn_convs:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+
+class TFData2VecVisionFCNHead(keras.layers.Layer):
+ """
+    Fully Convolutional Networks for Semantic Segmentation. This head is based on
+    [FCNNet](https://arxiv.org/abs/1411.4038).
+
+ Args:
+ config (Data2VecVisionConfig): Configuration.
+ kernel_size (int): The kernel size for convs in the head. Default: 3.
+ dilation (int): The dilation rate for convs in the head. Default: 1.
+
+
+ Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
+ """
+
+ def __init__(
+ self,
+ config: Data2VecVisionConfig,
+ in_index: int = 2,
+ kernel_size: int = 3,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+ self.in_channels = config.hidden_size
+ self.channels = config.auxiliary_channels
+ self.num_convs = config.auxiliary_num_convs
+ self.concat_input = config.auxiliary_concat_input
+ self.in_index = in_index
+
+ convs = []
+ convs.append(
+ TFData2VecVisionConvModule(
+ in_channels=self.in_channels,
+ out_channels=self.channels,
+ kernel_size=kernel_size,
+ padding="same",
+ dilation=dilation,
+ name="convs.0",
+ )
+ )
+ for i in range(self.num_convs - 1):
+ convs.append(
+ TFData2VecVisionConvModule(
+ in_channels=self.channels,
+ out_channels=self.channels,
+ kernel_size=kernel_size,
+ padding="same",
+ dilation=dilation,
+ name=f"conv_module_{i + 2}",
+ )
+ )
+ if self.num_convs == 0:
+ self.convs = [tf.identity]
+ else:
+ self.convs = convs
+ if self.concat_input:
+ self.conv_cat = TFData2VecVisionConvModule(
+ self.in_channels + self.channels,
+ out_channels=self.channels,
+ kernel_size=kernel_size,
+ padding="same",
+ name="conv_cat",
+ )
+
+ self.classifier = keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier")
+
+ def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor:
+ # just take the relevant feature maps
+ hidden_states = encoder_hidden_states[self.in_index]
+ output = hidden_states
+ for layer_module in self.convs:
+ output = layer_module(output)
+ if self.concat_input:
+ output = self.conv_cat(tf.concat([hidden_states, output], axis=-1))
+ output = self.classifier(output)
+ return output
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "classifier", None) is not None:
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, None, self.channels])
+ if getattr(self, "conv_cat", None) is not None:
+ with tf.name_scope(self.conv_cat.name):
+ self.conv_cat.build(None)
+
+
+@add_start_docstrings(
+ """
+ Data2VecVision Model transformer with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
+ """,
+ DATA2VEC_VISION_START_DOCSTRING,
+)
+class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel):
+ def __init__(self, config: Data2VecVisionConfig, *inputs, **kwargs) -> None:
+ super().__init__(config, *inputs, **kwargs)
+ self.num_labels = config.num_labels
+ self.data2vec_vision = TFData2VecVisionMainLayer(config, add_pooling_layer=False, name="data2vec_vision")
+
+ # FPNs
+ self.fpn1 = [
+ keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.0"),
+ keras.layers.BatchNormalization(name="fpn1.1", momentum=0.9, epsilon=1e-5),
+ keras.layers.Activation("gelu"),
+ keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.3"),
+ ]
+ self.fpn2 = [keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn2.0")]
+
+ self.fpn3 = tf.identity
+ self.fpn4 = keras.layers.MaxPool2D(pool_size=2, strides=2)
+
+ # Semantic segmentation head(s)
+ self.decode_head = TFData2VecVisionUperHead(config, name="decode_head")
+ self.auxiliary_head = (
+ TFData2VecVisionFCNHead(config, name="auxiliary_head") if config.use_auxiliary_head else None
+ )
+
+ def compute_loss(self, logits, auxiliary_logits, labels):
+ # upsample logits to the images' original size
+ if len(shape_list(labels)) > 3:
+ label_interp_shape = shape_list(labels)[1:-1]
+ else:
+ label_interp_shape = shape_list(labels)[-2:]
+
+ upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear")
+ if auxiliary_logits is not None:
+ upsampled_auxiliary_logits = tf.image.resize(auxiliary_logits, size=label_interp_shape, method="bilinear")
+ # compute weighted loss
+ loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
+
+ # Copied from https://www.tensorflow.org/text/tutorials/transformer#loss_and_metrics.
+ # Utility to mask the index to ignore during computing the loss.
+ def masked_loss(real, pred):
+ mask = tf.math.logical_not(tf.math.equal(real, self.config.semantic_loss_ignore_index))
+ loss_ = loss_fct(real, pred)
+ mask = tf.cast(mask, dtype=loss_.dtype)
+ loss_ *= mask
+ reduced_masked_loss = tf.reduce_sum(loss_) / tf.reduce_sum(mask)
+ return tf.reshape(reduced_masked_loss, (1,))
+
+ main_loss = masked_loss(labels, upsampled_logits)
+ auxiliary_loss = masked_loss(labels, upsampled_auxiliary_logits)
+ loss = main_loss + self.config.auxiliary_loss_weight * auxiliary_loss
+
+ return loss
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TFSemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ head_mask: tf.Tensor | None = None,
+ labels: tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, TFSemanticSegmenterOutput]:
+ r"""
+ labels (`tf.Tensor` of shape `(batch_size, height, width)`, *optional*):
+ Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, TFData2VecVisionForSemanticSegmentation
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
+ >>> model = TFData2VecVisionForSemanticSegmentation.from_pretrained("facebook/data2vec-vision-base")
+
+    >>> inputs = image_processor(images=image, return_tensors="tf")
+ >>> outputs = model(**inputs)
+ >>> # logits are of shape (batch_size, num_labels, height, width)
+ >>> logits = outputs.logits
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+
+ if labels is not None and self.config.num_labels == 1:
+ raise ValueError("The number of labels should be greater than one")
+
+ outputs = self.data2vec_vision(
+ pixel_values,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=True, # we need the intermediate hidden states
+ return_dict=return_dict,
+ )
+ encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]
+
+ # only keep certain features, and reshape
+ # note that we do +1 as the encoder_hidden_states also includes the initial embeddings
+ features = [feature for idx, feature in enumerate(encoder_hidden_states) if idx + 1 in self.config.out_indices]
+ patch_resolution = self.config.image_size // self.config.patch_size
+
+ def reshape_features(x):
+ # We do it this way so TF can always infer the non-batch dims at compile time
+ x = tf.reshape(x, (-1, patch_resolution, patch_resolution, self.config.hidden_size))
+ return x
+
+ features = [reshape_features(x[:, 1:, :]) for x in features]
+
+ # apply FPNs
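+        # fpn1 upsamples 4x, fpn2 upsamples 2x, fpn3 keeps the resolution and fpn4 downsamples 2x,
+        # turning the single-scale ViT features into a multi-scale feature pyramid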
+ ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
+ for module in ops[0]:
+ features[0] = module(features[0])
+ features[1] = ops[1][0](features[1])
+ for i in range(len(features[2:])):
+ features[i + 2] = ops[i + 2](features[i + 2])
+
+ logits = self.decode_head(features)
+        # Transpose the logits to maintain consistency in the output formats.
+ transposed_logits = tf.transpose(logits, perm=[0, 3, 1, 2])
+
+ auxiliary_logits = None
+ if self.auxiliary_head is not None:
+ auxiliary_logits = self.auxiliary_head(features)
+
+ loss = None
+ if labels is not None:
+ loss = self.compute_loss(logits, auxiliary_logits, labels)
+
+ if not return_dict:
+ if output_hidden_states:
+ output = (logits,) + outputs[1:]
+ else:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFSemanticSegmenterOutput(
+ loss=loss,
+ logits=transposed_logits,
+ hidden_states=outputs.hidden_states if output_hidden_states else None,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "data2vec_vision", None) is not None:
+ with tf.name_scope(self.data2vec_vision.name):
+ self.data2vec_vision.build(None)
+ if getattr(self, "decode_head", None) is not None:
+ with tf.name_scope(self.decode_head.name):
+ self.decode_head.build(None)
+ if getattr(self, "auxiliary_head", None) is not None:
+ with tf.name_scope(self.auxiliary_head.name):
+ self.auxiliary_head.build(None)
+ if getattr(self, "fpn1", None) is not None:
+ with tf.name_scope(self.fpn1[0].name):
+ self.fpn1[0].build([None, None, None, self.config.hidden_size])
+ with tf.name_scope(self.fpn1[1].name):
+ self.fpn1[1].build((None, None, None, self.config.hidden_size))
+ with tf.name_scope(self.fpn1[3].name):
+ self.fpn1[3].build([None, None, None, self.config.hidden_size])
+ if getattr(self, "fpn2", None) is not None:
+ with tf.name_scope(self.fpn2[0].name):
+ self.fpn2[0].build([None, None, None, self.config.hidden_size])
+
+
+__all__ = [
+ "TFData2VecVisionForImageClassification",
+ "TFData2VecVisionForSemanticSegmentation",
+ "TFData2VecVisionModel",
+ "TFData2VecVisionPreTrainedModel",
+]
diff --git a/docs/transformers/build/lib/transformers/models/data2vec/modular_data2vec_audio.py b/docs/transformers/build/lib/transformers/models/data2vec/modular_data2vec_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..052f22a960fb42e1ecc379c2fb7881329785e8f4
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/data2vec/modular_data2vec_audio.py
@@ -0,0 +1,400 @@
+import math
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_outputs import (
+ CausalLMOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+ Wav2Vec2BaseModelOutput,
+ XVectorOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+)
+from ..wav2vec2.modeling_wav2vec2 import (
+ Wav2Vec2Adapter,
+ Wav2Vec2Encoder,
+ Wav2Vec2FeatureEncoder,
+ Wav2Vec2FeatureProjection,
+ Wav2Vec2ForAudioFrameClassification,
+ Wav2Vec2ForCTC,
+ Wav2Vec2ForSequenceClassification,
+ Wav2Vec2ForXVector,
+ Wav2Vec2Model,
+ Wav2Vec2PreTrainedModel,
+ Wav2Vec2SamePadLayer,
+)
+from .configuration_data2vec_audio import Data2VecAudioConfig
+
+
+_HIDDEN_STATES_START_POSITION = 2
+
+# General docstring
+_CONFIG_FOR_DOC = "Data2VecAudioConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "facebook/data2vec-audio-base-960h"
+_EXPECTED_OUTPUT_SHAPE = [1, 292, 768]
+
+# CTC docstring
+_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
+_CTC_EXPECTED_LOSS = 66.95
+
+
+class Data2VecAudioConvLayer(nn.Module):
+ def __init__(self, config, layer_id=0):
+ super().__init__()
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+ self.out_conv_dim = config.conv_dim[layer_id]
+
+ self.conv = nn.Conv1d(
+ self.in_conv_dim,
+ self.out_conv_dim,
+ kernel_size=config.conv_kernel[layer_id],
+ stride=config.conv_stride[layer_id],
+ bias=config.conv_bias,
+ )
+ self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
+ self.activation = ACT2FN[config.feat_extract_activation]
+
+ def forward(self, hidden_states):
+ hidden_states = self.conv(hidden_states)
+
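+        # nn.LayerNorm normalizes the last dimension, so swap to (batch, time, channels) around it and back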
+ hidden_states = hidden_states.transpose(-2, -1)
+ hidden_states = self.layer_norm(hidden_states)
+ hidden_states = hidden_states.transpose(-2, -1)
+
+ hidden_states = self.activation(hidden_states)
+ return hidden_states
+
+
+class Data2VecAudioPadLayer(Wav2Vec2SamePadLayer):
+ pass
+
+
+class Data2VecAudioPositionalConvLayer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.conv = nn.Conv1d(
+ config.hidden_size,
+ config.hidden_size,
+ kernel_size=config.conv_pos_kernel_size,
+ padding=config.conv_pos_kernel_size // 2,
+ groups=config.num_conv_pos_embedding_groups,
+ )
+
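+        # the pad layer trims the extra frame that symmetric padding introduces when the kernel size is even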
+ self.padding = Data2VecAudioPadLayer(config.conv_pos_kernel_size)
+ self.activation = ACT2FN[config.feat_extract_activation]
+ # no learnable parameters
+ self.layer_norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False)
+
+ def forward(self, hidden_states):
+ hidden_states = self.conv(hidden_states)
+ hidden_states = self.padding(hidden_states)
+
+ hidden_states = hidden_states.transpose(1, 2)
+ hidden_states = self.layer_norm(hidden_states)
+ hidden_states = hidden_states.transpose(1, 2)
+ hidden_states = self.activation(hidden_states)
+ return hidden_states
+
+
+class Data2VecAudioPositionalConvEmbedding(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.layers = nn.ModuleList(
+ [Data2VecAudioPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)]
+ )
+
+ def forward(self, hidden_states):
+ hidden_states = hidden_states.transpose(1, 2)
+ for layer in self.layers:
+ hidden_states = layer(hidden_states)
+ hidden_states = hidden_states.transpose(1, 2)
+ return hidden_states
+
+
+class Data2VecAudioFeatureEncoder(Wav2Vec2FeatureEncoder, nn.Module):
+ def __init__(self, config):
+        nn.Module.__init__(self)
+ self.conv_layers = nn.ModuleList(
+ [Data2VecAudioConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
+ )
+ self.gradient_checkpointing = False
+ self._requires_grad = True
+
+
+class Data2VecAudioFeatureProjection(Wav2Vec2FeatureProjection):
+ pass
+
+
+class Data2VecAudioEncoder(Wav2Vec2Encoder):
+ pass
+
+
+class Data2VecAudioAdapter(Wav2Vec2Adapter):
+ pass
+
+
+class Data2VecAudioPreTrainedModel(PreTrainedModel, Wav2Vec2PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = Data2VecAudioConfig
+ base_model_prefix = "data2vec_audio"
+ main_input_name = "input_values"
+ supports_gradient_checkpointing = True
+ _supports_flash_attn_2 = True
+ _supports_sdpa = True
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, Data2VecAudioFeatureProjection):
+ k = math.sqrt(1 / module.projection.in_features)
+ nn.init.uniform_(module.projection.weight, a=-k, b=k)
+ nn.init.uniform_(module.projection.bias, a=-k, b=k)
+ elif isinstance(module, Data2VecAudioPositionalConvLayer):
+ nn.init.constant_(module.conv.bias, 0)
+ elif isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+ if module.bias is not None:
+ module.bias.data.zero_()
+ if module.weight is not None:
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, nn.Conv1d):
+ nn.init.kaiming_normal_(module.weight)
+
+ if module.bias is not None:
+ k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+ nn.init.uniform_(module.bias, a=-k, b=k)
+
+ def _get_adapters(self):
+ raise AttributeError("Not needed for Data2VecAudio")
+
+ def init_adapter_layers(self):
+ raise AttributeError("Not needed for Data2VecAudio")
+
+ def load_adapter(self):
+ raise AttributeError("Not needed for Data2VecAudio")
+
+
+DATA2VEC_AUDIO_START_DOCSTRING = r"""
+ Data2VecAudio was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and
+ Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and
+ Michael Auli.
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving etc.).
+
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`Data2VecAudioConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DATA2VEC_AUDIO_INPUTS_DOCSTRING = r"""
+ Args:
+ input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+ Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
+ into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile library (*pip install
+ soundfile*). To prepare the array into *input_values*, the [`AutoProcessor`] should be used for padding and
+ conversion into a tensor of type *torch.FloatTensor*. See [`Wav2Vec2Processor.__call__`] for details.
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
+ 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ `attention_mask` should be passed if the corresponding processor has `config.return_attention_mask ==
+ True`, which is the case for all pre-trained Data2Vec Audio models. Be aware that even with
+ `attention_mask`, zero-padded inputs will have slightly different outputs compared to non-padded inputs
+ because there is more than one convolutional layer in the positional encodings. For a more detailed
+ explanation, see [here](https://github.com/huggingface/transformers/issues/25621#issuecomment-1713759349).
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+Data2VecAudioBaseModelOutput = Wav2Vec2BaseModelOutput
+
+
+@add_start_docstrings(
+ "The bare Data2VecAudio Model transformer outputting raw hidden-states without any specific head on top.",
+ DATA2VEC_AUDIO_START_DOCSTRING,
+)
+class Data2VecAudioModel(Data2VecAudioPreTrainedModel, Wav2Vec2Model):
+ def __init__(self, config: Data2VecAudioConfig):
+ Data2VecAudioPreTrainedModel.__init__(self, config)
+ self.config = config
+ self.feature_extractor = Data2VecAudioFeatureEncoder(config)
+ self.feature_projection = Data2VecAudioFeatureProjection(config)
+
+ # model only needs masking vector if mask prob is > 0.0
+ if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
+ self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
+
+ self.encoder = Data2VecAudioEncoder(config)
+
+ self.adapter = Data2VecAudioAdapter(config) if config.add_adapter else None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def freeze_feature_extractor(self):
+ raise AttributeError("Not needed for Data2VecAudio")
+
+ def freeze_feature_encoder(self):
+ """
+ Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+ not be updated during training.
+ """
+ self.feature_extractor._freeze_parameters()
+
+ @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=Data2VecAudioBaseModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ modality="audio",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(self, **super_kwargs):
+ return super().forward(**super_kwargs)
+
+
+@add_start_docstrings(
+ """Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
+ DATA2VEC_AUDIO_START_DOCSTRING,
+)
+class Data2VecAudioForCTC(Data2VecAudioPreTrainedModel, Wav2Vec2ForCTC):
+ def __init__(self, config):
+ Data2VecAudioPreTrainedModel.__init__(self, config)
+
+ self.data2vec_audio = Data2VecAudioModel(config)
+ self.dropout = nn.Dropout(config.final_dropout)
+
+ if config.vocab_size is None:
+ raise ValueError(
+ f"You are trying to instantiate {self.__class__} with a configuration that "
+ "does not define the vocabulary size of the language model head. Please "
+ "instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
+ "or define `vocab_size` of your model's configuration."
+ )
+ output_hidden_size = (
+ config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
+ )
+ self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def freeze_base_model(self):
+ raise AttributeError("Not needed for Data2VecAudio")
+
+ def tie_weights(self):
+ raise AttributeError("Not needed for Data2VecAudio")
+
+ @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=CausalLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_CTC_EXPECTED_OUTPUT,
+ expected_loss=_CTC_EXPECTED_LOSS,
+ )
+ def forward(self, **super_kwargs):
+ return super().forward(**super_kwargs)
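+
+# Illustrative CTC decoding sketch (not part of the module): with inputs prepared as in the
+# comment after the inputs docstring above, greedy argmax over the logits followed by
+# `processor.batch_decode` yields a transcription like the `_CTC_EXPECTED_OUTPUT` constant.
+#
+#     logits = model(**inputs).logits                      # model: Data2VecAudioForCTC (assumed)
+#     predicted_ids = logits.argmax(dim=-1)
+#     transcription = processor.batch_decode(predicted_ids)[0]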
+
+
+@add_start_docstrings(
+ """
+ Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks
+ like SUPERB Keyword Spotting.
+ """,
+ DATA2VEC_AUDIO_START_DOCSTRING,
+)
+class Data2VecAudioForSequenceClassification(Wav2Vec2ForSequenceClassification):
+ @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=SequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ modality="audio",
+ )
+ def forward(self, **super_kwargs):
+ return super().forward(**super_kwargs)
+
+
+@add_start_docstrings(
+ """
+ Data2VecAudio Model with a frame classification head on top for tasks like Speaker Diarization.
+ """,
+ DATA2VEC_AUDIO_START_DOCSTRING,
+)
+class Data2VecAudioForAudioFrameClassification(Wav2Vec2ForAudioFrameClassification):
+ @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ modality="audio",
+ )
+ def forward(self, **super_kwargs):
+ return super().forward(**super_kwargs)
+
+
+@add_start_docstrings(
+ """
+ Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification.
+ """,
+ DATA2VEC_AUDIO_START_DOCSTRING,
+)
+class Data2VecAudioForXVector(Wav2Vec2ForXVector):
+ @add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=XVectorOutput,
+ config_class=_CONFIG_FOR_DOC,
+ modality="audio",
+ )
+ def forward(self, **super_kwargs):
+ return super().forward(**super_kwargs)
+
+
+__all__ = [
+ "Data2VecAudioForAudioFrameClassification",
+ "Data2VecAudioForCTC",
+ "Data2VecAudioForSequenceClassification",
+ "Data2VecAudioForXVector",
+ "Data2VecAudioModel",
+ "Data2VecAudioPreTrainedModel",
+]
diff --git a/docs/transformers/build/lib/transformers/models/dbrx/__init__.py b/docs/transformers/build/lib/transformers/models/dbrx/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cce0f34c778d21a81d03940e7a7951707d898c86
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/dbrx/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_dbrx import *
+ from .modeling_dbrx import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/dbrx/configuration_dbrx.py b/docs/transformers/build/lib/transformers/models/dbrx/configuration_dbrx.py
new file mode 100644
index 0000000000000000000000000000000000000000..72df1fe335bab446fa1e88186d2ff14ae70f3351
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/dbrx/configuration_dbrx.py
@@ -0,0 +1,232 @@
+# coding=utf-8
+# Copyright 2024 Databricks Mosaic Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DBRX model configuration"""
+
+from typing import Any, Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DbrxAttentionConfig(PretrainedConfig):
+ """Configuration class for Dbrx Attention.
+
+ [`DbrxAttention`] class. It is used to instantiate attention layers
+ according to the specified arguments, defining the layers architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ attn_pdrop (`float`, *optional*, defaults to 0.0):
+ The dropout probability for the attention layers.
+ clip_qkv (`float`, *optional*):
+ If set, clip the queries, keys, and values in the attention layer to this value.
+ kv_n_heads (`int`, *optional*, defaults to 1): For grouped query attention only; allows the user to specify the number of key/value heads.
+ rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope.
+ """
+
+ base_config_key = "attn_config"
+
+ def __init__(
+ self,
+ attn_pdrop: float = 0.0,
+ clip_qkv: Optional[float] = None,
+ kv_n_heads: int = 1,
+ rope_theta: float = 10000.0,
+ **kwargs: Any,
+ ):
+ super().__init__(**kwargs)
+ self.attn_pdrop = attn_pdrop
+ self.clip_qkv = clip_qkv
+ self.kv_n_heads = kv_n_heads
+ self.rope_theta = rope_theta
+
+ for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash", "torch_dtype"]:
+ if k in kwargs:
+ kwargs.pop(k)
+ if len(kwargs) != 0:
+ raise ValueError(f"Found unknown {kwargs=}")
+
+
+class DbrxFFNConfig(PretrainedConfig):
+ """Configuration class for Dbrx FFN.
+
+ [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
+ the specified arguments, defining the layers architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ ffn_act_fn (`dict`, *optional*, defaults to `None`): A dict specifying activation function for the FFN.
+ The dict should have a key 'name' with the value being the name of the activation function along with
+ any additional keyword arguments. If `None`, then set to `{"name": "silu"}`.
+ ffn_hidden_size (`int`, *optional*, defaults to 3584): The hidden size of the feedforward network.
+ moe_num_experts (`int`, *optional*, defaults to 4): The number of experts in the mixture of experts layer.
+ moe_top_k (`int`, *optional*, defaults to 1): The number of experts to use in the mixture of experts layer.
+ moe_jitter_eps (`float`, *optional*, defaults to `None`): If not `None`, the jitter epsilon for the mixture of experts layer.
+ moe_loss_weight (`float`, *optional*, defaults to 0.01): The loss weight for the mixture of experts layer.
+ moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
+ """
+
+ base_config_key = "ffn_config"
+
+ def __init__(
+ self,
+ ffn_act_fn: dict = None,
+ ffn_hidden_size: int = 3584,
+ moe_num_experts: int = 4,
+ moe_top_k: int = 1,
+ moe_jitter_eps: Optional[float] = None,
+ moe_loss_weight: float = 0.01,
+ moe_normalize_expert_weights: Optional[float] = 1.0,
+ **kwargs: Any,
+ ):
+ super().__init__()
+ if ffn_act_fn is None:
+ ffn_act_fn = {"name": "silu"}
+ self.ffn_act_fn = ffn_act_fn
+ self.ffn_hidden_size = ffn_hidden_size
+ self.moe_num_experts = moe_num_experts
+ self.moe_top_k = moe_top_k
+ self.moe_jitter_eps = moe_jitter_eps
+ self.moe_loss_weight = moe_loss_weight
+ self.moe_normalize_expert_weights = moe_normalize_expert_weights
+
+ for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash", "torch_dtype"]:
+ if k in kwargs:
+ kwargs.pop(k)
+ if len(kwargs) != 0:
+ raise ValueError(f"Found unknown {kwargs=}")
+
+
+class DbrxConfig(PretrainedConfig):
+ r"""
+
+ This is the configuration class to store the configuration of a [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
+ specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a different configuration to that of the [databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ d_model (`int`, *optional*, defaults to 2048):
+ Dimensionality of the embeddings and hidden states.
+ n_heads (`int`, *optional*, defaults to 16):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ n_layers (`int`, *optional*, defaults to 24):
+ Number of hidden layers in the Transformer encoder.
+ max_seq_len (`int`, *optional*, defaults to 2048):
+ The maximum sequence length of the model.
+ vocab_size (`int`, *optional*, defaults to 32000):
+ Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
+ the `input_ids` passed when calling [`DbrxModel`].
+ resid_pdrop (`float`, *optional*, defaults to 0.0):
+ The dropout probability applied to the attention output before combining with residual.
+ emb_pdrop (`float`, *optional*, defaults to 0.0):
+ The dropout probability for the embedding layer.
+ attn_config (`dict`, *optional*):
+ A dictionary used to configure the model's attention module.
+ ffn_config (`dict`, *optional*):
+ A dictionary used to configure the model's FFN module.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models).
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ output_router_logits (`bool`, *optional*, defaults to `False`):
+ Whether or not the router logits should be returned by the model. Enabling this will also
+ allow the model to output the auxiliary loss. See [here]() for more details.
+
+
+ Example:
+ ```python
+ >>> from transformers import DbrxConfig, DbrxModel
+
+ >>> # Initializing a Dbrx configuration
+ >>> configuration = DbrxConfig(n_layers=2, d_model=256, n_heads=8, vocab_size=128)
+
+ >>> # Initializing a model (with random weights) from the configuration
+ >>> model = DbrxModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```
+ """
+
+ model_type = "dbrx"
+ sub_configs = {"attn_config": DbrxAttentionConfig, "ffn_config": DbrxFFNConfig}
+ attribute_map = {
+ "num_attention_heads": "n_heads",
+ "hidden_size": "d_model",
+ "num_hidden_layers": "n_layers",
+ "max_position_embeddings": "max_seq_len",
+ }
+
+ def __init__(
+ self,
+ d_model: int = 2048,
+ n_heads: int = 16,
+ n_layers: int = 24,
+ max_seq_len: int = 2048,
+ vocab_size: int = 32000,
+ resid_pdrop: float = 0.0,
+ emb_pdrop: float = 0.0,
+ attn_config: Optional[DbrxAttentionConfig] = None,
+ ffn_config: Optional[DbrxFFNConfig] = None,
+ use_cache: bool = True,
+ initializer_range: float = 0.02,
+ output_router_logits: bool = False,
+ **kwargs: Any,
+ ):
+ if attn_config is None:
+ self.attn_config = DbrxAttentionConfig()
+ elif isinstance(attn_config, dict):
+ self.attn_config = DbrxAttentionConfig(**attn_config)
+ else:
+ self.attn_config = attn_config
+
+ if ffn_config is None:
+ self.ffn_config = DbrxFFNConfig()
+ elif isinstance(ffn_config, dict):
+ self.ffn_config = DbrxFFNConfig(**ffn_config)
+ else:
+ self.ffn_config = ffn_config
+
+ self.d_model = d_model
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.max_seq_len = max_seq_len
+ self.vocab_size = vocab_size
+ self.resid_pdrop = resid_pdrop
+ self.emb_pdrop = emb_pdrop
+ self.use_cache = use_cache
+ self.initializer_range = initializer_range
+ self.output_router_logits = output_router_logits
+ self.num_key_value_heads = self.attn_config.kv_n_heads
+
+ tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
+ if tie_word_embeddings:
+ raise ValueError("tie_word_embeddings is not supported for DBRX models.")
+
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+__all__ = ["DbrxConfig"]
diff --git a/docs/transformers/build/lib/transformers/models/dbrx/modeling_dbrx.py b/docs/transformers/build/lib/transformers/models/dbrx/modeling_dbrx.py
new file mode 100644
index 0000000000000000000000000000000000000000..729e877cc8f3baf1fbe476f55c7a0a312003674e
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/dbrx/modeling_dbrx.py
@@ -0,0 +1,1392 @@
+# coding=utf-8
+# Copyright 2024 Databricks Mosaic Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DBRX model."""
+
+import math
+from typing import Any, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
+from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ is_torch_flex_attn_available,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_dbrx import DbrxConfig
+
+
+if is_torch_flex_attn_available():
+ from torch.nn.attention.flex_attention import BlockMask
+
+ from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+if is_flash_attn_available():
+ from ...modeling_flash_attention_utils import _flash_attention_forward
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "DbrxConfig"
+
+
+class DbrxRotaryEmbedding(nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim))
+ self.register_buffer("inv_freq", tensor=inv_freq, persistent=False)
+
+ @torch.no_grad()
+ def forward(self, x, position_ids, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ self.inv_freq.to(x.device)
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+ position_ids_expanded = position_ids[:, None, :].float()
+ # Force float32 since bfloat16 loses precision on long contexts
+ # See https://github.com/huggingface/transformers/pull/29285
+ device_type = x.device.type
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+ with torch.autocast(device_type=device_type, enabled=False):
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ cos = emb.cos()
+ sin = emb.sin()
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+ """Applies Rotary Position Embedding to the query and key tensors.
+
+ Args:
+ q (`torch.Tensor`): The query tensor.
+ k (`torch.Tensor`): The key tensor.
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
+ position_ids (`torch.Tensor`, *optional*):
+ Deprecated and unused.
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+ Returns:
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+ """
+ cos = cos.unsqueeze(unsqueeze_dim)
+ sin = sin.unsqueeze(unsqueeze_dim)
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
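+
+# Illustrative sketch: with q/k laid out as (batch, heads, seq, head_dim), the default
+# unsqueeze_dim=1 broadcasts the (batch, seq, head_dim) cos/sin tensors across the head axis.
+# Using cos=1 and sin=0 makes the rotation an identity; this is only meant to show the shapes.
+#
+#     q = k = torch.randn(1, 4, 6, 8)
+#     cos, sin = torch.ones(1, 6, 8), torch.zeros(1, 6, 8)
+#     q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)   # same shapes as q and k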
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
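+
+# Illustrative sketch: repeating 2 key/value heads 4 times matches an 8-head query layout.
+#
+#     kv = torch.zeros(1, 2, 5, 8)                      # (batch, kv_heads, seq_len, head_dim)
+#     assert repeat_kv(kv, n_rep=4).shape == (1, 8, 5, 8)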
+
+
+def load_balancing_loss_func(
+ gate_logits: torch.Tensor,
+ num_experts: int,
+ top_k: int,
+ attention_mask: Optional[torch.Tensor],
+) -> torch.Tensor:
+ r"""Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+ See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
+ function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+ experts is too unbalanced.
+
+ Args:
+ gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]]):
+ Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
+ shape [batch_size X sequence_length, num_experts].
+ num_experts (`int`):
+ Number of experts.
+ top_k (`int`):
+ The number of experts each token is routed to.
+ attention_mask (`torch.Tensor`, *optional*):
+ The attention_mask used in forward function
+ shape [batch_size X sequence_length] if not None.
+
+ Returns:
+ The auxiliary loss.
+ """
+ if gate_logits is None or not isinstance(gate_logits, tuple):
+ return torch.tensor(0.0)
+
+ if isinstance(gate_logits, tuple):
+ compute_device = gate_logits[0].device
+ concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
+
+ routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
+
+ _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
+
+ expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
+
+ if attention_mask is None:
+ # Compute the percentage of tokens routed to each experts
+ tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+
+ # Compute the average probability of routing to these experts
+ router_prob_per_expert = torch.mean(routing_weights, dim=0)
+ else:
+ batch_size, sequence_length = attention_mask.shape
+ num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
+
+ # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+ expert_attention_mask = (
+ attention_mask[None, :, :, None, None]
+ .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
+ .reshape(-1, top_k, num_experts)
+ .to(compute_device)
+ )
+
+ # Compute the percentage of tokens routed to each experts
+ tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
+ expert_attention_mask, dim=0
+ )
+
+ # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+ router_per_expert_attention_mask = (
+ attention_mask[None, :, :, None]
+ .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
+ .reshape(-1, num_experts)
+ .to(compute_device)
+ )
+
+ # Compute the average probability of routing to these experts
+ router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
+ router_per_expert_attention_mask, dim=0
+ )
+
+ overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
+ return overall_loss * num_experts
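+
+# Illustrative sketch: the loss expects one gate-logit tensor per layer, each of shape
+# (batch_size * sequence_length, num_experts), collected into a tuple (e.g. the
+# `router_logits` returned by the model when `output_router_logits=True`).
+#
+#     gate_logits = tuple(torch.randn(4 * 7, 16) for _ in range(2))   # 2 layers, bsz=4, seq=7
+#     aux_loss = load_balancing_loss_func(gate_logits, num_experts=16, top_k=4, attention_mask=None)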
+
+
+class DbrxAttention(nn.Module):
+ """Multi-head self attention."""
+
+ def __init__(self, config: DbrxConfig, block_idx: Optional[int] = None):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.d_model
+ self.num_heads = config.n_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.max_position_embeddings = config.max_seq_len
+ self.block_idx = block_idx
+ if block_idx is None:
+ logger.warning_once(
+ f"Instantiating {self.__class__.__name__} without passing a `block_idx` is not recommended and will "
+ + "lead to errors during the forward call if caching is used. Please make sure to provide a `block_idx` "
+ + "when creating this class."
+ )
+
+ attn_config = config.attn_config
+ self.attn_pdrop = attn_config.attn_pdrop
+ self.clip_qkv = attn_config.clip_qkv
+ self.num_key_value_heads = attn_config.kv_n_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.rope_theta = attn_config.rope_theta
+ self.is_causal = True
+
+ self.Wqkv = nn.Linear(
+ self.hidden_size, self.hidden_size + 2 * self.num_key_value_heads * self.head_dim, bias=False
+ )
+ self.out_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+ self.rotary_emb = DbrxRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_ids: torch.LongTensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs: Any,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv_states = self.Wqkv(hidden_states)
+ min_val = -self.clip_qkv if self.clip_qkv is not None else None
+ max_val = self.clip_qkv
+ qkv_states = qkv_states.clamp(min=min_val, max=max_val)
+
+ query_states, key_states, value_states = qkv_states.split(
+ [
+ self.hidden_size,
+ self.num_key_value_heads * self.head_dim,
+ self.num_key_value_heads * self.head_dim,
+ ],
+ dim=2,
+ )
+
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; position_ids needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.block_idx, cache_kwargs)
+
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attention_mask is not None: # no matter the length, we just slice it
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+ attn_weights = attn_weights + causal_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attn_pdrop, training=self.training)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ + f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+ attn_output = self.out_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+class DbrxFlashAttention2(DbrxAttention):
+ """Dbrx flash attention module.
+
+ This module inherits from `DbrxAttention` as the weights of the module stay
+ untouched. The only required change would be on the forward pass where it
+ calls the public API of flash attention.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+ self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs: Any,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ if isinstance(past_key_value, StaticCache):
+ raise ValueError(
+ "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
+ "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
+ )
+ logger.info("Implicitly setting `output_attentions` to False as it is not supported in Flash Attention.")
+ output_attentions = False
+
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv_states = self.Wqkv(hidden_states)
+ if self.clip_qkv is not None:
+ qkv_states = qkv_states.clamp(min=-self.clip_qkv, max=self.clip_qkv)
+
+ query_states, key_states, value_states = qkv_states.split(
+ [
+ self.hidden_size,
+ self.num_key_value_heads * self.head_dim,
+ self.num_key_value_heads * self.head_dim,
+ ],
+ dim=2,
+ )
+
+ # Flash attention requires the input to have the shape
+ # batch_size x seq_length x num_heads x head_dim
+ # therefore we just need to keep the original shape
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.block_idx, cache_kwargs)
+
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout
+ # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+ # to be able to avoid many of these transpose/reshape/view.
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ dropout_rate = self.attn_pdrop if self.training else 0.0
+
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
+ # cast them back in the correct dtype just to be sure everything works as expected.
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+ # in fp32. (LlamaRMSNorm handles it correctly)
+ input_dtype = query_states.dtype
+ if input_dtype == torch.float32:
+ if torch.is_autocast_enabled():
+ target_dtype = torch.get_autocast_gpu_dtype()
+ # Handle the case where the model is quantized
+ elif hasattr(self.config, "_pre_quantization_dtype"):
+ target_dtype = self.config._pre_quantization_dtype
+ else:
+ target_dtype = query_states.dtype
+
+ logger.warning_once(
+ "The input hidden states seems to be silently casted in float32, this might be "
+ + "related to the fact you have upcasted embedding or layer norm layers in "
+ + f"float32. We will cast back the input in {target_dtype}."
+ )
+
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+
+ attn_output = _flash_attention_forward(
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ q_len,
+ position_ids=position_ids,
+ dropout=dropout_rate,
+ is_causal=self.is_causal,
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
+ )
+
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+ attn_output = self.out_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+class DbrxSdpaAttention(DbrxAttention):
+ """
+ Dbrx attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+ `DbrxAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
+ the SDPA API.
+ """
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ if output_attentions:
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+ logger.warning_once(
+ "DbrxModel is using DbrxSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ return super().forward(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ )
+
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv_states = self.Wqkv(hidden_states)
+ if self.clip_qkv is not None:
+ qkv_states = qkv_states.clamp(min=-self.clip_qkv, max=self.clip_qkv)
+
+ query_states, key_states, value_states = qkv_states.split(
+ [
+ self.hidden_size,
+ self.num_key_value_heads * self.head_dim,
+ self.num_key_value_heads * self.head_dim,
+ ],
+ dim=2,
+ )
+
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ cos, sin = self.rotary_emb(value_states, position_ids, seq_len=None)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, None)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.block_idx, cache_kwargs)
+
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ causal_mask = attention_mask
+ if attention_mask is not None:
+ causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
+ if query_states.device.type == "cuda" and causal_mask is not None:
+ query_states = query_states.contiguous()
+ key_states = key_states.contiguous()
+ value_states = value_states.contiguous()
+
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ is_causal = True if causal_mask is None and q_len > 1 else False
+
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
+ query_states,
+ key_states,
+ value_states,
+ attn_mask=causal_mask,
+ dropout_p=self.attn_pdrop if self.training else 0.0,
+ is_causal=is_causal,
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.view(bsz, q_len, -1)
+
+ attn_output = self.out_proj(attn_output)
+
+ return attn_output, None, past_key_value
+
+
+DBRX_ATTENTION_CLASSES = {
+ "eager": DbrxAttention,
+ "flash_attention_2": DbrxFlashAttention2,
+ "sdpa": DbrxSdpaAttention,
+}
+
+
+class DbrxNormAttentionNorm(nn.Module):
+ def __init__(self, config: DbrxConfig, block_idx: Optional[int] = None):
+ super().__init__()
+ self.block_idx = block_idx
+ self.resid_pdrop = config.resid_pdrop
+ self.norm_1 = nn.LayerNorm(config.d_model, bias=False)
+ self.attn = DBRX_ATTENTION_CLASSES[config._attn_implementation](
+ config=config,
+ block_idx=block_idx,
+ )
+ self.norm_2 = nn.LayerNorm(config.d_model, bias=False)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_ids: torch.LongTensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs: Any,
+ ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
+ residual_states = hidden_states
+ hidden_states = self.norm_1(hidden_states).to(hidden_states.dtype)
+
+ hidden_states, attn_weights, past_key_value = self.attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ **kwargs,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.resid_pdrop, training=self.training)
+ hidden_states = hidden_states + residual_states
+
+ residual_states = hidden_states
+ hidden_states = self.norm_2(hidden_states).to(hidden_states.dtype)
+
+ return residual_states, hidden_states, attn_weights, past_key_value
+
+
+class DbrxRouter(nn.Module):
+ def __init__(
+ self,
+ hidden_size: int,
+ moe_num_experts: int,
+ moe_top_k: int,
+ moe_jitter_eps: Optional[float],
+ moe_normalize_expert_weights: Optional[float],
+ ):
+ super().__init__()
+ self.hidden_size = hidden_size
+ self.moe_num_experts = moe_num_experts
+ self.moe_top_k = moe_top_k
+ self.moe_jitter_eps = moe_jitter_eps
+ self.moe_normalize_expert_weights = moe_normalize_expert_weights
+
+ self.layer = nn.Linear(self.hidden_size, self.moe_num_experts, bias=False)
+
+ def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]:
+ if self.training and self.moe_jitter_eps is not None:
+ hidden_states *= torch.empty_like(hidden_states).uniform_(
+ 1.0 - self.moe_jitter_eps, 1.0 + self.moe_jitter_eps
+ )
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+ weights = self.layer(hidden_states).softmax(dim=-1, dtype=torch.float32)
+ top_weights, top_experts = torch.topk(weights, self.moe_top_k, dim=-1)
+
+ top_weights_scale = (
+ torch.norm(top_weights, p=self.moe_normalize_expert_weights, dim=-1, keepdim=True)
+ if self.moe_normalize_expert_weights is not None
+ else 1.0
+ )
+ top_weights = top_weights / top_weights_scale
+
+ weights = weights.to(hidden_states.dtype)
+ top_weights = top_weights.to(hidden_states.dtype)
+ return weights, top_weights, top_experts
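+
+# Illustrative sketch: the router flattens (batch, seq, hidden) to tokens and returns, per token,
+# the full softmax distribution over experts plus the (optionally re-normalized) top-k weights
+# and expert indices.
+#
+#     router = DbrxRouter(hidden_size=8, moe_num_experts=4, moe_top_k=2,
+#                         moe_jitter_eps=None, moe_normalize_expert_weights=1.0)
+#     weights, top_weights, top_experts = router(torch.randn(2, 3, 8))
+#     # weights: (6, 4); top_weights, top_experts: (6, 2)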
+
+
+class DbrxExpertGLU(nn.Module):
+ def __init__(self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, ffn_act_fn: dict):
+ super().__init__()
+ self.hidden_size = hidden_size
+ self.ffn_hidden_size = ffn_hidden_size
+ self.moe_num_experts = moe_num_experts
+
+ self.w1 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size))
+ self.v1 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size))
+ self.w2 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size))
+
+ act_fn_name = ffn_act_fn.get("name", "silu")
+ self.activation_fn = ACT2FN[act_fn_name]
+
+ def forward(
+ self, x: torch.Tensor, expert_w1: torch.Tensor, expert_v1: torch.Tensor, expert_w2: torch.Tensor
+ ) -> torch.Tensor:
+ gate_proj = x.matmul(expert_w1.t())
+ up_proj = x.matmul(expert_v1.t())
+ gate_proj = self.activation_fn(gate_proj)
+ intermediate_states = gate_proj * up_proj
+ down_proj = intermediate_states.matmul(expert_w2)
+ return down_proj
+
+
+class DbrxExperts(nn.Module):
+ def __init__(self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, ffn_act_fn: dict):
+ super().__init__()
+ self.moe_num_experts = moe_num_experts
+ self.mlp = DbrxExpertGLU(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=moe_num_experts,
+ ffn_act_fn=ffn_act_fn,
+ )
+
+ def forward(
+ self, x: torch.Tensor, weights: torch.Tensor, top_weights: torch.Tensor, top_experts: torch.LongTensor
+ ) -> torch.Tensor:
+ bsz, q_len, hidden_size = x.shape
+ x = x.view(-1, hidden_size)
+ out = torch.zeros_like(x)
+
+ expert_mask = nn.functional.one_hot(top_experts, num_classes=self.moe_num_experts).permute(2, 1, 0)
+ # Chunk experts at once to avoid storing full parameter multiple times in autograd
+ w1_chunked = self.mlp.w1.view(self.mlp.moe_num_experts, self.mlp.ffn_hidden_size, self.mlp.hidden_size).chunk(
+ self.moe_num_experts, dim=0
+ )
+ v1_chunked = self.mlp.v1.view(self.mlp.moe_num_experts, self.mlp.ffn_hidden_size, self.mlp.hidden_size).chunk(
+ self.moe_num_experts, dim=0
+ )
+ w2_chunked = self.mlp.w2.view(self.mlp.moe_num_experts, self.mlp.ffn_hidden_size, self.mlp.hidden_size).chunk(
+ self.moe_num_experts, dim=0
+ )
+ w1_chunked = [w1.squeeze(dim=0) for w1 in w1_chunked]
+ v1_chunked = [v1.squeeze(dim=0) for v1 in v1_chunked]
+ w2_chunked = [w2.squeeze(dim=0) for w2 in w2_chunked]
+ for expert_idx in range(0, self.moe_num_experts):
+ # (This causes torch.compile to fail with `torch._dynamo.exc.Unsupported: dynamic shape operator: aten.nonzero.default`.)
+ # (Setting torch._dynamo.config.capture_dynamic_output_shape_ops = True may help, but this is untested.)
+ topk_idx, token_idx = torch.where(expert_mask[expert_idx])
+ if token_idx.shape[0] == 0:
+ continue
+
+ token_list = token_idx
+ topk_list = topk_idx
+
+ expert_tokens = x[None, token_list].reshape(-1, hidden_size)
+ expert_out = (
+ self.mlp(expert_tokens, w1_chunked[expert_idx], v1_chunked[expert_idx], w2_chunked[expert_idx])
+ * top_weights[token_list, topk_list, None]
+ )
+
+ out.index_add_(0, token_idx, expert_out)
+
+ out = out.reshape(bsz, q_len, hidden_size)
+ return out
+
+
+class DbrxFFN(nn.Module):
+ def __init__(self, config: DbrxConfig):
+ super().__init__()
+
+ ffn_config = config.ffn_config
+ self.router = DbrxRouter(
+ hidden_size=config.d_model,
+ moe_num_experts=ffn_config.moe_num_experts,
+ moe_top_k=ffn_config.moe_top_k,
+ moe_jitter_eps=ffn_config.moe_jitter_eps,
+ moe_normalize_expert_weights=ffn_config.moe_normalize_expert_weights,
+ )
+
+ self.experts = DbrxExperts(
+ hidden_size=config.d_model,
+ ffn_hidden_size=ffn_config.ffn_hidden_size,
+ moe_num_experts=ffn_config.moe_num_experts,
+ ffn_act_fn=ffn_config.ffn_act_fn,
+ )
+
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ weights, top_weights, top_experts = self.router(x)
+ out = self.experts(x, weights, top_weights, top_experts)
+ return out, weights
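+
+# Illustrative sketch: a DbrxFFN block preserves the input shape and additionally returns the
+# router's per-token expert distribution, which the model can collect as router logits. The tiny
+# config below is assumed for shape inspection only (the expert weights are uninitialized here).
+#
+#     ffn = DbrxFFN(config=DbrxConfig(d_model=8, n_heads=2, n_layers=1))
+#     out, router_weights = ffn(torch.randn(2, 3, 8))
+#     # out: (2, 3, 8); router_weights: (6, moe_num_experts)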
+
+
+class DbrxBlock(nn.Module):
+ def __init__(self, config: DbrxConfig, block_idx: int):
+ super().__init__()
+ self.hidden_size = config.d_model
+ self.resid_pdrop = config.resid_pdrop
+ self.block_idx = block_idx
+ self.norm_attn_norm = DbrxNormAttentionNorm(
+ config=config,
+ block_idx=block_idx,
+ )
+ self.ffn = DbrxFFN(config=config)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ output_router_logits: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs: Any,
+ ) -> Union[
+ Tuple[torch.Tensor],
+ Tuple[torch.Tensor, Optional[torch.Tensor]],
+ Tuple[torch.Tensor, Optional[Cache]],
+ Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]],
+ Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]],
+ Tuple[torch.Tensor, Optional[Cache], Optional[torch.Tensor]],
+ Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache], Optional[torch.Tensor]],
+ ]:
+ """Forward function for DbrxBlock.
+
+ Args:
+ hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ position_ids (`torch.LongTensor`): position ids of shape `(batch, seq_len)`
+ attention_mask (`torch.Tensor`, *optional*): attention mask of size (batch_size, sequence_length)
+ if flash attention is used or (batch_size, 1, query_sequence_length, key_sequence_length)
+ if default attention is used.
+ past_key_value (`Tuple(torch.Tensor)`, *optional*): cached past key and value projection states
+ output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all
+ attention layers. See `attentions` under returned tensors for more detail.
+ output_router_logits (`bool`, *optional*): Whether or not to return the router logits.
+ use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are
+ returned and can be used to speed up decoding (see `past_key_values`).
+ cache_position (`torch.LongTensor`, *optional*): position ids of the cache
+ """
+
+ # Norm + Attention + Norm
+ resid_states, hidden_states, self_attn_weights, present_key_value = self.norm_attn_norm(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ **kwargs,
+ )
+
+ # Fully Connected
+ hidden_states, router_logits = self.ffn(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.resid_pdrop, training=self.training)
+ hidden_states = resid_states + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ if output_router_logits:
+ outputs += (router_logits,)
+
+ return outputs
+
+
+DBRX_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`DbrxConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+ "The bare DBRX Model outputting raw hidden-states without any specific head on top.",
+ DBRX_START_DOCSTRING,
+)
+class DbrxPreTrainedModel(PreTrainedModel):
+ config_class = DbrxConfig
+ base_model_prefix = "transformer"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["DbrxBlock"]
+ _skip_keys_device_placement = ["past_key_values"]
+ _supports_flash_attn_2 = True
+ _supports_sdpa = True
+ _supports_cache_class = True
+ _supports_quantized_cache = True
+ _supports_static_cache = False # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
+
+ def _init_weights(self, module: nn.Module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.weight.data.fill_(1.0)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, DbrxExpertGLU):
+ module.w1.data.normal_(mean=0.0, std=std)
+ module.v1.data.normal_(mean=0.0, std=std)
+ module.w2.data.normal_(mean=0.0, std=std)
+
+
+DBRX_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ Two formats are allowed:
+ - a [`~cache_utils.Cache`] instance, see our
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+ cache format.
+
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+ legacy cache format will be returned.
+
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+ of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ output_router_logits (`bool`, *optional*):
+ Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
+ should not be returned during inference.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+ Indices depicting the position of the input sequence tokens in the sequence. Unlike `position_ids`,
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+ the complete sequence length.
+"""
+
+
+@add_start_docstrings(
+ "The bare DBRX Model outputting raw hidden-states without any specific head on top.",
+ DBRX_START_DOCSTRING,
+)
+class DbrxModel(DbrxPreTrainedModel):
+ """Transformer decoder consisting of *config.num_hidden_layers*. Each layer is a [`DbrxBlock`] layer.
+
+ Args:
+ config ([`DbrxConfig`]): Model configuration class with all parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+ """
+
+ def __init__(self, config: DbrxConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ self.emb_pdrop = config.emb_pdrop
+
+ self.wte = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
+ self.blocks = nn.ModuleList([DbrxBlock(config, block_idx) for block_idx in range(config.n_layers)])
+ self.norm_f = nn.LayerNorm(config.d_model, bias=False)
+ self.gradient_checkpointing = False
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Embedding:
+ return self.wte
+
+ def set_input_embeddings(self, value: nn.Embedding):
+ self.wte = value
+
+ @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_router_logits: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs, # NOOP kwargs, for now
+ ) -> Union[Tuple, MoeModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ output_router_logits = (
+ output_router_logits if output_router_logits is not None else self.config.output_router_logits
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if (input_ids is None) ^ (inputs_embeds is not None):
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+ if self.gradient_checkpointing and self.training and use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+ )
+ use_cache = False
+
+ if inputs_embeds is None:
+ inputs_embeds = self.wte(input_ids)
+
+ inputs_embeds = nn.functional.dropout(inputs_embeds, p=self.emb_pdrop, training=self.training)
+
+ # kept for BC (non `Cache` `past_key_values` inputs)
+ return_legacy_cache = False
+ if use_cache and not isinstance(past_key_values, Cache):
+ return_legacy_cache = True
+ if past_key_values is None:
+ past_key_values = DynamicCache()
+ else:
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+ logger.warning_once(
+ "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+ "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+ "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+ )
+
+ if cache_position is None:
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+ cache_position = torch.arange(
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+ )
+
+ if position_ids is None:
+ position_ids = cache_position.unsqueeze(0)
+
+ causal_mask = self._update_causal_mask(
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+ )
+
+ # embed positions
+ hidden_states = inputs_embeds
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ all_router_logits = () if output_router_logits else None
+ next_decoder_cache = None
+
+ for block in self.blocks:
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+ block_outputs = self._gradient_checkpointing_func(
+ block.__call__,
+ hidden_states,
+ causal_mask,
+ position_ids,
+ past_key_values,
+ output_attentions,
+ output_router_logits,
+ use_cache,
+ cache_position,
+ )
+ else:
+ block_outputs = block(
+ hidden_states,
+ attention_mask=causal_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_values,
+ output_attentions=output_attentions,
+ output_router_logits=output_router_logits,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ )
+
+ hidden_states = block_outputs[0]
+
+ if use_cache:
+ next_decoder_cache = block_outputs[2 if output_attentions else 1]
+
+ if output_attentions:
+ all_self_attns += (block_outputs[1],)
+
+ if output_router_logits:
+ all_router_logits += (block_outputs[-1],)
+
+ hidden_states = self.norm_f(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if return_legacy_cache:
+ next_cache = next_cache.to_legacy_cache()
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
+ if v is not None
+ )
+ return MoeModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ router_logits=all_router_logits,
+ )
+
+ # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
+ def _update_causal_mask(
+ self,
+ attention_mask: Union[torch.Tensor, "BlockMask"],
+ input_tensor: torch.Tensor,
+ cache_position: torch.Tensor,
+ past_key_values: Cache,
+ output_attentions: bool = False,
+ ):
+ if self.config._attn_implementation == "flash_attention_2":
+ if attention_mask is not None and (attention_mask == 0.0).any():
+ return attention_mask
+ return None
+ if self.config._attn_implementation == "flex_attention":
+ if isinstance(attention_mask, torch.Tensor):
+ attention_mask = make_flex_block_causal_mask(attention_mask)
+ return attention_mask
+
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+ # to infer the attention mask.
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+ using_static_cache = isinstance(past_key_values, StaticCache)
+
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+ if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
+ attention_mask,
+ inputs_embeds=input_tensor,
+ past_key_values_length=past_seen_tokens,
+ is_training=self.training,
+ ):
+ return None
+
+ dtype, device = input_tensor.dtype, input_tensor.device
+ sequence_length = input_tensor.shape[1]
+ if using_static_cache:
+ target_length = past_key_values.get_max_cache_shape()
+ else:
+ target_length = (
+ attention_mask.shape[-1]
+ if isinstance(attention_mask, torch.Tensor)
+ else past_seen_tokens + sequence_length + 1
+ )
+
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask,
+ sequence_length=sequence_length,
+ target_length=target_length,
+ dtype=dtype,
+ device=device,
+ cache_position=cache_position,
+ batch_size=input_tensor.shape[0],
+ )
+
+ if (
+ self.config._attn_implementation == "sdpa"
+ and attention_mask is not None
+ and attention_mask.device.type in ["cuda", "xpu", "npu"]
+ and not output_attentions
+ ):
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+ # Details: https://github.com/pytorch/pytorch/issues/110213
+ min_dtype = torch.finfo(dtype).min
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+ return causal_mask
+
+ @staticmethod
+ # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position
+ def _prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask: torch.Tensor,
+ sequence_length: int,
+ target_length: int,
+ dtype: torch.dtype,
+ device: torch.device,
+ cache_position: torch.Tensor,
+ batch_size: int,
+ **kwargs,
+ ):
+ """
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+ Args:
+ attention_mask (`torch.Tensor`):
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+ `(batch_size, 1, query_length, key_value_length)`.
+ sequence_length (`int`):
+ The sequence length being processed.
+ target_length (`int`):
+ The target length: when generating with static cache, the mask should be as long as the static cache,
+ to account for the 0 padding, the part of the cache that is not filled yet.
+ dtype (`torch.dtype`):
+ The dtype to use for the 4D attention mask.
+ device (`torch.device`):
+ The device to place the 4D attention mask on.
+ cache_position (`torch.Tensor`):
+ Indices depicting the position of the input sequence tokens in the sequence.
+ batch_size (`int`):
+ Batch size.
+ """
+ if attention_mask is not None and attention_mask.dim() == 4:
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+ causal_mask = attention_mask
+ else:
+ min_dtype = torch.finfo(dtype).min
+ causal_mask = torch.full(
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+ )
+ if sequence_length != 1:
+ causal_mask = torch.triu(causal_mask, diagonal=1)
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+ if attention_mask is not None:
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
+ mask_length = attention_mask.shape[-1]
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+ causal_mask.device
+ )
+ padding_mask = padding_mask == 0
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+ padding_mask, min_dtype
+ )
+
+ return causal_mask
+
+
+@add_start_docstrings("The DBRX Model transformer for causal language modeling.", DBRX_START_DOCSTRING)
+class DbrxForCausalLM(DbrxPreTrainedModel, GenerationMixin):
+ def __init__(self, config: DbrxConfig):
+ super().__init__(config)
+ self.transformer = DbrxModel(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+ self.moe_loss_weight = config.ffn_config.moe_loss_weight
+ self.num_experts = config.ffn_config.moe_num_experts
+ self.num_experts_per_tok = config.ffn_config.moe_top_k
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Embedding:
+ return self.transformer.get_input_embeddings()
+
+ def set_input_embeddings(self, value: nn.Embedding):
+ self.transformer.set_input_embeddings(value)
+
+ def get_output_embeddings(self) -> nn.Linear:
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings: nn.Linear):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder: DbrxModel):
+ self.transformer = decoder
+
+ def get_decoder(self) -> DbrxModel:
+ return self.transformer
+
+ @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_router_logits: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ logits_to_keep: Union[int, torch.Tensor] = 0,
+ **kwargs,
+ ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
+ If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+ If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+ This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+ Returns:
+
+ Example:
+
+ ```python
+ >> from transformers import AutoTokenizer, DbrxForCausalLM
+
+ >> model = DbrxForCausalLM.from_pretrained("databricks/dbrx-instruct")
+ >> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct")
+
+ >> prompt = "Hey, are you conscious? Can you talk to me?"
+ >> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >> # Generate
+ >> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ output_router_logits = (
+ output_router_logits if output_router_logits is not None else self.config.output_router_logits
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.transformer(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ output_router_logits=output_router_logits,
+ return_dict=return_dict,
+ cache_position=cache_position,
+ )
+
+ hidden_states = outputs[0]
+ # No upscaling to float was ever done for Dbrx
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+ loss = None
+ if labels is not None:
+ loss = self.loss_function(
+ logits,
+ labels,
+ vocab_size=self.config.vocab_size,
+ **kwargs,
+ )
+
+ aux_loss = None
+ if output_router_logits:
+ aux_loss = load_balancing_loss_func(
+ outputs.router_logits if return_dict else outputs[-1],
+ self.num_experts,
+ self.num_experts_per_tok,
+ attention_mask,
+ )
+ if labels is not None and loss is not None:
+ loss += self.moe_loss_weight * aux_loss.to(loss.device) # make sure to reside in the same device
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ if output_router_logits:
+ output = (aux_loss,) + output
+ return (loss,) + output if loss is not None else output
+
+ return MoeCausalLMOutputWithPast(
+ loss=loss,
+ aux_loss=aux_loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ router_logits=outputs.router_logits,
+ )
+
+
+__all__ = ["DbrxForCausalLM", "DbrxModel", "DbrxPreTrainedModel"]
diff --git a/docs/transformers/build/lib/transformers/models/deberta/__init__.py b/docs/transformers/build/lib/transformers/models/deberta/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f70972237964208b461eee8141a4f8b1377154f6
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deberta/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_deberta import *
+ from .modeling_deberta import *
+ from .modeling_tf_deberta import *
+ from .tokenization_deberta import *
+ from .tokenization_deberta_fast import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/deberta/configuration_deberta.py b/docs/transformers/build/lib/transformers/models/deberta/configuration_deberta.py
new file mode 100644
index 0000000000000000000000000000000000000000..835c16080a17efa0ea6dc5f4570a97e495fa9931
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deberta/configuration_deberta.py
@@ -0,0 +1,199 @@
+# coding=utf-8
+# Copyright 2020, Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DeBERTa model configuration"""
+
+from collections import OrderedDict
+from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+if TYPE_CHECKING:
+ from ... import FeatureExtractionMixin, PreTrainedTokenizerBase, TensorType
+
+
+logger = logging.get_logger(__name__)
+
+
+class DebertaConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`DebertaModel`] or a [`TFDebertaModel`]. It is
+ used to instantiate a DeBERTa model according to the specified arguments, defining the model architecture.
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the DeBERTa
+ [microsoft/deberta-base](https://huggingface.co/microsoft/deberta-base) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Arguments:
+ vocab_size (`int`, *optional*, defaults to 50265):
+ Vocabulary size of the DeBERTa model. Defines the number of different tokens that can be represented by the
+ `input_ids` passed when calling [`DebertaModel`] or [`TFDebertaModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"silu"`, `"tanh"`, `"gelu_fast"`, `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"`
+ are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention probabilities.
+ max_position_embeddings (`int`, *optional*, defaults to 512):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ type_vocab_size (`int`, *optional*, defaults to 0):
+ The vocabulary size of the `token_type_ids` passed when calling [`DebertaModel`] or [`TFDebertaModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-7):
+ The epsilon used by the layer normalization layers.
+ relative_attention (`bool`, *optional*, defaults to `False`):
+ Whether to use relative position encoding.
+ max_relative_positions (`int`, *optional*, defaults to -1):
+ The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same value
+ as `max_position_embeddings`.
+ pad_token_id (`int`, *optional*, defaults to 0):
+ The value used to pad input_ids.
+ position_biased_input (`bool`, *optional*, defaults to `True`):
+ Whether to add absolute position embeddings to the content embeddings.
+ pos_att_type (`List[str]`, *optional*):
+ The type of relative position attention; it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]` or
+ `["p2c", "c2p"]`.
+ legacy (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should use the legacy `LegacyDebertaOnlyMLMHead`, which does not work properly
+ for mask infilling tasks.
+
+ Example:
+
+ ```python
+ >>> from transformers import DebertaConfig, DebertaModel
+
+ >>> # Initializing a DeBERTa microsoft/deberta-base style configuration
+ >>> configuration = DebertaConfig()
+
+ >>> # Initializing a model (with random weights) from the microsoft/deberta-base style configuration
+ >>> model = DebertaModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "deberta"
+
+ def __init__(
+ self,
+ vocab_size=50265,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ max_position_embeddings=512,
+ type_vocab_size=0,
+ initializer_range=0.02,
+ layer_norm_eps=1e-7,
+ relative_attention=False,
+ max_relative_positions=-1,
+ pad_token_id=0,
+ position_biased_input=True,
+ pos_att_type=None,
+ pooler_dropout=0,
+ pooler_hidden_act="gelu",
+ legacy=True,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.max_position_embeddings = max_position_embeddings
+ self.type_vocab_size = type_vocab_size
+ self.initializer_range = initializer_range
+ self.relative_attention = relative_attention
+ self.max_relative_positions = max_relative_positions
+ self.pad_token_id = pad_token_id
+ self.position_biased_input = position_biased_input
+
+ # Backwards compatibility
+ if isinstance(pos_att_type, str):
+ pos_att_type = [x.strip() for x in pos_att_type.lower().split("|")]
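+ # e.g. the legacy string "c2p|p2c" becomes ["c2p", "p2c"]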
+
+ self.pos_att_type = pos_att_type
+ self.vocab_size = vocab_size
+ self.layer_norm_eps = layer_norm_eps
+
+ self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size)
+ self.pooler_dropout = pooler_dropout
+ self.pooler_hidden_act = pooler_hidden_act
+ self.legacy = legacy
+
+
+# Copied from transformers.models.deberta_v2.configuration_deberta_v2.DebertaV2OnnxConfig
+class DebertaOnnxConfig(OnnxConfig):
+ @property
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
+ if self.task == "multiple-choice":
+ dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+ else:
+ dynamic_axis = {0: "batch", 1: "sequence"}
+ if self._config.type_vocab_size > 0:
+ return OrderedDict(
+ [("input_ids", dynamic_axis), ("attention_mask", dynamic_axis), ("token_type_ids", dynamic_axis)]
+ )
+ else:
+ return OrderedDict([("input_ids", dynamic_axis), ("attention_mask", dynamic_axis)])
+
+ @property
+ def default_onnx_opset(self) -> int:
+ return 12
+
+ def generate_dummy_inputs(
+ self,
+ preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
+ batch_size: int = -1,
+ seq_length: int = -1,
+ num_choices: int = -1,
+ is_pair: bool = False,
+ framework: Optional["TensorType"] = None,
+ num_channels: int = 3,
+ image_width: int = 40,
+ image_height: int = 40,
+ tokenizer: "PreTrainedTokenizerBase" = None,
+ ) -> Mapping[str, Any]:
+ dummy_inputs = super().generate_dummy_inputs(preprocessor=preprocessor, framework=framework)
+ if self._config.type_vocab_size == 0 and "token_type_ids" in dummy_inputs:
+ del dummy_inputs["token_type_ids"]
+ return dummy_inputs
+
+
+__all__ = ["DebertaConfig", "DebertaOnnxConfig"]
diff --git a/docs/transformers/build/lib/transformers/models/deberta/modeling_deberta.py b/docs/transformers/build/lib/transformers/models/deberta/modeling_deberta.py
new file mode 100644
index 0000000000000000000000000000000000000000..c574460513b956576edef6813c16d35207d980c5
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deberta/modeling_deberta.py
@@ -0,0 +1,1352 @@
+# coding=utf-8
+# Copyright 2020 Microsoft and the Hugging Face Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DeBERTa model."""
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_outputs import (
+ BaseModelOutput,
+ MaskedLMOutput,
+ QuestionAnsweringModelOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_deberta import DebertaConfig
+
+
+logger = logging.get_logger(__name__)
+_CONFIG_FOR_DOC = "DebertaConfig"
+_CHECKPOINT_FOR_DOC = "microsoft/deberta-base"
+
+# Masked LM docstring
+_CHECKPOINT_FOR_MASKED_LM = "lsanochkin/deberta-large-feedback"
+_MASKED_LM_EXPECTED_OUTPUT = "' Paris'"
+_MASKED_LM_EXPECTED_LOSS = "0.54"
+
+# QuestionAnswering docstring
+_CHECKPOINT_FOR_QA = "Palak/microsoft_deberta-large_squad"
+_QA_EXPECTED_OUTPUT = "' a nice puppet'"
+_QA_EXPECTED_LOSS = 0.14
+_QA_TARGET_START_INDEX = 12
+_QA_TARGET_END_INDEX = 14
+
+
+class DebertaLayerNorm(nn.Module):
+ """LayerNorm module in the TF style (epsilon inside the square root)."""
+
+ def __init__(self, size, eps=1e-12):
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(size))
+ self.bias = nn.Parameter(torch.zeros(size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ input_type = hidden_states.dtype
+ hidden_states = hidden_states.float()
+ mean = hidden_states.mean(-1, keepdim=True)
+ variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True)
+ hidden_states = (hidden_states - mean) / torch.sqrt(variance + self.variance_epsilon)
+ hidden_states = hidden_states.to(input_type)
+ y = self.weight * hidden_states + self.bias
+ return y
+
+
+class DebertaSelfOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+@torch.jit.script
+def build_relative_position(query_layer, key_layer):
+ """
+ Build relative position according to the query and key
+
+ We assume the absolute position of the query \\(P_q\\) ranges over (0, query_size) and the absolute position of the
+ key \\(P_k\\) ranges over (0, key_size). The relative position from query to key is \\(R_{q \\rightarrow k} = P_q -
+ P_k\\)
+
+ Args:
+ query_layer (`torch.Tensor`): the query tensor; its second-to-last dimension is the query length
+ key_layer (`torch.Tensor`): the key tensor; its second-to-last dimension is the key length
+
+ Return:
+ `torch.LongTensor`: A tensor with shape [1, query_size, key_size]
+
+ """
+
+ query_size = query_layer.size(-2)
+ key_size = key_layer.size(-2)
+
+ q_ids = torch.arange(query_size, dtype=torch.long, device=query_layer.device)
+ k_ids = torch.arange(key_size, dtype=torch.long, device=key_layer.device)
+ rel_pos_ids = q_ids[:, None] - k_ids.view(1, -1).repeat(query_size, 1)
+ rel_pos_ids = rel_pos_ids[:query_size, :]
+ rel_pos_ids = rel_pos_ids.unsqueeze(0)
+ return rel_pos_ids
+
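+# Worked example (for illustration): with query_size = key_size = 3 the returned tensor of shape
+# [1, 3, 3] is [[[0, -1, -2], [1, 0, -1], [2, 1, 0]]], i.e. entry [q, k] holds q - k, the signed
+# distance from the query position to the key position.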
+
+@torch.jit.script
+def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
+ return c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])
+
+
+@torch.jit.script
+def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
+ return c2p_pos.expand([query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)])
+
+
+@torch.jit.script
+def pos_dynamic_expand(pos_index, p2c_att, key_layer):
+ return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2)))
+
+
+###### To support a general trace, we have to define these operations as they use Python objects (sizes) #############
+# which are not supported by torch.jit.trace.
+# Full credits to @Szustarol
+@torch.jit.script
+def scaled_size_sqrt(query_layer: torch.Tensor, scale_factor: int):
+ return torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor)
+
+
+@torch.jit.script
+def build_rpos(query_layer: torch.Tensor, key_layer: torch.Tensor, relative_pos):
+ if query_layer.size(-2) != key_layer.size(-2):
+ return build_relative_position(query_layer, key_layer)
+ else:
+ return relative_pos
+
+
+@torch.jit.script
+def compute_attention_span(query_layer: torch.Tensor, key_layer: torch.Tensor, max_relative_positions: int):
+ return torch.tensor(min(max(query_layer.size(-2), key_layer.size(-2)), max_relative_positions))
+
+
+@torch.jit.script
+def uneven_size_corrected(p2c_att, query_layer: torch.Tensor, key_layer: torch.Tensor, relative_pos):
+ if query_layer.size(-2) != key_layer.size(-2):
+ pos_index = relative_pos[:, :, :, 0].unsqueeze(-1)
+ return torch.gather(p2c_att, dim=2, index=pos_dynamic_expand(pos_index, p2c_att, key_layer))
+ else:
+ return p2c_att
+
+
+########################################################################################################################
+
+
+class DisentangledSelfAttention(nn.Module):
+ """
+ Disentangled self-attention module
+
+ Parameters:
+ config ([`DebertaConfig`]):
+ A model config class instance with the configuration to build a new model. The schema is similar to
+ *BertConfig*; for more details, please refer to [`DebertaConfig`]
+
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0:
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+ f"heads ({config.num_attention_heads})"
+ )
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+ self.in_proj = nn.Linear(config.hidden_size, self.all_head_size * 3, bias=False)
+ self.q_bias = nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+ self.v_bias = nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+ self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []
+
+ self.relative_attention = getattr(config, "relative_attention", False)
+ self.talking_head = getattr(config, "talking_head", False)
+
+ if self.talking_head:
+ self.head_logits_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False)
+ self.head_weights_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False)
+ else:
+ self.head_logits_proj = None
+ self.head_weights_proj = None
+
+ if self.relative_attention:
+ self.max_relative_positions = getattr(config, "max_relative_positions", -1)
+ if self.max_relative_positions < 1:
+ self.max_relative_positions = config.max_position_embeddings
+ self.pos_dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ if "c2p" in self.pos_att_type:
+ self.pos_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
+ if "p2c" in self.pos_att_type:
+ self.pos_q_proj = nn.Linear(config.hidden_size, self.all_head_size)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ def transpose_for_scores(self, x):
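+ # Reshape [batch, seq_len, num_heads * d] -> [batch, num_heads, seq_len, d]; d is inferred, so this
+ # also works on the fused QKV projection before it is chunked.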
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1)
+ x = x.view(new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ output_attentions: bool = False,
+ query_states: Optional[torch.Tensor] = None,
+ relative_pos: Optional[torch.Tensor] = None,
+ rel_embeddings: Optional[torch.Tensor] = None,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ """
+ Call the module
+
+ Args:
+ hidden_states (`torch.FloatTensor`):
+ Input states to the module, usually the output of the previous layer; they are used as the Q, K and V in
+ *Attention(Q,K,V)*
+
+ attention_mask (`torch.BoolTensor`):
+ An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size and *N* is the maximum
+ sequence length, in which element [i,j] = *1* means the *i*-th token in the input can attend to the *j*-th
+ token.
+
+ output_attentions (`bool`, *optional*):
+ Whether to return the attention matrix.
+
+ query_states (`torch.FloatTensor`, *optional*):
+ The *Q* state in *Attention(Q,K,V)*.
+
+ relative_pos (`torch.LongTensor`):
+ The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
+ values ranging in [*-max_relative_positions*, *max_relative_positions*].
+
+ rel_embeddings (`torch.FloatTensor`):
+ The embedding of relative distances. It's a tensor of shape [\\(2 \\times
+ \\text{max_relative_positions}\\), *hidden_size*].
+
+
+ """
+ if query_states is None:
+ qp = self.in_proj(hidden_states) # .split(self.all_head_size, dim=-1)
+ query_layer, key_layer, value_layer = self.transpose_for_scores(qp).chunk(3, dim=-1)
+ else:
+ ws = self.in_proj.weight.chunk(self.num_attention_heads * 3, dim=0)
+ qkvw = [torch.cat([ws[i * 3 + k] for i in range(self.num_attention_heads)], dim=0) for k in range(3)]
+ q = torch.matmul(qkvw[0], query_states.t().to(dtype=qkvw[0].dtype))
+ k = torch.matmul(qkvw[1], hidden_states.t().to(dtype=qkvw[1].dtype))
+ v = torch.matmul(qkvw[2], hidden_states.t().to(dtype=qkvw[2].dtype))
+ query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q, k, v]]
+
+ query_layer = query_layer + self.transpose_for_scores(self.q_bias[None, None, :])
+ value_layer = value_layer + self.transpose_for_scores(self.v_bias[None, None, :])
+
+ rel_att: int = 0
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ scale_factor = 1 + len(self.pos_att_type)
+ scale = scaled_size_sqrt(query_layer, scale_factor)
+ query_layer = query_layer / scale.to(dtype=query_layer.dtype)
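+ # Dividing the query by sqrt(head_size * scale_factor) up front is equivalent to scaling the raw
+ # attention scores; the extra factor accounts for the disentangled (c2p/p2c) bias terms added below.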
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ if self.relative_attention and rel_embeddings is not None and relative_pos is not None:
+ rel_embeddings = self.pos_dropout(rel_embeddings)
+ rel_att = self.disentangled_att_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor)
+
+ if rel_att is not None:
+ attention_scores = attention_scores + rel_att
+
+ # bxhxlxd
+ if self.head_logits_proj is not None:
+ attention_scores = self.head_logits_proj(attention_scores.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+
+ attention_mask = attention_mask.bool()
+ attention_scores = attention_scores.masked_fill(~(attention_mask), torch.finfo(query_layer.dtype).min)
+ # bsz x height x length x dimension
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ attention_probs = self.dropout(attention_probs)
+ if self.head_weights_proj is not None:
+ attention_probs = self.head_weights_proj(attention_probs.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (-1,)
+ context_layer = context_layer.view(new_context_layer_shape)
+ if not output_attentions:
+ return (context_layer, None)
+ return (context_layer, attention_probs)
+
+ def disentangled_att_bias(
+ self,
+ query_layer: torch.Tensor,
+ key_layer: torch.Tensor,
+ relative_pos: torch.Tensor,
+ rel_embeddings: torch.Tensor,
+ scale_factor: int,
+ ):
+ if relative_pos is None:
+ relative_pos = build_relative_position(query_layer, key_layer)
+ if relative_pos.dim() == 2:
+ relative_pos = relative_pos.unsqueeze(0).unsqueeze(0)
+ elif relative_pos.dim() == 3:
+ relative_pos = relative_pos.unsqueeze(1)
+ # bxhxqxk
+ elif relative_pos.dim() != 4:
+ raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
+
+ att_span = compute_attention_span(query_layer, key_layer, self.max_relative_positions)
+ relative_pos = relative_pos.long()
+ rel_embeddings = rel_embeddings[
+ self.max_relative_positions - att_span : self.max_relative_positions + att_span, :
+ ].unsqueeze(0)
+
+ score = 0
+
+ # content->position
+ if "c2p" in self.pos_att_type:
+ pos_key_layer = self.pos_proj(rel_embeddings)
+ pos_key_layer = self.transpose_for_scores(pos_key_layer)
+ c2p_att = torch.matmul(query_layer, pos_key_layer.transpose(-1, -2))
+ c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1)
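+ # relative_pos lies in [-att_span, att_span]; shifting by att_span and clamping maps it into the
+ # valid [0, 2 * att_span - 1] index range used by the gather below.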
+ c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_dynamic_expand(c2p_pos, query_layer, relative_pos))
+ score += c2p_att
+
+ # position->content
+ if "p2c" in self.pos_att_type:
+ pos_query_layer = self.pos_q_proj(rel_embeddings)
+ pos_query_layer = self.transpose_for_scores(pos_query_layer)
+ pos_query_layer /= scaled_size_sqrt(pos_query_layer, scale_factor)
+ r_pos = build_rpos(
+ query_layer,
+ key_layer,
+ relative_pos,
+ )
+ p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1)
+ p2c_att = torch.matmul(key_layer, pos_query_layer.transpose(-1, -2).to(dtype=key_layer.dtype))
+ p2c_att = torch.gather(
+ p2c_att, dim=-1, index=p2c_dynamic_expand(p2c_pos, query_layer, key_layer)
+ ).transpose(-1, -2)
+
+ p2c_att = uneven_size_corrected(p2c_att, query_layer, key_layer, relative_pos)
+ score += p2c_att
+
+ return score
+
+
+class DebertaEmbeddings(nn.Module):
+ """Construct the embeddings from word, position and token_type embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+ pad_token_id = getattr(config, "pad_token_id", 0)
+ self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
+ self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx=pad_token_id)
+
+ self.position_biased_input = getattr(config, "position_biased_input", True)
+ if not self.position_biased_input:
+ self.position_embeddings = None
+ else:
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size)
+
+ if config.type_vocab_size > 0:
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size)
+ else:
+ self.token_type_embeddings = None
+
+ if self.embedding_size != config.hidden_size:
+ self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
+ else:
+ self.embed_proj = None
+
+ self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.config = config
+
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+ self.register_buffer(
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+ )
+
+ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None):
+ if input_ids is not None:
+ input_shape = input_ids.size()
+ else:
+ input_shape = inputs_embeds.size()[:-1]
+
+ seq_length = input_shape[1]
+
+ if position_ids is None:
+ position_ids = self.position_ids[:, :seq_length]
+
+ if token_type_ids is None:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+
+ if self.position_embeddings is not None:
+ position_embeddings = self.position_embeddings(position_ids.long())
+ else:
+ position_embeddings = torch.zeros_like(inputs_embeds)
+
+ embeddings = inputs_embeds
+ if self.position_biased_input:
+ embeddings = embeddings + position_embeddings
+ if self.token_type_embeddings is not None:
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+ embeddings = embeddings + token_type_embeddings
+
+ if self.embed_proj is not None:
+ embeddings = self.embed_proj(embeddings)
+
+ embeddings = self.LayerNorm(embeddings)
+
+ if mask is not None:
+ if mask.dim() != embeddings.dim():
+ if mask.dim() == 4:
+ mask = mask.squeeze(1).squeeze(1)
+ mask = mask.unsqueeze(2)
+ mask = mask.to(embeddings.dtype)
+
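+ # Zero out the embeddings at padding positions; `mask` has shape [batch, seq_len, 1] here and is
+ # broadcast over the hidden dimension.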
+ embeddings = embeddings * mask
+
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+class DebertaAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.self = DisentangledSelfAttention(config)
+ self.output = DebertaSelfOutput(config)
+ self.config = config
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask,
+ output_attentions: bool = False,
+ query_states=None,
+ relative_pos=None,
+ rel_embeddings=None,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ self_output, att_matrix = self.self(
+ hidden_states,
+ attention_mask,
+ output_attentions,
+ query_states=query_states,
+ relative_pos=relative_pos,
+ rel_embeddings=rel_embeddings,
+ )
+ if query_states is None:
+ query_states = hidden_states
+ attention_output = self.output(self_output, query_states)
+
+ if output_attentions:
+ return (attention_output, att_matrix)
+ else:
+ return (attention_output, None)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Deberta
+class DebertaIntermediate(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class DebertaOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.config = config
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class DebertaLayer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.attention = DebertaAttention(config)
+ self.intermediate = DebertaIntermediate(config)
+ self.output = DebertaOutput(config)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask,
+ query_states=None,
+ relative_pos=None,
+ rel_embeddings=None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ attention_output, att_matrix = self.attention(
+ hidden_states,
+ attention_mask,
+ output_attentions=output_attentions,
+ query_states=query_states,
+ relative_pos=relative_pos,
+ rel_embeddings=rel_embeddings,
+ )
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+
+ if output_attentions:
+ return (layer_output, att_matrix)
+ else:
+ return (layer_output, None)
+
+
+class DebertaEncoder(nn.Module):
+ """Modified BertEncoder with relative position bias support"""
+
+ def __init__(self, config):
+ super().__init__()
+ self.layer = nn.ModuleList([DebertaLayer(config) for _ in range(config.num_hidden_layers)])
+ self.relative_attention = getattr(config, "relative_attention", False)
+ if self.relative_attention:
+ self.max_relative_positions = getattr(config, "max_relative_positions", -1)
+ if self.max_relative_positions < 1:
+ self.max_relative_positions = config.max_position_embeddings
+ self.rel_embeddings = nn.Embedding(self.max_relative_positions * 2, config.hidden_size)
+ self.gradient_checkpointing = False
+
+ def get_rel_embedding(self):
+ rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
+ return rel_embeddings
+
+ def get_attention_mask(self, attention_mask):
+ if attention_mask.dim() <= 2:
+ extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+ attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1)
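+ # For a 2D mask of shape [batch, seq_len] this produces a [batch, 1, seq_len, seq_len] mask whose
+ # entry [b, 0, i, j] is mask[b, i] * mask[b, j], allowing attention only between non-padding tokens.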
+ elif attention_mask.dim() == 3:
+ attention_mask = attention_mask.unsqueeze(1)
+
+ return attention_mask
+
+ def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
+ if self.relative_attention and relative_pos is None:
+ if query_states is not None:
+ relative_pos = build_relative_position(query_states, hidden_states)
+ else:
+ relative_pos = build_relative_position(hidden_states, hidden_states)
+ return relative_pos
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ output_hidden_states: bool = True,
+ output_attentions: bool = False,
+ query_states=None,
+ relative_pos=None,
+ return_dict: bool = True,
+ ):
+ attention_mask = self.get_attention_mask(attention_mask)
+ relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
+
+ all_hidden_states: Optional[Tuple[torch.Tensor]] = (hidden_states,) if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ next_kv = hidden_states
+
+ rel_embeddings = self.get_rel_embedding()
+ for i, layer_module in enumerate(self.layer):
+ if self.gradient_checkpointing and self.training:
+ hidden_states, att_m = self._gradient_checkpointing_func(
+ layer_module.__call__,
+ next_kv,
+ attention_mask,
+ query_states,
+ relative_pos,
+ rel_embeddings,
+ output_attentions,
+ )
+ else:
+ hidden_states, att_m = layer_module(
+ next_kv,
+ attention_mask,
+ query_states=query_states,
+ relative_pos=relative_pos,
+ rel_embeddings=rel_embeddings,
+ output_attentions=output_attentions,
+ )
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if query_states is not None:
+ query_states = hidden_states
+ else:
+ next_kv = hidden_states
+
+ if output_attentions:
+ all_attentions = all_attentions + (att_m,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
+ )
+
+
+class DebertaPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = DebertaConfig
+ base_model_prefix = "deberta"
+ _keys_to_ignore_on_load_unexpected = ["position_embeddings"]
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ """Initialize the weights."""
+ if isinstance(module, nn.Linear):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, (nn.LayerNorm, DebertaLayerNorm)):
+ module.weight.data.fill_(1.0)
+ module.bias.data.zero_()
+ elif isinstance(module, DisentangledSelfAttention):
+ module.q_bias.data.zero_()
+ module.v_bias.data.zero_()
+ elif isinstance(module, (LegacyDebertaLMPredictionHead, DebertaLMPredictionHead)):
+ module.bias.data.zero_()
+
+
+DEBERTA_START_DOCSTRING = r"""
+ The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
+ Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built
+ on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and an enhanced mask decoder. With those two
+ improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB of pretraining data.
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+
+ Parameters:
+ config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DEBERTA_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.",
+ DEBERTA_START_DOCSTRING,
+)
+class DebertaModel(DebertaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.embeddings = DebertaEmbeddings(config)
+ self.encoder = DebertaEncoder(config)
+ self.z_steps = 0
+ self.config = config
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, new_embeddings):
+ self.embeddings.word_embeddings = new_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ raise NotImplementedError("The prune function is not implemented in DeBERTa model.")
+
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+ input_shape = input_ids.size()
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+ if attention_mask is None:
+ attention_mask = torch.ones(input_shape, device=device)
+ if token_type_ids is None:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ )
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask,
+ output_hidden_states=True,
+ output_attentions=output_attentions,
+ return_dict=return_dict,
+ )
+ encoded_layers = encoder_outputs[1]
+
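+ # z_steps is 0 by default, so this block is normally skipped. When set above 1, it re-applies the
+ # final encoder layer z_steps - 1 extra times, refining the query states while keeping the keys and
+ # values from the penultimate layer (this appears to correspond to DeBERTa's enhanced mask decoder).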
+ if self.z_steps > 1:
+ hidden_states = encoded_layers[-2]
+ layers = [self.encoder.layer[-1] for _ in range(self.z_steps)]
+ query_states = encoded_layers[-1]
+ rel_embeddings = self.encoder.get_rel_embedding()
+ attention_mask = self.encoder.get_attention_mask(attention_mask)
+ rel_pos = self.encoder.get_rel_pos(embedding_output)
+ for layer in layers[1:]:
+ query_states = layer(
+ hidden_states,
+ attention_mask,
+ output_attentions=False,
+ query_states=query_states,
+ relative_pos=rel_pos,
+ rel_embeddings=rel_embeddings,
+ )
+ encoded_layers.append(query_states)
+
+ sequence_output = encoded_layers[-1]
+
+ if not return_dict:
+ return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :]
+
+ return BaseModelOutput(
+ last_hidden_state=sequence_output,
+ hidden_states=encoder_outputs.hidden_states if output_hidden_states else None,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+class LegacyDebertaPredictionHeadTransform(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
+
+ self.dense = nn.Linear(config.hidden_size, self.embedding_size)
+ if isinstance(config.hidden_act, str):
+ self.transform_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.transform_act_fn = config.hidden_act
+ self.LayerNorm = nn.LayerNorm(self.embedding_size, eps=config.layer_norm_eps)
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+ return hidden_states
+
+
+class LegacyDebertaLMPredictionHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.transform = LegacyDebertaPredictionHeadTransform(config)
+
+ self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
+ self.decoder = nn.Linear(self.embedding_size, config.vocab_size, bias=False)
+
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+ self.decoder.bias = self.bias
+
+ def _tie_weights(self):
+ self.decoder.bias = self.bias
+
+ def forward(self, hidden_states):
+ hidden_states = self.transform(hidden_states)
+ hidden_states = self.decoder(hidden_states)
+ return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->LegacyDeberta
+class LegacyDebertaOnlyMLMHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.predictions = LegacyDebertaLMPredictionHead(config)
+
+ def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
+ prediction_scores = self.predictions(sequence_output)
+ return prediction_scores
+
+
+class DebertaLMPredictionHead(nn.Module):
+ """https://github.com/microsoft/DeBERTa/blob/master/DeBERTa/deberta/bert.py#L270"""
+
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+
+ if isinstance(config.hidden_act, str):
+ self.transform_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.transform_act_fn = config.hidden_act
+
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=True)
+
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+ # note that the input embeddings must be passed as an argument
+ def forward(self, hidden_states, word_embeddings):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(
+ hidden_states
+ ) # original used MaskedLayerNorm, but passed no mask. This is equivalent.
+ hidden_states = torch.matmul(hidden_states, word_embeddings.weight.t()) + self.bias
+ return hidden_states
+
+
+class DebertaOnlyMLMHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.lm_head = DebertaLMPredictionHead(config)
+
+ # note that the input embeddings must be passed as an argument
+ def forward(self, sequence_output, word_embeddings):
+ prediction_scores = self.lm_head(sequence_output, word_embeddings)
+ return prediction_scores
+
+
+@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
+class DebertaForMaskedLM(DebertaPreTrainedModel):
+ _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.legacy = config.legacy
+ self.deberta = DebertaModel(config)
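+ # config.legacy selects between the original decoder-based MLM head (`cls`) and a newer head
+ # (`lm_predictions`) that projects onto the transposed word-embedding matrix at forward time,
+ # which is why the tied-weight keys differ between the two branches.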
+ if self.legacy:
+ self.cls = LegacyDebertaOnlyMLMHead(config)
+ else:
+ self._tied_weights_keys = ["lm_predictions.lm_head.weight", "deberta.embeddings.word_embeddings.weight"]
+ self.lm_predictions = DebertaOnlyMLMHead(config)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_output_embeddings(self):
+ if self.legacy:
+ return self.cls.predictions.decoder
+ else:
+ return self.lm_predictions.lm_head.dense
+
+ def set_output_embeddings(self, new_embeddings):
+ if self.legacy:
+ self.cls.predictions.decoder = new_embeddings
+ self.cls.predictions.bias = new_embeddings.bias
+ else:
+ self.lm_predictions.lm_head.dense = new_embeddings
+ self.lm_predictions.lm_head.bias = new_embeddings.bias
+
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_MASKED_LM,
+ output_type=MaskedLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ mask="[MASK]",
+ expected_output=_MASKED_LM_EXPECTED_OUTPUT,
+ expected_loss=_MASKED_LM_EXPECTED_LOSS,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, MaskedLMOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
+ the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ """
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deberta(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+ if self.legacy:
+ prediction_scores = self.cls(sequence_output)
+ else:
+ prediction_scores = self.lm_predictions(sequence_output, self.deberta.embeddings.word_embeddings)
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[1:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return MaskedLMOutput(
+ loss=masked_lm_loss,
+ logits=prediction_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+class ContextPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
+ self.dropout = nn.Dropout(config.pooler_dropout)
+ self.config = config
+
+ def forward(self, hidden_states):
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+
+ context_token = hidden_states[:, 0]
+ context_token = self.dropout(context_token)
+ pooled_output = self.dense(context_token)
+ pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output)
+ return pooled_output
+
+ @property
+ def output_dim(self):
+ return self.config.hidden_size
+
+
+@add_start_docstrings(
+ """
+ DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+ pooled output) e.g. for GLUE tasks.
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+class DebertaForSequenceClassification(DebertaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ num_labels = getattr(config, "num_labels", 2)
+ self.num_labels = num_labels
+
+ self.deberta = DebertaModel(config)
+ self.pooler = ContextPooler(config)
+ output_dim = self.pooler.output_dim
+
+ self.classifier = nn.Linear(output_dim, num_labels)
+ drop_out = getattr(config, "cls_dropout", None)
+ drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
+ self.dropout = nn.Dropout(drop_out)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.deberta.get_input_embeddings()
+
+ def set_input_embeddings(self, new_embeddings):
+ self.deberta.set_input_embeddings(new_embeddings)
+
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=SequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deberta(
+ input_ids,
+ token_type_ids=token_type_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ encoder_layer = outputs[0]
+ pooled_output = self.pooler(encoder_layer)
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+
+ loss = None
+ if labels is not None:
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ # regression task
+ loss_fn = nn.MSELoss()
+ logits = logits.view(-1).to(labels.dtype)
+ loss = loss_fn(logits, labels.view(-1))
+ elif labels.dim() == 1 or labels.size(-1) == 1:
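+ # Rows whose label is negative are treated as unlabeled and dropped from the cross-entropy loss.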
+ label_index = (labels >= 0).nonzero()
+ labels = labels.long()
+ if label_index.size(0) > 0:
+ labeled_logits = torch.gather(
+ logits, 0, label_index.expand(label_index.size(0), logits.size(1))
+ )
+ labels = torch.gather(labels, 0, label_index.view(-1))
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1))
+ else:
+ loss = torch.tensor(0).to(logits)
+ else:
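+ # Labels supplied as per-class weights (soft labels): compute the cross-entropy against that distribution.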
+ log_softmax = nn.LogSoftmax(-1)
+ loss = -((log_softmax(logits) * labels).sum(-1)).mean()
+ elif self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+ )
+
+
+@add_start_docstrings(
+ """
+ DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+ Named-Entity-Recognition (NER) tasks.
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+class DebertaForTokenClassification(DebertaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.deberta = DebertaModel(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, TokenClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deberta(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ sequence_output = self.dropout(sequence_output)
+ logits = self.classifier(sequence_output)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TokenClassifierOutput(
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+ )
+
+
+@add_start_docstrings(
+ """
+ DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+ layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+class DebertaForQuestionAnswering(DebertaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.deberta = DebertaModel(config)
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_QA,
+ output_type=QuestionAnsweringModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_QA_EXPECTED_OUTPUT,
+ expected_loss=_QA_EXPECTED_LOSS,
+ qa_target_start_index=_QA_TARGET_START_INDEX,
+ qa_target_end_index=_QA_TARGET_END_INDEX,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ start_positions: Optional[torch.Tensor] = None,
+ end_positions: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
+ r"""
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account when computing the loss.
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account when computing the loss.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deberta(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ logits = self.qa_outputs(sequence_output)
+ start_logits, end_logits = logits.split(1, dim=-1)
+ start_logits = start_logits.squeeze(-1).contiguous()
+ end_logits = end_logits.squeeze(-1).contiguous()
+
+ total_loss = None
+ if start_positions is not None and end_positions is not None:
+ # If we are on multi-GPU, splitting adds an extra dimension, so squeeze it away
+ if len(start_positions.size()) > 1:
+ start_positions = start_positions.squeeze(-1)
+ if len(end_positions.size()) > 1:
+ end_positions = end_positions.squeeze(-1)
+ # sometimes the start/end positions are outside our model inputs; we ignore these terms
+ ignored_index = start_logits.size(1)
+ start_positions = start_positions.clamp(0, ignored_index)
+ end_positions = end_positions.clamp(0, ignored_index)
+
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+ start_loss = loss_fct(start_logits, start_positions)
+ end_loss = loss_fct(end_logits, end_positions)
+ total_loss = (start_loss + end_loss) / 2
+
+ if not return_dict:
+ output = (start_logits, end_logits) + outputs[1:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return QuestionAnsweringModelOutput(
+ loss=total_loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = [
+ "DebertaForMaskedLM",
+ "DebertaForQuestionAnswering",
+ "DebertaForSequenceClassification",
+ "DebertaForTokenClassification",
+ "DebertaModel",
+ "DebertaPreTrainedModel",
+]
diff --git a/docs/transformers/build/lib/transformers/models/deberta/modeling_tf_deberta.py b/docs/transformers/build/lib/transformers/models/deberta/modeling_tf_deberta.py
new file mode 100644
index 0000000000000000000000000000000000000000..cad151711323a2eac33f3858bccef0ce285afc9a
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deberta/modeling_tf_deberta.py
@@ -0,0 +1,1652 @@
+# coding=utf-8
+# Copyright 2021 Microsoft and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 DeBERTa model."""
+
+from __future__ import annotations
+
+import math
+from typing import Dict, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+ TFBaseModelOutput,
+ TFMaskedLMOutput,
+ TFQuestionAnsweringModelOutput,
+ TFSequenceClassifierOutput,
+ TFTokenClassifierOutput,
+)
+from ...modeling_tf_utils import (
+ TFMaskedLanguageModelingLoss,
+ TFModelInputType,
+ TFPreTrainedModel,
+ TFQuestionAnsweringLoss,
+ TFSequenceClassificationLoss,
+ TFTokenClassificationLoss,
+ get_initializer,
+ keras,
+ unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_deberta import DebertaConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+_CONFIG_FOR_DOC = "DebertaConfig"
+_CHECKPOINT_FOR_DOC = "kamalkraj/deberta-base"
+
+
+class TFDebertaContextPooler(keras.layers.Layer):
+ def __init__(self, config: DebertaConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.dense = keras.layers.Dense(config.pooler_hidden_size, name="dense")
+ self.dropout = TFDebertaStableDropout(config.pooler_dropout, name="dropout")
+ self.config = config
+
+ def call(self, hidden_states, training: bool = False):
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ context_token = hidden_states[:, 0]
+ context_token = self.dropout(context_token, training=training)
+ pooled_output = self.dense(context_token)
+ pooled_output = get_tf_activation(self.config.pooler_hidden_act)(pooled_output)
+ return pooled_output
+
+ @property
+ def output_dim(self) -> int:
+ return self.config.hidden_size
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.pooler_hidden_size])
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+
+
+class TFDebertaXSoftmax(keras.layers.Layer):
+ """
+ Masked Softmax which is optimized for saving memory
+
+ Args:
+ input (`tf.Tensor`): The input tensor that will apply softmax.
+ mask (`tf.Tensor`): The mask matrix where 0 indicates that the element will be ignored in the softmax calculation.
+ dim (int): The dimension along which the softmax is applied
+ """
+
+ def __init__(self, axis=-1, **kwargs):
+ super().__init__(**kwargs)
+ self.axis = axis
+
+ def call(self, inputs: tf.Tensor, mask: tf.Tensor):
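+ # Masked positions are pushed to -inf before the softmax and forced back to exactly 0 afterwards,
+ # so they receive no attention probability mass.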
+ rmask = tf.logical_not(tf.cast(mask, tf.bool))
+ output = tf.where(rmask, tf.cast(float("-inf"), dtype=self.compute_dtype), inputs)
+ output = stable_softmax(tf.cast(output, dtype=tf.float32), self.axis)
+ output = tf.where(rmask, 0.0, output)
+ return output
+
+
+class TFDebertaStableDropout(keras.layers.Layer):
+ """
+ Optimized dropout module for stabilizing the training
+
+ Args:
+ drop_prob (float): the dropout probability
+ """
+
+ def __init__(self, drop_prob, **kwargs):
+ super().__init__(**kwargs)
+ self.drop_prob = drop_prob
+
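+ # xdropout implements inverted dropout with a hand-written gradient: the backward pass re-applies
+ # the same Bernoulli mask and 1 / (1 - drop_prob) scale to the upstream gradient.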
+ @tf.custom_gradient
+ def xdropout(self, inputs):
+ """
+ Applies dropout to the inputs, as in vanilla dropout, but also scales the remaining elements up by 1/(1 - drop_prob).
+ """
+ mask = tf.cast(
+ 1
+ - tf.compat.v1.distributions.Bernoulli(probs=1.0 - self.drop_prob).sample(sample_shape=shape_list(inputs)),
+ tf.bool,
+ )
+ scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=self.compute_dtype)
+ if self.drop_prob > 0:
+ inputs = tf.where(mask, tf.cast(0.0, dtype=self.compute_dtype), inputs) * scale
+
+ def grad(upstream):
+ if self.drop_prob > 0:
+ return tf.where(mask, tf.cast(0.0, dtype=self.compute_dtype), upstream) * scale
+ else:
+ return upstream
+
+ return inputs, grad
+
+ def call(self, inputs: tf.Tensor, training: tf.Tensor = False):
+ if training:
+ return self.xdropout(inputs)
+ return inputs
+
+
+class TFDebertaLayerNorm(keras.layers.Layer):
+ """LayerNorm module in the TF style (epsilon inside the square root)."""
+
+ def __init__(self, size, eps=1e-12, **kwargs):
+ super().__init__(**kwargs)
+ self.size = size
+ self.eps = eps
+
+ def build(self, input_shape):
+ self.gamma = self.add_weight(shape=[self.size], initializer=tf.ones_initializer(), name="weight")
+ self.beta = self.add_weight(shape=[self.size], initializer=tf.zeros_initializer(), name="bias")
+ return super().build(input_shape)
+
+ def call(self, x: tf.Tensor) -> tf.Tensor:
+ mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
+ variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
+ std = tf.math.sqrt(variance + self.eps)
+ return self.gamma * (x - mean) / std + self.beta
+
+
+class TFDebertaSelfOutput(keras.layers.Layer):
+ def __init__(self, config: DebertaConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.dense = keras.layers.Dense(config.hidden_size, name="dense")
+ self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+ self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
+ self.config = config
+
+ def call(self, hidden_states, input_tensor, training: bool = False):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states, training=training)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.hidden_size])
+ if getattr(self, "LayerNorm", None) is not None:
+ with tf.name_scope(self.LayerNorm.name):
+ self.LayerNorm.build([None, None, self.config.hidden_size])
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+
+
+class TFDebertaAttention(keras.layers.Layer):
+ def __init__(self, config: DebertaConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.self = TFDebertaDisentangledSelfAttention(config, name="self")
+ self.dense_output = TFDebertaSelfOutput(config, name="output")
+ self.config = config
+
+ def call(
+ self,
+ input_tensor: tf.Tensor,
+ attention_mask: tf.Tensor,
+ query_states: Optional[tf.Tensor] = None,
+ relative_pos: Optional[tf.Tensor] = None,
+ rel_embeddings: Optional[tf.Tensor] = None,
+ output_attentions: bool = False,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ self_outputs = self.self(
+ hidden_states=input_tensor,
+ attention_mask=attention_mask,
+ query_states=query_states,
+ relative_pos=relative_pos,
+ rel_embeddings=rel_embeddings,
+ output_attentions=output_attentions,
+ training=training,
+ )
+ if query_states is None:
+ query_states = input_tensor
+ attention_output = self.dense_output(
+ hidden_states=self_outputs[0], input_tensor=query_states, training=training
+ )
+
+ output = (attention_output,) + self_outputs[1:]
+
+ return output
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "self", None) is not None:
+ with tf.name_scope(self.self.name):
+ self.self.build(None)
+ if getattr(self, "dense_output", None) is not None:
+ with tf.name_scope(self.dense_output.name):
+ self.dense_output.build(None)
+
+
+class TFDebertaIntermediate(keras.layers.Layer):
+ def __init__(self, config: DebertaConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = keras.layers.Dense(
+ units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+ else:
+ self.intermediate_act_fn = config.hidden_act
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.hidden_size])
+
+
+class TFDebertaOutput(keras.layers.Layer):
+ def __init__(self, config: DebertaConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = keras.layers.Dense(
+ units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+ self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+ self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.dropout(hidden_states, training=training)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.intermediate_size])
+ if getattr(self, "LayerNorm", None) is not None:
+ with tf.name_scope(self.LayerNorm.name):
+ self.LayerNorm.build([None, None, self.config.hidden_size])
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+
+
+class TFDebertaLayer(keras.layers.Layer):
+ def __init__(self, config: DebertaConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.attention = TFDebertaAttention(config, name="attention")
+ self.intermediate = TFDebertaIntermediate(config, name="intermediate")
+ self.bert_output = TFDebertaOutput(config, name="output")
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ attention_mask: tf.Tensor,
+ query_states: Optional[tf.Tensor] = None,
+ relative_pos: Optional[tf.Tensor] = None,
+ rel_embeddings: Optional[tf.Tensor] = None,
+ output_attentions: bool = False,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ attention_outputs = self.attention(
+ input_tensor=hidden_states,
+ attention_mask=attention_mask,
+ query_states=query_states,
+ relative_pos=relative_pos,
+ rel_embeddings=rel_embeddings,
+ output_attentions=output_attentions,
+ training=training,
+ )
+ attention_output = attention_outputs[0]
+ intermediate_output = self.intermediate(hidden_states=attention_output)
+ layer_output = self.bert_output(
+ hidden_states=intermediate_output, input_tensor=attention_output, training=training
+ )
+ outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them
+
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "attention", None) is not None:
+ with tf.name_scope(self.attention.name):
+ self.attention.build(None)
+ if getattr(self, "intermediate", None) is not None:
+ with tf.name_scope(self.intermediate.name):
+ self.intermediate.build(None)
+ if getattr(self, "bert_output", None) is not None:
+ with tf.name_scope(self.bert_output.name):
+ self.bert_output.build(None)
+
+
+class TFDebertaEncoder(keras.layers.Layer):
+ def __init__(self, config: DebertaConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.layer = [TFDebertaLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
+ self.relative_attention = getattr(config, "relative_attention", False)
+ self.config = config
+ if self.relative_attention:
+ self.max_relative_positions = getattr(config, "max_relative_positions", -1)
+ if self.max_relative_positions < 1:
+ self.max_relative_positions = config.max_position_embeddings
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if self.relative_attention:
+ self.rel_embeddings = self.add_weight(
+ name="rel_embeddings.weight",
+ shape=[self.max_relative_positions * 2, self.config.hidden_size],
+ initializer=get_initializer(self.config.initializer_range),
+ )
+ if getattr(self, "layer", None) is not None:
+ for layer in self.layer:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+ def get_rel_embedding(self):
+ rel_embeddings = self.rel_embeddings if self.relative_attention else None
+ return rel_embeddings
+
+ def get_attention_mask(self, attention_mask):
+ if len(shape_list(attention_mask)) <= 2:
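+ # Expand a [batch, seq_len] mask into a pairwise [batch, 1, seq_len, seq_len] mask: position
+ # (i, j) is kept only when both token i and token j are unmasked.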
+ extended_attention_mask = tf.expand_dims(tf.expand_dims(attention_mask, 1), 2)
+ attention_mask = extended_attention_mask * tf.expand_dims(tf.squeeze(extended_attention_mask, -2), -1)
+ attention_mask = tf.cast(attention_mask, tf.uint8)
+ elif len(shape_list(attention_mask)) == 3:
+ attention_mask = tf.expand_dims(attention_mask, 1)
+
+ return attention_mask
+
+ def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
+ if self.relative_attention and relative_pos is None:
+ q = shape_list(query_states)[-2] if query_states is not None else shape_list(hidden_states)[-2]
+ relative_pos = build_relative_position(q, shape_list(hidden_states)[-2])
+ return relative_pos
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ attention_mask: tf.Tensor,
+ query_states: Optional[tf.Tensor] = None,
+ relative_pos: Optional[tf.Tensor] = None,
+ output_attentions: bool = False,
+ output_hidden_states: bool = False,
+ return_dict: bool = True,
+ training: bool = False,
+ ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
+ all_hidden_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ attention_mask = self.get_attention_mask(attention_mask)
+ relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
+
+ if isinstance(hidden_states, Sequence):
+ next_kv = hidden_states[0]
+ else:
+ next_kv = hidden_states
+
+ rel_embeddings = self.get_rel_embedding()
+
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_outputs = layer_module(
+ hidden_states=next_kv,
+ attention_mask=attention_mask,
+ query_states=query_states,
+ relative_pos=relative_pos,
+ rel_embeddings=rel_embeddings,
+ output_attentions=output_attentions,
+ training=training,
+ )
+ hidden_states = layer_outputs[0]
+
+ if query_states is not None:
+ query_states = hidden_states
+ if isinstance(hidden_states, Sequence):
+ next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None
+ else:
+ next_kv = hidden_states
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ # Add last layer
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
+
+ return TFBaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
+ )
+
+
+def build_relative_position(query_size, key_size):
+ """
+ Build relative position according to the query and key
+
+ We assume the absolute position of the query \\(P_q\\) ranges from (0, query_size) and the absolute position of the
+ key \\(P_k\\) ranges from (0, key_size). The relative position from query to key is \\(R_{q \\rightarrow k} = P_q -
+ P_k\\)
+
+ Args:
+ query_size (int): the length of query
+ key_size (int): the length of key
+
+ Return:
+ `tf.Tensor`: A tensor with shape [1, query_size, key_size]
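+
+ Example: for query_size=3 and key_size=3 the single batch entry is
+ [[0, -1, -2],
+ [1, 0, -1],
+ [2, 1, 0]]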
+
+ """
+ q_ids = tf.range(query_size, dtype=tf.int32)
+ k_ids = tf.range(key_size, dtype=tf.int32)
+ rel_pos_ids = q_ids[:, None] - tf.tile(tf.reshape(k_ids, [1, -1]), [query_size, 1])
+ rel_pos_ids = rel_pos_ids[:query_size, :]
+ rel_pos_ids = tf.expand_dims(rel_pos_ids, axis=0)
+ return tf.cast(rel_pos_ids, tf.int64)
+
+
+def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
+ shapes = [
+ shape_list(query_layer)[0],
+ shape_list(query_layer)[1],
+ shape_list(query_layer)[2],
+ shape_list(relative_pos)[-1],
+ ]
+ return tf.broadcast_to(c2p_pos, shapes)
+
+
+def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
+ shapes = [
+ shape_list(query_layer)[0],
+ shape_list(query_layer)[1],
+ shape_list(key_layer)[-2],
+ shape_list(key_layer)[-2],
+ ]
+ return tf.broadcast_to(c2p_pos, shapes)
+
+
+def pos_dynamic_expand(pos_index, p2c_att, key_layer):
+ shapes = shape_list(p2c_att)[:2] + [shape_list(pos_index)[-2], shape_list(key_layer)[-2]]
+ return tf.broadcast_to(pos_index, shapes)
+
+
+def torch_gather(x, indices, gather_axis):
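+ # TF equivalent of torch.gather along an arbitrary axis: the gather axis is rolled to the last
+ # dimension, a batched tf.gather is applied, and the original axis order is then restored.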
+ if gather_axis < 0:
+ gather_axis = tf.rank(x) + gather_axis
+
+ if gather_axis != tf.rank(x) - 1:
+ pre_roll = tf.rank(x) - 1 - gather_axis
+ permutation = tf.roll(tf.range(tf.rank(x)), pre_roll, axis=0)
+ x = tf.transpose(x, perm=permutation)
+ indices = tf.transpose(indices, perm=permutation)
+ else:
+ pre_roll = 0
+
+ flat_x = tf.reshape(x, (-1, tf.shape(x)[-1]))
+ flat_indices = tf.reshape(indices, (-1, tf.shape(indices)[-1]))
+ gathered = tf.gather(flat_x, flat_indices, batch_dims=1)
+ gathered = tf.reshape(gathered, tf.shape(indices))
+
+ if pre_roll != 0:
+ permutation = tf.roll(tf.range(tf.rank(x)), -pre_roll, axis=0)
+ gathered = tf.transpose(gathered, perm=permutation)
+
+ return gathered
+
+
+class TFDebertaDisentangledSelfAttention(keras.layers.Layer):
+ """
+ Disentangled self-attention module
+
+ Parameters:
+ config ([`DebertaConfig`]):
+ A model config class instance with the configuration to build a new model. The schema is similar to
+ *BertConfig*; for more details, please refer to [`DebertaConfig`]
+
+ """
+
+ def __init__(self, config: DebertaConfig, **kwargs):
+ super().__init__(**kwargs)
+ if config.hidden_size % config.num_attention_heads != 0:
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+ f"heads ({config.num_attention_heads})"
+ )
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+ self.in_proj = keras.layers.Dense(
+ self.all_head_size * 3,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="in_proj",
+ use_bias=False,
+ )
+ self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []
+
+ self.relative_attention = getattr(config, "relative_attention", False)
+ self.talking_head = getattr(config, "talking_head", False)
+
+ if self.talking_head:
+ self.head_logits_proj = keras.layers.Dense(
+ self.num_attention_heads,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="head_logits_proj",
+ use_bias=False,
+ )
+ self.head_weights_proj = keras.layers.Dense(
+ self.num_attention_heads,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="head_weights_proj",
+ use_bias=False,
+ )
+
+ self.softmax = TFDebertaXSoftmax(axis=-1)
+
+ if self.relative_attention:
+ self.max_relative_positions = getattr(config, "max_relative_positions", -1)
+ if self.max_relative_positions < 1:
+ self.max_relative_positions = config.max_position_embeddings
+ self.pos_dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="pos_dropout")
+ if "c2p" in self.pos_att_type:
+ self.pos_proj = keras.layers.Dense(
+ self.all_head_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="pos_proj",
+ use_bias=False,
+ )
+ if "p2c" in self.pos_att_type:
+ self.pos_q_proj = keras.layers.Dense(
+ self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="pos_q_proj"
+ )
+
+ self.dropout = TFDebertaStableDropout(config.attention_probs_dropout_prob, name="dropout")
+ self.config = config
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ self.q_bias = self.add_weight(
+ name="q_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros()
+ )
+ self.v_bias = self.add_weight(
+ name="v_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros()
+ )
+ if getattr(self, "in_proj", None) is not None:
+ with tf.name_scope(self.in_proj.name):
+ self.in_proj.build([None, None, self.config.hidden_size])
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+ if getattr(self, "head_logits_proj", None) is not None:
+ with tf.name_scope(self.head_logits_proj.name):
+ self.head_logits_proj.build(None)
+ if getattr(self, "head_weights_proj", None) is not None:
+ with tf.name_scope(self.head_weights_proj.name):
+ self.head_weights_proj.build(None)
+ if getattr(self, "pos_dropout", None) is not None:
+ with tf.name_scope(self.pos_dropout.name):
+ self.pos_dropout.build(None)
+ if getattr(self, "pos_proj", None) is not None:
+ with tf.name_scope(self.pos_proj.name):
+ self.pos_proj.build([self.config.hidden_size])
+ if getattr(self, "pos_q_proj", None) is not None:
+ with tf.name_scope(self.pos_q_proj.name):
+ self.pos_q_proj.build([self.config.hidden_size])
+
+ def transpose_for_scores(self, tensor: tf.Tensor) -> tf.Tensor:
+ shape = shape_list(tensor)[:-1] + [self.num_attention_heads, -1]
+ # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
+ tensor = tf.reshape(tensor=tensor, shape=shape)
+
+ # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
+ return tf.transpose(tensor, perm=[0, 2, 1, 3])
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ attention_mask: tf.Tensor,
+ query_states: Optional[tf.Tensor] = None,
+ relative_pos: Optional[tf.Tensor] = None,
+ rel_embeddings: Optional[tf.Tensor] = None,
+ output_attentions: bool = False,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ """
+ Call the module
+
+ Args:
+ hidden_states (`tf.Tensor`):
+ Input states to the module, usually the output from the previous layer; they will be the Q, K and V in
+ *Attention(Q,K,V)*
+
+ attention_mask (`tf.Tensor`):
+ An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size and *N* is the maximum
+ sequence length; element [i,j] = *1* means the *i*-th token in the input can attend to the *j*-th
+ token.
+
+ output_attentions (`bool`, *optional*):
+ Whether to return the attention matrix.
+
+ query_states (`tf.Tensor`, *optional*):
+ The *Q* state in *Attention(Q,K,V)*.
+
+ relative_pos (`tf.Tensor`):
+ The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
+ values ranging in [*-max_relative_positions*, *max_relative_positions*].
+
+ rel_embeddings (`tf.Tensor`):
+ The embedding of relative distances. It's a tensor of shape [\\(2 \\times
+ \\text{max_relative_positions}\\), *hidden_size*].
+
+
+ """
+ if query_states is None:
+ qp = self.in_proj(hidden_states) # .split(self.all_head_size, dim=-1)
+ query_layer, key_layer, value_layer = tf.split(
+ self.transpose_for_scores(qp), num_or_size_splits=3, axis=-1
+ )
+ else:
+
+ def linear(w, b, x):
+ out = tf.matmul(x, w, transpose_b=True)
+ if b is not None:
+ out += tf.transpose(b)
+ return out
+
+ ws = tf.split(
+ tf.transpose(self.in_proj.weight[0]), num_or_size_splits=self.num_attention_heads * 3, axis=0
+ )
+ qkvw = tf.TensorArray(dtype=self.dtype, size=3)
+ for k in tf.range(3):
+ qkvw_inside = tf.TensorArray(dtype=self.dtype, size=self.num_attention_heads)
+ for i in tf.range(self.num_attention_heads):
+ qkvw_inside = qkvw_inside.write(i, ws[i * 3 + k])
+ qkvw = qkvw.write(k, qkvw_inside.concat())
+ qkvb = [None] * 3
+
+ q = linear(qkvw[0], qkvb[0], query_states)
+ k = linear(qkvw[1], qkvb[1], hidden_states)
+ v = linear(qkvw[2], qkvb[2], hidden_states)
+ query_layer = self.transpose_for_scores(q)
+ key_layer = self.transpose_for_scores(k)
+ value_layer = self.transpose_for_scores(v)
+
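+ # in_proj is bias-free; separate query and value biases are added here (the key gets no bias),
+ # mirroring the PyTorch DisentangledSelfAttention, which only defines q_bias and v_bias.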
+ query_layer = query_layer + self.transpose_for_scores(self.q_bias[None, None, :])
+ value_layer = value_layer + self.transpose_for_scores(self.v_bias[None, None, :])
+
+ rel_att = None
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ scale_factor = 1 + len(self.pos_att_type)
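+ # Each enabled relative-attention term (c2p / p2c) contributes an additional score component, so
+ # the usual 1/sqrt(d) scaling becomes 1/sqrt(d * scale_factor).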
+ scale = math.sqrt(shape_list(query_layer)[-1] * scale_factor)
+ query_layer = query_layer / scale
+
+ attention_scores = tf.matmul(query_layer, tf.transpose(key_layer, [0, 1, 3, 2]))
+ if self.relative_attention:
+ rel_embeddings = self.pos_dropout(rel_embeddings, training=training)
+ rel_att = self.disentangled_att_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor)
+
+ if rel_att is not None:
+ attention_scores = attention_scores + rel_att
+
+ if self.talking_head:
+ attention_scores = tf.transpose(
+ self.head_logits_proj(tf.transpose(attention_scores, [0, 2, 3, 1])), [0, 3, 1, 2]
+ )
+
+ attention_probs = self.softmax(attention_scores, attention_mask)
+ attention_probs = self.dropout(attention_probs, training=training)
+ if self.talking_head:
+ attention_probs = tf.transpose(
+ self.head_weights_proj(tf.transpose(attention_probs, [0, 2, 3, 1])), [0, 3, 1, 2]
+ )
+
+ context_layer = tf.matmul(attention_probs, value_layer)
+ context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
+ context_layer_shape = shape_list(context_layer)
+ # Set the final dimension here explicitly.
+ # Calling tf.reshape(context_layer, (*context_layer_shape[:-2], -1)) raises an error when executing
+ # the model in graph mode as context_layer is reshaped to (None, 7, None) and Dense layer in TFDebertaV2SelfOutput
+ # requires final input dimension to be defined
+ new_context_layer_shape = context_layer_shape[:-2] + [context_layer_shape[-2] * context_layer_shape[-1]]
+ context_layer = tf.reshape(context_layer, new_context_layer_shape)
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+ return outputs
+
+ def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor):
+ if relative_pos is None:
+ q = shape_list(query_layer)[-2]
+ relative_pos = build_relative_position(q, shape_list(key_layer)[-2])
+ shape_list_pos = shape_list(relative_pos)
+ if len(shape_list_pos) == 2:
+ relative_pos = tf.expand_dims(tf.expand_dims(relative_pos, 0), 0)
+ elif len(shape_list_pos) == 3:
+ relative_pos = tf.expand_dims(relative_pos, 1)
+ # bxhxqxk
+ elif len(shape_list_pos) != 4:
+ raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {len(shape_list_pos)}")
+
+ att_span = tf.cast(
+ tf.minimum(
+ tf.maximum(shape_list(query_layer)[-2], shape_list(key_layer)[-2]), self.max_relative_positions
+ ),
+ tf.int64,
+ )
+ rel_embeddings = tf.expand_dims(
+ rel_embeddings[self.max_relative_positions - att_span : self.max_relative_positions + att_span, :], 0
+ )
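+ # Relative distances are later shifted by att_span and clipped into [0, 2 * att_span) so they can
+ # be used as gather indices over the 2 * att_span relative positions kept above.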
+
+ score = 0
+
+ # content->position
+ if "c2p" in self.pos_att_type:
+ pos_key_layer = self.pos_proj(rel_embeddings)
+ pos_key_layer = self.transpose_for_scores(pos_key_layer)
+ c2p_att = tf.matmul(query_layer, tf.transpose(pos_key_layer, [0, 1, 3, 2]))
+ c2p_pos = tf.clip_by_value(relative_pos + att_span, 0, att_span * 2 - 1)
+ c2p_att = torch_gather(c2p_att, c2p_dynamic_expand(c2p_pos, query_layer, relative_pos), -1)
+ score += c2p_att
+
+ # position->content
+ if "p2c" in self.pos_att_type:
+ pos_query_layer = self.pos_q_proj(rel_embeddings)
+ pos_query_layer = self.transpose_for_scores(pos_query_layer)
+ pos_query_layer /= tf.math.sqrt(
+ tf.cast(shape_list(pos_query_layer)[-1] * scale_factor, dtype=self.compute_dtype)
+ )
+ if shape_list(query_layer)[-2] != shape_list(key_layer)[-2]:
+ r_pos = build_relative_position(shape_list(key_layer)[-2], shape_list(key_layer)[-2])
+ else:
+ r_pos = relative_pos
+ p2c_pos = tf.clip_by_value(-r_pos + att_span, 0, att_span * 2 - 1)
+ p2c_att = tf.matmul(key_layer, tf.transpose(pos_query_layer, [0, 1, 3, 2]))
+ p2c_att = tf.transpose(
+ torch_gather(p2c_att, p2c_dynamic_expand(p2c_pos, query_layer, key_layer), -1), [0, 1, 3, 2]
+ )
+ if shape_list(query_layer)[-2] != shape_list(key_layer)[-2]:
+ pos_index = tf.expand_dims(relative_pos[:, :, :, 0], -1)
+ p2c_att = torch_gather(p2c_att, pos_dynamic_expand(pos_index, p2c_att, key_layer), -2)
+ score += p2c_att
+
+ return score
+
+
+class TFDebertaEmbeddings(keras.layers.Layer):
+ """Construct the embeddings from word, position and token_type embeddings."""
+
+ def __init__(self, config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.config = config
+ self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
+ self.hidden_size = config.hidden_size
+ self.max_position_embeddings = config.max_position_embeddings
+ self.position_biased_input = getattr(config, "position_biased_input", True)
+ self.initializer_range = config.initializer_range
+ if self.embedding_size != config.hidden_size:
+ self.embed_proj = keras.layers.Dense(
+ config.hidden_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="embed_proj",
+ use_bias=False,
+ )
+ self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+ self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
+
+ def build(self, input_shape=None):
+ with tf.name_scope("word_embeddings"):
+ self.weight = self.add_weight(
+ name="weight",
+ shape=[self.config.vocab_size, self.embedding_size],
+ initializer=get_initializer(self.initializer_range),
+ )
+
+ with tf.name_scope("token_type_embeddings"):
+ if self.config.type_vocab_size > 0:
+ self.token_type_embeddings = self.add_weight(
+ name="embeddings",
+ shape=[self.config.type_vocab_size, self.embedding_size],
+ initializer=get_initializer(self.initializer_range),
+ )
+ else:
+ self.token_type_embeddings = None
+
+ with tf.name_scope("position_embeddings"):
+ if self.position_biased_input:
+ self.position_embeddings = self.add_weight(
+ name="embeddings",
+ shape=[self.max_position_embeddings, self.hidden_size],
+ initializer=get_initializer(self.initializer_range),
+ )
+ else:
+ self.position_embeddings = None
+
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "LayerNorm", None) is not None:
+ with tf.name_scope(self.LayerNorm.name):
+ self.LayerNorm.build([None, None, self.config.hidden_size])
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+ if getattr(self, "embed_proj", None) is not None:
+ with tf.name_scope(self.embed_proj.name):
+ self.embed_proj.build([None, None, self.embedding_size])
+
+ def call(
+ self,
+ input_ids: Optional[tf.Tensor] = None,
+ position_ids: Optional[tf.Tensor] = None,
+ token_type_ids: Optional[tf.Tensor] = None,
+ inputs_embeds: Optional[tf.Tensor] = None,
+ mask: Optional[tf.Tensor] = None,
+ training: bool = False,
+ ) -> tf.Tensor:
+ """
+ Applies the embedding lookup based on the input tensors.
+
+ Returns:
+ final_embeddings (`tf.Tensor`): output embedding tensor.
+ """
+ if input_ids is None and inputs_embeds is None:
+ raise ValueError("Need to provide either `input_ids` or `input_embeds`.")
+
+ if input_ids is not None:
+ check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+ inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
+
+ input_shape = shape_list(inputs_embeds)[:-1]
+
+ if token_type_ids is None:
+ token_type_ids = tf.fill(dims=input_shape, value=0)
+
+ if position_ids is None:
+ position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
+
+ final_embeddings = inputs_embeds
+ if self.position_biased_input:
+ position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
+ final_embeddings += position_embeds
+ if self.config.type_vocab_size > 0:
+ token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
+ final_embeddings += token_type_embeds
+
+ if self.embedding_size != self.hidden_size:
+ final_embeddings = self.embed_proj(final_embeddings)
+
+ final_embeddings = self.LayerNorm(final_embeddings)
+
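+ # Zero out the embeddings at masked (e.g. padding) positions so they do not contribute downstream.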
+ if mask is not None:
+ if len(shape_list(mask)) != len(shape_list(final_embeddings)):
+ if len(shape_list(mask)) == 4:
+ mask = tf.squeeze(tf.squeeze(mask, axis=1), axis=1)
+ mask = tf.cast(tf.expand_dims(mask, axis=2), dtype=self.compute_dtype)
+
+ final_embeddings = final_embeddings * mask
+
+ final_embeddings = self.dropout(final_embeddings, training=training)
+
+ return final_embeddings
+
+
+class TFDebertaPredictionHeadTransform(keras.layers.Layer):
+ def __init__(self, config: DebertaConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
+
+ self.dense = keras.layers.Dense(
+ units=self.embedding_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="dense",
+ )
+
+ if isinstance(config.hidden_act, str):
+ self.transform_act_fn = get_tf_activation(config.hidden_act)
+ else:
+ self.transform_act_fn = config.hidden_act
+ self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.hidden_size])
+ if getattr(self, "LayerNorm", None) is not None:
+ with tf.name_scope(self.LayerNorm.name):
+ self.LayerNorm.build([None, None, self.embedding_size])
+
+
+class TFDebertaLMPredictionHead(keras.layers.Layer):
+ def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs):
+ super().__init__(**kwargs)
+
+ self.config = config
+ self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
+
+ self.transform = TFDebertaPredictionHeadTransform(config, name="transform")
+
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
+ self.input_embeddings = input_embeddings
+
+ def build(self, input_shape=None):
+ self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
+
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "transform", None) is not None:
+ with tf.name_scope(self.transform.name):
+ self.transform.build(None)
+
+ def get_output_embeddings(self) -> keras.layers.Layer:
+ return self.input_embeddings
+
+ def set_output_embeddings(self, value: tf.Variable):
+ self.input_embeddings.weight = value
+ self.input_embeddings.vocab_size = shape_list(value)[0]
+
+ def get_bias(self) -> Dict[str, tf.Variable]:
+ return {"bias": self.bias}
+
+ def set_bias(self, value: tf.Variable):
+ self.bias = value["bias"]
+ self.config.vocab_size = shape_list(value["bias"])[0]
+
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+ hidden_states = self.transform(hidden_states=hidden_states)
+ seq_length = shape_list(hidden_states)[1]
+ hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
+ hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
+ hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
+ hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
+
+ return hidden_states
+
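+ # A hedged sketch of the weight-tied projection performed in `call` above: the hidden
+ # states are flattened, multiplied with the transposed input embedding matrix, and
+ # reshaped back, so the logits share weights with the embeddings. Shapes only, assuming
+ # batch size B, sequence length S, embedding size E and vocab size V:
+ #
+ #   hidden_states: (B, S, E) -> reshape -> (B * S, E)
+ #   (B * S, E) @ weight.T, where weight is (V, E) -> (B * S, V)
+ #   reshape -> (B, S, V), then an output-only bias of shape (V,) is added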
+
+class TFDebertaOnlyMLMHead(keras.layers.Layer):
+ def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs):
+ super().__init__(**kwargs)
+ self.predictions = TFDebertaLMPredictionHead(config, input_embeddings, name="predictions")
+
+ def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
+ prediction_scores = self.predictions(hidden_states=sequence_output)
+
+ return prediction_scores
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "predictions", None) is not None:
+ with tf.name_scope(self.predictions.name):
+ self.predictions.build(None)
+
+
+# @keras_serializable
+class TFDebertaMainLayer(keras.layers.Layer):
+ config_class = DebertaConfig
+
+ def __init__(self, config: DebertaConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.config = config
+
+ self.embeddings = TFDebertaEmbeddings(config, name="embeddings")
+ self.encoder = TFDebertaEncoder(config, name="encoder")
+
+ def get_input_embeddings(self) -> keras.layers.Layer:
+ return self.embeddings
+
+ def set_input_embeddings(self, value: tf.Variable):
+ self.embeddings.weight = value
+ self.embeddings.vocab_size = shape_list(value)[0]
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the
+ base class `PreTrainedModel` for details.
+ """
+ raise NotImplementedError
+
+ @unpack_inputs
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ input_shape = shape_list(input_ids)
+ elif inputs_embeds is not None:
+ input_shape = shape_list(inputs_embeds)[:-1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if attention_mask is None:
+ attention_mask = tf.fill(dims=input_shape, value=1)
+
+ if token_type_ids is None:
+ token_type_ids = tf.fill(dims=input_shape, value=0)
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ token_type_ids=token_type_ids,
+ inputs_embeds=inputs_embeds,
+ mask=attention_mask,
+ training=training,
+ )
+
+ encoder_outputs = self.encoder(
+ hidden_states=embedding_output,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ sequence_output = encoder_outputs[0]
+
+ if not return_dict:
+ return (sequence_output,) + encoder_outputs[1:]
+
+ return TFBaseModelOutput(
+ last_hidden_state=sequence_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "embeddings", None) is not None:
+ with tf.name_scope(self.embeddings.name):
+ self.embeddings.build(None)
+ if getattr(self, "encoder", None) is not None:
+ with tf.name_scope(self.encoder.name):
+ self.encoder.build(None)
+
+
+class TFDebertaPreTrainedModel(TFPreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = DebertaConfig
+ base_model_prefix = "deberta"
+
+
+DEBERTA_START_DOCSTRING = r"""
+ The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
+ Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It is built
+ on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and an enhanced mask decoder. With those
+ two improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB of pretraining data.
+
+ This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+ as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+ behavior.
+
+
+
+ TensorFlow models and layers in `transformers` accept two formats as input:
+
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional argument.
+
+ The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+ and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+ pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+ format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+ the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+ positional argument:
+
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+ Note that when creating models and layers with
+ [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+ about any of this, as you can just pass inputs like you would to any other Python function!
+
+
+
+ Parameters:
+ config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
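+ # Hedged usage sketch of the three input formats described in the docstring above. The
+ # checkpoint name is only an assumption for illustration; any DeBERTa checkpoint with TF
+ # weights (or loaded with `from_pt=True`) would do.
+ #
+ #   from transformers import AutoTokenizer, TFDebertaModel
+ #
+ #   tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
+ #   model = TFDebertaModel.from_pretrained("microsoft/deberta-base", from_pt=True)
+ #   enc = tokenizer("Hello world", return_tensors="tf")
+ #
+ #   out1 = model(enc["input_ids"])                                   # single tensor
+ #   out2 = model([enc["input_ids"], enc["attention_mask"]])          # list, docstring order
+ #   out3 = model({"input_ids": enc["input_ids"],
+ #                 "attention_mask": enc["attention_mask"]})          # dict keyed by input name
+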
+DEBERTA_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]`, and each example must have the shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.",
+ DEBERTA_START_DOCSTRING,
+)
+class TFDebertaModel(TFDebertaPreTrainedModel):
+ def __init__(self, config: DebertaConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.deberta = TFDebertaMainLayer(config, name="deberta")
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFBaseModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
+ outputs = self.deberta(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deberta", None) is not None:
+ with tf.name_scope(self.deberta.name):
+ self.deberta.build(None)
+
+
+@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
+class TFDebertaForMaskedLM(TFDebertaPreTrainedModel, TFMaskedLanguageModelingLoss):
+ def __init__(self, config: DebertaConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ if config.is_decoder:
+ logger.warning(
+ "If you want to use `TFDebertaForMaskedLM` make sure `config.is_decoder=False` for "
+ "bi-directional self-attention."
+ )
+
+ self.deberta = TFDebertaMainLayer(config, name="deberta")
+ self.mlm = TFDebertaOnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls")
+
+ def get_lm_head(self) -> keras.layers.Layer:
+ return self.mlm.predictions
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFMaskedLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
+ r"""
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked); the
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ """
+ outputs = self.deberta(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ sequence_output = outputs[0]
+ prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFMaskedLMOutput(
+ loss=loss,
+ logits=prediction_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deberta", None) is not None:
+ with tf.name_scope(self.deberta.name):
+ self.deberta.build(None)
+ if getattr(self, "mlm", None) is not None:
+ with tf.name_scope(self.mlm.name):
+ self.mlm.build(None)
+
+
+@add_start_docstrings(
+ """
+ DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+ pooled output) e.g. for GLUE tasks.
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+class TFDebertaForSequenceClassification(TFDebertaPreTrainedModel, TFSequenceClassificationLoss):
+ def __init__(self, config: DebertaConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.num_labels = config.num_labels
+
+ self.deberta = TFDebertaMainLayer(config, name="deberta")
+ self.pooler = TFDebertaContextPooler(config, name="pooler")
+
+ drop_out = getattr(config, "cls_dropout", None)
+ drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
+ self.dropout = TFDebertaStableDropout(drop_out, name="cls_dropout")
+ self.classifier = keras.layers.Dense(
+ units=config.num_labels,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="classifier",
+ )
+ self.output_dim = self.pooler.output_dim
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFSequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
+ r"""
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), if
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ outputs = self.deberta(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ sequence_output = outputs[0]
+ pooled_output = self.pooler(sequence_output, training=training)
+ pooled_output = self.dropout(pooled_output, training=training)
+ logits = self.classifier(pooled_output)
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+
+ return ((loss,) + output) if loss is not None else output
+
+ return TFSequenceClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deberta", None) is not None:
+ with tf.name_scope(self.deberta.name):
+ self.deberta.build(None)
+ if getattr(self, "pooler", None) is not None:
+ with tf.name_scope(self.pooler.name):
+ self.pooler.build(None)
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+ if getattr(self, "classifier", None) is not None:
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, self.output_dim])
+
+
+@add_start_docstrings(
+ """
+ DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+ Named-Entity-Recognition (NER) tasks.
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+class TFDebertaForTokenClassification(TFDebertaPreTrainedModel, TFTokenClassificationLoss):
+ def __init__(self, config: DebertaConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.num_labels = config.num_labels
+
+ self.deberta = TFDebertaMainLayer(config, name="deberta")
+ self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+ self.classifier = keras.layers.Dense(
+ units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+ )
+ self.config = config
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFTokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
+ r"""
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+ """
+ outputs = self.deberta(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ sequence_output = outputs[0]
+ sequence_output = self.dropout(sequence_output, training=training)
+ logits = self.classifier(inputs=sequence_output)
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFTokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deberta", None) is not None:
+ with tf.name_scope(self.deberta.name):
+ self.deberta.build(None)
+ if getattr(self, "classifier", None) is not None:
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+ """
+ DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+ layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+class TFDebertaForQuestionAnswering(TFDebertaPreTrainedModel, TFQuestionAnsweringLoss):
+ def __init__(self, config: DebertaConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.num_labels = config.num_labels
+
+ self.deberta = TFDebertaMainLayer(config, name="deberta")
+ self.qa_outputs = keras.layers.Dense(
+ units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
+ )
+ self.config = config
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFQuestionAnsweringModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ start_positions: np.ndarray | tf.Tensor | None = None,
+ end_positions: np.ndarray | tf.Tensor | None = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
+ r"""
+ start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ """
+ outputs = self.deberta(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ sequence_output = outputs[0]
+ logits = self.qa_outputs(inputs=sequence_output)
+ start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
+ start_logits = tf.squeeze(input=start_logits, axis=-1)
+ end_logits = tf.squeeze(input=end_logits, axis=-1)
+ loss = None
+
+ if start_positions is not None and end_positions is not None:
+ labels = {"start_position": start_positions}
+ labels["end_position"] = end_positions
+ loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))
+
+ if not return_dict:
+ output = (start_logits, end_logits) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFQuestionAnsweringModelOutput(
+ loss=loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deberta", None) is not None:
+ with tf.name_scope(self.deberta.name):
+ self.deberta.build(None)
+ if getattr(self, "qa_outputs", None) is not None:
+ with tf.name_scope(self.qa_outputs.name):
+ self.qa_outputs.build([None, None, self.config.hidden_size])
+
+
+__all__ = [
+ "TFDebertaForMaskedLM",
+ "TFDebertaForQuestionAnswering",
+ "TFDebertaForSequenceClassification",
+ "TFDebertaForTokenClassification",
+ "TFDebertaModel",
+ "TFDebertaPreTrainedModel",
+]
diff --git a/docs/transformers/build/lib/transformers/models/deberta/tokenization_deberta.py b/docs/transformers/build/lib/transformers/models/deberta/tokenization_deberta.py
new file mode 100644
index 0000000000000000000000000000000000000000..63933c1d2a32fbf1050697a660d0ad83f2b1c2d8
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deberta/tokenization_deberta.py
@@ -0,0 +1,396 @@
+# coding=utf-8
+# Copyright 2020 Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for model DeBERTa."""
+
+import json
+import os
+from typing import List, Optional, Tuple
+
+import regex as re
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
+
+
+# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
+def bytes_to_unicode():
+ """
+ Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
+ characters that the bpe code barfs on.
+
+ The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
+ if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
+ decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
+ tables between utf-8 bytes and unicode strings.
+ """
+ bs = (
+ list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+ )
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8 + n)
+ n += 1
+ cs = [chr(n) for n in cs]
+ return dict(zip(bs, cs))
+
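+ # Illustrative check of the mapping built above (the values follow directly from the code):
+ # printable bytes map to themselves, while whitespace/control bytes are shifted into
+ # otherwise unused unicode code points.
+ #
+ #   byte_encoder = bytes_to_unicode()
+ #   byte_encoder[ord("A")]    # 'A'  -- printable bytes are kept as-is
+ #   byte_encoder[ord(" ")]    # 'Ġ'  -- which is why BPE tokens for words show a leading 'Ġ'
+ #   byte_encoder[ord("\n")]   # 'Ċ'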
+
+# Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs
+def get_pairs(word):
+ """
+ Return set of symbol pairs in a word.
+
+ Word is represented as tuple of symbols (symbols being variable-length strings).
+ """
+ pairs = set()
+ prev_char = word[0]
+ for char in word[1:]:
+ pairs.add((prev_char, char))
+ prev_char = char
+ return pairs
+
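+ # Small example of `get_pairs`, a direct consequence of the loop above:
+ #
+ #   get_pairs(("h", "e", "l", "l", "o"))
+ #   # {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}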
+
+class DebertaTokenizer(PreTrainedTokenizer):
+ """
+ Construct a DeBERTa tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+ be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:
+
+ ```python
+ >>> from transformers import DebertaTokenizer
+
+ >>> tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
+ >>> tokenizer("Hello world")["input_ids"]
+ [1, 31414, 232, 2]
+
+ >>> tokenizer(" Hello world")["input_ids"]
+ [1, 20920, 232, 2]
+ ```
+
+ You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
+ call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+
+
+ When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
+
+
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ bos_token (`str`, *optional*, defaults to `"[CLS]"`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `"[SEP]"`):
+ The end of sequence token.
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+ sequence classification or for a text and a question for question answering. It is also used as the last
+ token of a sequence built with special tokens.
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+ The token used for padding, for example when batching sequences of different lengths.
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ add_prefix_space (`bool`, *optional*, defaults to `False`):
+ Whether or not to add an initial space to the input. This allows treating the leading word just like any
+ other word. (The DeBERTa tokenizer detects the beginning of words by the preceding space.)
+ add_bos_token (`bool`, *optional*, defaults to `False`):
+ Whether or not to add an initial beginning-of-sequence token (`bos_token`) to the input. This allows treating
+ the leading word just like any other word.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ errors="replace",
+ bos_token="[CLS]",
+ eos_token="[SEP]",
+ sep_token="[SEP]",
+ cls_token="[CLS]",
+ unk_token="[UNK]",
+ pad_token="[PAD]",
+ mask_token="[MASK]",
+ add_prefix_space=False,
+ add_bos_token=False,
+ **kwargs,
+ ):
+ bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
+ eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
+ sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
+ cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
+ unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
+ pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
+
+ # Mask token behaves like a normal word, i.e. it includes the space before it
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+ self.add_bos_token = add_bos_token
+
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
+ self.encoder = json.load(vocab_handle)
+ self.decoder = {v: k for k, v in self.encoder.items()}
+ self.errors = errors # how to handle errors in decoding
+ self.byte_encoder = bytes_to_unicode()
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+ with open(merges_file, encoding="utf-8") as merges_handle:
+ bpe_merges = merges_handle.read().split("\n")[1:-1]
+ bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
+ self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+ self.cache = {}
+ self.add_prefix_space = add_prefix_space
+
+ # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+ self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
+ super().__init__(
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ add_bos_token=add_bos_token,
+ **kwargs,
+ )
+
+ @property
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.vocab_size
+ def vocab_size(self):
+ return len(self.encoder)
+
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
+ def get_vocab(self):
+ return dict(self.encoder, **self.added_tokens_encoder)
+
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
+ def bpe(self, token):
+ if token in self.cache:
+ return self.cache[token]
+ word = tuple(token)
+ pairs = get_pairs(word)
+
+ if not pairs:
+ return token
+
+ while True:
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+ if bigram not in self.bpe_ranks:
+ break
+ first, second = bigram
+ new_word = []
+ i = 0
+ while i < len(word):
+ try:
+ j = word.index(first, i)
+ except ValueError:
+ new_word.extend(word[i:])
+ break
+ else:
+ new_word.extend(word[i:j])
+ i = j
+
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+ new_word.append(first + second)
+ i += 2
+ else:
+ new_word.append(word[i])
+ i += 1
+ new_word = tuple(new_word)
+ word = new_word
+ if len(word) == 1:
+ break
+ else:
+ pairs = get_pairs(word)
+ word = " ".join(word)
+ self.cache[token] = word
+ return word
+
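+ # Hedged walk-through of `bpe` with a hypothetical two-entry merge table, to show how the
+ # lowest-ranked pair is always merged first (the ranks below are made up for illustration):
+ #
+ #   self.bpe_ranks = {("l", "o"): 0, ("lo", "w"): 1}
+ #   bpe("low")
+ #   # ("l", "o", "w") -> merge ("l", "o")  -> ("lo", "w")
+ #   #                 -> merge ("lo", "w") -> ("low",)   => returns "low"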
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+ adding special tokens. A DeBERTa sequence has the following format:
+
+ - single sequence: [CLS] X [SEP]
+ - pair of sequences: [CLS] A [SEP] B [SEP]
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+ if token_ids_1 is None:
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+ cls = [self.cls_token_id]
+ sep = [self.sep_token_id]
+ return cls + token_ids_0 + sep + token_ids_1 + sep
+
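+ # Sketch of the output for a hypothetical pair of already-converted id lists, using the
+ # method above (the concrete cls/sep ids depend on the loaded vocabulary):
+ #
+ #   tokenizer.build_inputs_with_special_tokens([10, 11], [20])
+ #   # [cls_token_id, 10, 11, sep_token_id, 20, sep_token_id]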
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ """
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ if token_ids_1 is None:
+ return [1] + ([0] * len(token_ids_0)) + [1]
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
+ sequence pair mask has the following format:
+
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
+
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+ """
+ sep = [self.sep_token_id]
+ cls = [self.cls_token_id]
+
+ if token_ids_1 is None:
+ return len(cls + token_ids_0 + sep) * [0]
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
+ def _tokenize(self, text):
+ """Tokenize a string."""
+ bpe_tokens = []
+ for token in re.findall(self.pat, text):
+ token = "".join(
+ self.byte_encoder[b] for b in token.encode("utf-8")
+ ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+ bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+ return bpe_tokens
+
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.decoder.get(index)
+
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ text = "".join(tokens)
+ text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+ return text
+
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+ merge_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+ )
+
+ with open(vocab_file, "w", encoding="utf-8") as f:
+ f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+ index = 0
+ with open(merge_file, "w", encoding="utf-8") as writer:
+ writer.write("#version: 0.2\n")
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+ if index != token_index:
+ logger.warning(
+ f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+ " Please check that the tokenizer is not corrupted!"
+ )
+ index = token_index
+ writer.write(" ".join(bpe_tokens) + "\n")
+ index += 1
+
+ return vocab_file, merge_file
+
+ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+ add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
+ if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
+ text = " " + text
+ return (text, kwargs)
+
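+ # Behaviour of `prepare_for_tokenization`, restated as a tiny example: with
+ # `add_prefix_space=True` (or `is_split_into_words=True`), a leading space is added so the
+ # first word is encoded like any other word.
+ #
+ #   tokenizer.prepare_for_tokenization("Hello", add_prefix_space=True)
+ #   # (" Hello", {})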
+
+__all__ = ["DebertaTokenizer"]
diff --git a/docs/transformers/build/lib/transformers/models/deberta/tokenization_deberta_fast.py b/docs/transformers/build/lib/transformers/models/deberta/tokenization_deberta_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..368f29b522bc48e218e7f9b782bd88cf818aec62
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deberta/tokenization_deberta_fast.py
@@ -0,0 +1,239 @@
+# coding=utf-8
+# Copyright 2020 Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Tokenization class for model DeBERTa."""
+
+from typing import List, Optional, Tuple
+
+from ...tokenization_utils_base import AddedToken, BatchEncoding
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+from .tokenization_deberta import DebertaTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class DebertaTokenizerFast(PreTrainedTokenizerFast):
+ """
+ Construct a "fast" DeBERTa tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
+ Byte-Pair-Encoding.
+
+ This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+ be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:
+
+ ```python
+ >>> from transformers import DebertaTokenizerFast
+
+ >>> tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base")
+ >>> tokenizer("Hello world")["input_ids"]
+ [1, 31414, 232, 2]
+
+ >>> tokenizer(" Hello world")["input_ids"]
+ [1, 20920, 232, 2]
+ ```
+
+ You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
+ the model was not pretrained this way, it might yield a decrease in performance.
+
+
+
+ When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
+
+
+
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+ refer to this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`, *optional*):
+ Path to the vocabulary file.
+ merges_file (`str`, *optional*):
+ Path to the merges file.
+ tokenizer_file (`str`, *optional*):
+ The path to a tokenizer file to use instead of the vocab file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ bos_token (`str`, *optional*, defaults to `"[CLS]"`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `"[SEP]"`):
+ The end of sequence token.
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+ sequence classification or for a text and a question for question answering. It is also used as the last
+ token of a sequence built with special tokens.
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+ The token used for padding, for example when batching sequences of different lengths.
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ add_prefix_space (`bool`, *optional*, defaults to `False`):
+ Whether or not to add an initial space to the input. This allows treating the leading word just like any
+ other word. (The DeBERTa tokenizer detects the beginning of words by the preceding space.)
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
+ slow_tokenizer_class = DebertaTokenizer
+
+ def __init__(
+ self,
+ vocab_file=None,
+ merges_file=None,
+ tokenizer_file=None,
+ errors="replace",
+ bos_token="[CLS]",
+ eos_token="[SEP]",
+ sep_token="[SEP]",
+ cls_token="[CLS]",
+ unk_token="[UNK]",
+ pad_token="[PAD]",
+ mask_token="[MASK]",
+ add_prefix_space=False,
+ **kwargs,
+ ):
+ super().__init__(
+ vocab_file,
+ merges_file,
+ tokenizer_file=tokenizer_file,
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ **kwargs,
+ )
+ self.add_bos_token = kwargs.pop("add_bos_token", False)
+
+ @property
+ def mask_token(self) -> str:
+ """
+ `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
+ having been set.
+
+ Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
+ comprise the space before the *[MASK]*.
+ """
+ if self._mask_token is None:
+ if self.verbose:
+ logger.error("Using mask_token, but it is not set yet.")
+ return None
+ return str(self._mask_token)
+
+ @mask_token.setter
+ def mask_token(self, value):
+ """
+ Overriding the default behavior of the mask token to have it eat the space before it.
+ """
+ # Mask token behaves like a normal word, i.e. it includes the space before it,
+ # so we set lstrip to True
+ value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
+ self._mask_token = value
+
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+ adding special tokens. A DeBERTa sequence has the following format:
+
+ - single sequence: [CLS] X [SEP]
+ - pair of sequences: [CLS] A [SEP] B [SEP]
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+ if token_ids_1 is None:
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+ cls = [self.cls_token_id]
+ sep = [self.sep_token_id]
+ return cls + token_ids_0 + sep + token_ids_1 + sep
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
+ sequence pair mask has the following format:
+
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
+
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+ """
+ sep = [self.sep_token_id]
+ cls = [self.cls_token_id]
+
+ if token_ids_1 is None:
+ return len(cls + token_ids_0 + sep) * [0]
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+ # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._batch_encode_plus
+ def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
+ is_split_into_words = kwargs.get("is_split_into_words", False)
+ assert self.add_prefix_space or not is_split_into_words, (
+ f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
+ "to use it with pretokenized inputs."
+ )
+
+ return super()._batch_encode_plus(*args, **kwargs)
+
+ # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._encode_plus
+ def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
+ is_split_into_words = kwargs.get("is_split_into_words", False)
+
+ assert self.add_prefix_space or not is_split_into_words, (
+ f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
+ "to use it with pretokenized inputs."
+ )
+
+ return super()._encode_plus(*args, **kwargs)
+
+ # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+ return tuple(files)
+
+
+__all__ = ["DebertaTokenizerFast"]
diff --git a/docs/transformers/build/lib/transformers/models/deberta_v2/__init__.py b/docs/transformers/build/lib/transformers/models/deberta_v2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c42c9c502862416b8942b9150567a33b6ec9301
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deberta_v2/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_deberta_v2 import *
+ from .modeling_deberta_v2 import *
+ from .modeling_tf_deberta_v2 import *
+ from .tokenization_deberta_v2 import *
+ from .tokenization_deberta_v2_fast import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/deberta_v2/configuration_deberta_v2.py b/docs/transformers/build/lib/transformers/models/deberta_v2/configuration_deberta_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b1aaa3f04668300f41380c946755726aac0236f
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deberta_v2/configuration_deberta_v2.py
@@ -0,0 +1,198 @@
+# coding=utf-8
+# Copyright 2020, Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DeBERTa-v2 model configuration"""
+
+from collections import OrderedDict
+from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+if TYPE_CHECKING:
+ from ... import FeatureExtractionMixin, PreTrainedTokenizerBase, TensorType
+
+
+logger = logging.get_logger(__name__)
+
+
+class DebertaV2Config(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used to instantiate a
+ DeBERTa-v2 model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the DeBERTa
+ [microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Arguments:
+ vocab_size (`int`, *optional*, defaults to 128100):
+ Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by
+ the `input_ids` passed when calling [`DebertaV2Model`].
+ hidden_size (`int`, *optional*, defaults to 1536):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 24):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 24):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 6144):
+ Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"silu"`, `"tanh"`, `"gelu_fast"`, `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"`
+ are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention probabilities.
+ max_position_embeddings (`int`, *optional*, defaults to 512):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ type_vocab_size (`int`, *optional*, defaults to 0):
+ The vocabulary size of the `token_type_ids` passed when calling [`DebertaModel`] or [`TFDebertaModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-7):
+ The epsilon used by the layer normalization layers.
+ relative_attention (`bool`, *optional*, defaults to `True`):
+ Whether to use relative position encoding.
+ max_relative_positions (`int`, *optional*, defaults to -1):
+ The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same value
+ as `max_position_embeddings`.
+ pad_token_id (`int`, *optional*, defaults to 0):
+ The value used to pad input_ids.
+ position_biased_input (`bool`, *optional*, defaults to `True`):
+ Whether to add absolute position embeddings to the content embeddings.
+ pos_att_type (`List[str]`, *optional*):
+ The type of relative position attention; it can be a combination of `"p2c"` and `"c2p"`, e.g. `["p2c"]` or
+ `["p2c", "c2p"]`.
+ legacy (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should use the legacy `LegacyDebertaV2OnlyMLMHead`, which does not work properly
+ for mask infilling tasks.
+
+ Example:
+
+ ```python
+ >>> from transformers import DebertaV2Config, DebertaV2Model
+
+ >>> # Initializing a DeBERTa-v2 microsoft/deberta-v2-xlarge style configuration
+ >>> configuration = DebertaV2Config()
+
+ >>> # Initializing a model (with random weights) from the microsoft/deberta-v2-xlarge style configuration
+ >>> model = DebertaV2Model(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "deberta-v2"
+
+ def __init__(
+ self,
+ vocab_size=128100,
+ hidden_size=1536,
+ num_hidden_layers=24,
+ num_attention_heads=24,
+ intermediate_size=6144,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ max_position_embeddings=512,
+ type_vocab_size=0,
+ initializer_range=0.02,
+ layer_norm_eps=1e-7,
+ relative_attention=False,
+ max_relative_positions=-1,
+ pad_token_id=0,
+ position_biased_input=True,
+ pos_att_type=None,
+ pooler_dropout=0,
+ pooler_hidden_act="gelu",
+ legacy=True,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.max_position_embeddings = max_position_embeddings
+ self.type_vocab_size = type_vocab_size
+ self.initializer_range = initializer_range
+ self.relative_attention = relative_attention
+ self.max_relative_positions = max_relative_positions
+ self.pad_token_id = pad_token_id
+ self.position_biased_input = position_biased_input
+
+ # Backwards compatibility
+ if isinstance(pos_att_type, str):
+ pos_att_type = [x.strip() for x in pos_att_type.lower().split("|")]
+
+ self.pos_att_type = pos_att_type
+ self.vocab_size = vocab_size
+ self.layer_norm_eps = layer_norm_eps
+
+ self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size)
+ self.pooler_dropout = pooler_dropout
+ self.pooler_hidden_act = pooler_hidden_act
+ self.legacy = legacy
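+
+ # Illustrative note: older checkpoints may store `pos_att_type` as a pipe-separated string; the
+ # backwards-compatibility branch above normalizes it, e.g. pos_att_type="p2c|c2p" becomes ["p2c", "c2p"].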
+
+
+class DebertaV2OnnxConfig(OnnxConfig):
+ @property
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
+ if self.task == "multiple-choice":
+ dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+ else:
+ dynamic_axis = {0: "batch", 1: "sequence"}
+ if self._config.type_vocab_size > 0:
+ return OrderedDict(
+ [("input_ids", dynamic_axis), ("attention_mask", dynamic_axis), ("token_type_ids", dynamic_axis)]
+ )
+ else:
+ return OrderedDict([("input_ids", dynamic_axis), ("attention_mask", dynamic_axis)])
+
+ @property
+ def default_onnx_opset(self) -> int:
+ return 12
+
+ def generate_dummy_inputs(
+ self,
+ preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
+ batch_size: int = -1,
+ seq_length: int = -1,
+ num_choices: int = -1,
+ is_pair: bool = False,
+ framework: Optional["TensorType"] = None,
+ num_channels: int = 3,
+ image_width: int = 40,
+ image_height: int = 40,
+ tokenizer: "PreTrainedTokenizerBase" = None,
+ ) -> Mapping[str, Any]:
+ dummy_inputs = super().generate_dummy_inputs(preprocessor=preprocessor, framework=framework)
+ if self._config.type_vocab_size == 0 and "token_type_ids" in dummy_inputs:
+ del dummy_inputs["token_type_ids"]
+ return dummy_inputs
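+
+ # Illustrative note: with the default DebertaV2Config (type_vocab_size == 0) and a non-multiple-choice
+ # task, the `inputs` property above resolves to
+ #   OrderedDict([("input_ids", {0: "batch", 1: "sequence"}),
+ #                ("attention_mask", {0: "batch", 1: "sequence"})])
+ # and `token_type_ids` is likewise dropped from the generated dummy inputs.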
+
+
+__all__ = ["DebertaV2Config", "DebertaV2OnnxConfig"]
diff --git a/docs/transformers/build/lib/transformers/models/deberta_v2/modeling_deberta_v2.py b/docs/transformers/build/lib/transformers/models/deberta_v2/modeling_deberta_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..576c4ce8799d9c26db5de749139f2a5c3b0725ef
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deberta_v2/modeling_deberta_v2.py
@@ -0,0 +1,1523 @@
+# coding=utf-8
+# Copyright 2020 Microsoft and the Hugging Face Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DeBERTa-v2 model."""
+
+from collections.abc import Sequence
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_outputs import (
+ BaseModelOutput,
+ MaskedLMOutput,
+ MultipleChoiceModelOutput,
+ QuestionAnsweringModelOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_deberta_v2 import DebertaV2Config
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "DebertaV2Config"
+_CHECKPOINT_FOR_DOC = "microsoft/deberta-v2-xlarge"
+_QA_TARGET_START_INDEX = 2
+_QA_TARGET_END_INDEX = 9
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaLayerNorm->LayerNorm
+class DebertaV2SelfOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+@torch.jit.script
+def make_log_bucket_position(relative_pos, bucket_size: int, max_position: int):
+ sign = torch.sign(relative_pos)
+ mid = bucket_size // 2
+ abs_pos = torch.where(
+ (relative_pos < mid) & (relative_pos > -mid),
+ torch.tensor(mid - 1).type_as(relative_pos),
+ torch.abs(relative_pos),
+ )
+ log_pos = (
+ torch.ceil(torch.log(abs_pos / mid) / torch.log(torch.tensor((max_position - 1) / mid)) * (mid - 1)) + mid
+ )
+ bucket_pos = torch.where(abs_pos <= mid, relative_pos.type_as(log_pos), log_pos * sign)
+ return bucket_pos
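+
+ # Illustrative note: with mid = bucket_size // 2, relative distances with |rel| <= mid keep their exact
+ # value, while larger distances are compressed logarithmically into the remaining buckets up to
+ # max_position, so far-away token pairs share progressively coarser position buckets.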
+
+
+def build_relative_position(query_layer, key_layer, bucket_size: int = -1, max_position: int = -1):
+ """
+ Build relative position according to the query and key
+
+ We assume the absolute position of the query \\(P_q\\) ranges over (0, query_size) and the absolute position of the key
+ \\(P_k\\) ranges over (0, key_size). The relative position from query to key is \\(R_{q \\rightarrow k} = P_q -
+ P_k\\)
+
+ Args:
+ query_layer (`torch.Tensor`): the query tensor; its size(-2) gives the query length and its device is used
+ key_layer (`torch.Tensor`): the key tensor; its size(-2) gives the key length
+ bucket_size (int): the size of position bucket
+ max_position (int): the maximum allowed absolute position
+
+ Return:
+ `torch.LongTensor`: A tensor with shape [1, query_size, key_size]
+ """
+ query_size = query_layer.size(-2)
+ key_size = key_layer.size(-2)
+
+ q_ids = torch.arange(query_size, dtype=torch.long, device=query_layer.device)
+ k_ids = torch.arange(key_size, dtype=torch.long, device=key_layer.device)
+ rel_pos_ids = q_ids[:, None] - k_ids[None, :]
+ if bucket_size > 0 and max_position > 0:
+ rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position)
+ rel_pos_ids = rel_pos_ids.to(torch.long)
+ rel_pos_ids = rel_pos_ids[:query_size, :]
+ rel_pos_ids = rel_pos_ids.unsqueeze(0)
+ return rel_pos_ids
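+
+ # Worked example (illustrative): for query_size == key_size == 3 and no bucketing, rel_pos_ids[0] is
+ #   [[ 0, -1, -2],
+ #    [ 1,  0, -1],
+ #    [ 2,  1,  0]]
+ # i.e. entry [i, j] is the signed distance i - j from query position i to key position j.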
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand
+def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
+ return c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand
+def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
+ return c2p_pos.expand([query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)])
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand
+def pos_dynamic_expand(pos_index, p2c_att, key_layer):
+ return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2)))
+
+
+@torch.jit.script
+def scaled_size_sqrt(query_layer: torch.Tensor, scale_factor: int):
+ return torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor)
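+
+ # Illustrative note: with both "c2p" and "p2c" enabled, scale_factor is 3, so for a per-head size of 64
+ # the attention logits are divided by sqrt(64 * 3) ~= 13.86 instead of the usual sqrt(64) = 8.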
+
+
+@torch.jit.script
+def build_rpos(query_layer, key_layer, relative_pos, position_buckets: int, max_relative_positions: int):
+ if key_layer.size(-2) != query_layer.size(-2):
+ return build_relative_position(
+ key_layer,
+ key_layer,
+ bucket_size=position_buckets,
+ max_position=max_relative_positions,
+ )
+ else:
+ return relative_pos
+
+
+class DisentangledSelfAttention(nn.Module):
+ """
+ Disentangled self-attention module
+
+ Parameters:
+ config (`DebertaV2Config`):
+ A model config class instance with the configuration to build a new model. The schema is similar to
+ *BertConfig*; for more details, please refer to [`DebertaV2Config`].
+
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0:
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+ f"heads ({config.num_attention_heads})"
+ )
+ self.num_attention_heads = config.num_attention_heads
+ _attention_head_size = config.hidden_size // config.num_attention_heads
+ self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+ self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
+ self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
+ self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
+
+ self.share_att_key = getattr(config, "share_att_key", False)
+ self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []
+ self.relative_attention = getattr(config, "relative_attention", False)
+
+ if self.relative_attention:
+ self.position_buckets = getattr(config, "position_buckets", -1)
+ self.max_relative_positions = getattr(config, "max_relative_positions", -1)
+ if self.max_relative_positions < 1:
+ self.max_relative_positions = config.max_position_embeddings
+ self.pos_ebd_size = self.max_relative_positions
+ if self.position_buckets > 0:
+ self.pos_ebd_size = self.position_buckets
+
+ self.pos_dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ if not self.share_att_key:
+ if "c2p" in self.pos_att_type:
+ self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
+ if "p2c" in self.pos_att_type:
+ self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ def transpose_for_scores(self, x, attention_heads) -> torch.Tensor:
+ new_x_shape = x.size()[:-1] + (attention_heads, -1)
+ x = x.view(new_x_shape)
+ return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), x.size(-1))
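+
+ # Shape note (illustrative): transpose_for_scores reshapes (batch, seq_len, num_heads * head_size) into
+ # (batch * num_heads, seq_len, head_size), the layout the torch.bmm calls below expect.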
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask,
+ output_attentions=False,
+ query_states=None,
+ relative_pos=None,
+ rel_embeddings=None,
+ ):
+ """
+ Call the module
+
+ Args:
+ hidden_states (`torch.FloatTensor`):
+ Input states to the module, usually the output from the previous layer; they will be the *Q*, *K* and *V* in
+ *Attention(Q,K,V)*.
+
+ attention_mask (`torch.BoolTensor`):
+ An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size and *N* is the maximum
+ sequence length; element [i,j] = *1* means the *i*-th token in the input can attend to the *j*-th
+ token.
+
+ output_attentions (`bool`, *optional*):
+ Whether to return the attention matrix.
+
+ query_states (`torch.FloatTensor`, *optional*):
+ The *Q* state in *Attention(Q,K,V)*.
+
+ relative_pos (`torch.LongTensor`):
+ The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
+ values ranging in [*-max_relative_positions*, *max_relative_positions*].
+
+ rel_embeddings (`torch.FloatTensor`):
+ The embedding of relative distances. It's a tensor of shape [\\(2 \\times
+ \\text{max_relative_positions}\\), *hidden_size*].
+
+
+ """
+ if query_states is None:
+ query_states = hidden_states
+ query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads)
+ key_layer = self.transpose_for_scores(self.key_proj(hidden_states), self.num_attention_heads)
+ value_layer = self.transpose_for_scores(self.value_proj(hidden_states), self.num_attention_heads)
+
+ rel_att = None
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ scale_factor = 1
+ if "c2p" in self.pos_att_type:
+ scale_factor += 1
+ if "p2c" in self.pos_att_type:
+ scale_factor += 1
+ scale = scaled_size_sqrt(query_layer, scale_factor)
+ attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2) / scale.to(dtype=query_layer.dtype))
+ if self.relative_attention:
+ rel_embeddings = self.pos_dropout(rel_embeddings)
+ rel_att = self.disentangled_attention_bias(
+ query_layer, key_layer, relative_pos, rel_embeddings, scale_factor
+ )
+
+ if rel_att is not None:
+ attention_scores = attention_scores + rel_att
+ attention_scores = attention_scores.view(
+ -1, self.num_attention_heads, attention_scores.size(-2), attention_scores.size(-1)
+ )
+
+ attention_mask = attention_mask.bool()
+ attention_scores = attention_scores.masked_fill(~(attention_mask), torch.finfo(query_layer.dtype).min)
+ # bsz x height x length x dimension
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ attention_probs = self.dropout(attention_probs)
+ context_layer = torch.bmm(
+ attention_probs.view(-1, attention_probs.size(-2), attention_probs.size(-1)), value_layer
+ )
+ context_layer = (
+ context_layer.view(-1, self.num_attention_heads, context_layer.size(-2), context_layer.size(-1))
+ .permute(0, 2, 1, 3)
+ .contiguous()
+ )
+ new_context_layer_shape = context_layer.size()[:-2] + (-1,)
+ context_layer = context_layer.view(new_context_layer_shape)
+ if not output_attentions:
+ return (context_layer, None)
+ return (context_layer, attention_probs)
+
+ def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor):
+ if relative_pos is None:
+ relative_pos = build_relative_position(
+ query_layer,
+ key_layer,
+ bucket_size=self.position_buckets,
+ max_position=self.max_relative_positions,
+ )
+ if relative_pos.dim() == 2:
+ relative_pos = relative_pos.unsqueeze(0).unsqueeze(0)
+ elif relative_pos.dim() == 3:
+ relative_pos = relative_pos.unsqueeze(1)
+ # bsz x height x query x key
+ elif relative_pos.dim() != 4:
+ raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
+
+ att_span = self.pos_ebd_size
+ relative_pos = relative_pos.to(device=query_layer.device, dtype=torch.long)
+
+ rel_embeddings = rel_embeddings[0 : att_span * 2, :].unsqueeze(0)
+ if self.share_att_key:
+ pos_query_layer = self.transpose_for_scores(
+ self.query_proj(rel_embeddings), self.num_attention_heads
+ ).repeat(query_layer.size(0) // self.num_attention_heads, 1, 1)
+ pos_key_layer = self.transpose_for_scores(self.key_proj(rel_embeddings), self.num_attention_heads).repeat(
+ query_layer.size(0) // self.num_attention_heads, 1, 1
+ )
+ else:
+ if "c2p" in self.pos_att_type:
+ pos_key_layer = self.transpose_for_scores(
+ self.pos_key_proj(rel_embeddings), self.num_attention_heads
+ ).repeat(query_layer.size(0) // self.num_attention_heads, 1, 1) # .split(self.all_head_size, dim=-1)
+ if "p2c" in self.pos_att_type:
+ pos_query_layer = self.transpose_for_scores(
+ self.pos_query_proj(rel_embeddings), self.num_attention_heads
+ ).repeat(query_layer.size(0) // self.num_attention_heads, 1, 1) # .split(self.all_head_size, dim=-1)
+
+ score = 0
+ # content->position
+ if "c2p" in self.pos_att_type:
+ scale = scaled_size_sqrt(pos_key_layer, scale_factor)
+ c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2))
+ c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1)
+ c2p_att = torch.gather(
+ c2p_att,
+ dim=-1,
+ index=c2p_pos.squeeze(0).expand([query_layer.size(0), query_layer.size(1), relative_pos.size(-1)]),
+ )
+ score += c2p_att / scale.to(dtype=c2p_att.dtype)
+
+ # position->content
+ if "p2c" in self.pos_att_type:
+ scale = scaled_size_sqrt(pos_query_layer, scale_factor)
+ r_pos = build_rpos(
+ query_layer,
+ key_layer,
+ relative_pos,
+ self.position_buckets,
+ self.max_relative_positions,
+ )
+ p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1)
+ p2c_att = torch.bmm(key_layer, pos_query_layer.transpose(-1, -2))
+ p2c_att = torch.gather(
+ p2c_att,
+ dim=-1,
+ index=p2c_pos.squeeze(0).expand([query_layer.size(0), key_layer.size(-2), key_layer.size(-2)]),
+ ).transpose(-1, -2)
+ score += p2c_att / scale.to(dtype=p2c_att.dtype)
+
+ return score
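+
+ # Illustrative note: the two gather calls above realize the disentangled bias terms. "c2p" scores each
+ # content query against the positional key embedding of its clamped relative position, while "p2c"
+ # scores positional queries against content keys and is transposed back, so both biases line up with the
+ # content-to-content scores in (batch * heads, query_len, key_len) layout.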
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->DebertaV2
+class DebertaV2Attention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.self = DisentangledSelfAttention(config)
+ self.output = DebertaV2SelfOutput(config)
+ self.config = config
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask,
+ output_attentions: bool = False,
+ query_states=None,
+ relative_pos=None,
+ rel_embeddings=None,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ self_output, att_matrix = self.self(
+ hidden_states,
+ attention_mask,
+ output_attentions,
+ query_states=query_states,
+ relative_pos=relative_pos,
+ rel_embeddings=rel_embeddings,
+ )
+ if query_states is None:
+ query_states = hidden_states
+ attention_output = self.output(self_output, query_states)
+
+ if output_attentions:
+ return (attention_output, att_matrix)
+ else:
+ return (attention_output, None)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->DebertaV2
+class DebertaV2Intermediate(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm
+class DebertaV2Output(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.config = config
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->DebertaV2
+class DebertaV2Layer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.attention = DebertaV2Attention(config)
+ self.intermediate = DebertaV2Intermediate(config)
+ self.output = DebertaV2Output(config)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask,
+ query_states=None,
+ relative_pos=None,
+ rel_embeddings=None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ attention_output, att_matrix = self.attention(
+ hidden_states,
+ attention_mask,
+ output_attentions=output_attentions,
+ query_states=query_states,
+ relative_pos=relative_pos,
+ rel_embeddings=rel_embeddings,
+ )
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+
+ if output_attentions:
+ return (layer_output, att_matrix)
+ else:
+ return (layer_output, None)
+
+
+class ConvLayer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ kernel_size = getattr(config, "conv_kernel_size", 3)
+ groups = getattr(config, "conv_groups", 1)
+ self.conv_act = getattr(config, "conv_act", "tanh")
+ self.conv = nn.Conv1d(
+ config.hidden_size, config.hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups
+ )
+ self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.config = config
+
+ def forward(self, hidden_states, residual_states, input_mask):
+ out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous()
+ rmask = (1 - input_mask).bool()
+ out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0)
+ out = ACT2FN[self.conv_act](self.dropout(out))
+
+ layer_norm_input = residual_states + out
+ output = self.LayerNorm(layer_norm_input).to(layer_norm_input)
+
+ if input_mask is None:
+ output_states = output
+ else:
+ if input_mask.dim() != layer_norm_input.dim():
+ if input_mask.dim() == 4:
+ input_mask = input_mask.squeeze(1).squeeze(1)
+ input_mask = input_mask.unsqueeze(2)
+
+ input_mask = input_mask.to(output.dtype)
+ output_states = output * input_mask
+
+ return output_states
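+
+ # Illustrative note: ConvLayer is only instantiated when `conv_kernel_size` > 0 in the config; the
+ # encoder applies it once, combining the embedding output with the first transformer layer's output
+ # (see DebertaV2Encoder.forward below).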
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaEmbeddings with DebertaLayerNorm->LayerNorm,Deberta->DebertaV2
+class DebertaV2Embeddings(nn.Module):
+ """Construct the embeddings from word, position and token_type embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+ pad_token_id = getattr(config, "pad_token_id", 0)
+ self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
+ self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx=pad_token_id)
+
+ self.position_biased_input = getattr(config, "position_biased_input", True)
+ if not self.position_biased_input:
+ self.position_embeddings = None
+ else:
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size)
+
+ if config.type_vocab_size > 0:
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size)
+ else:
+ self.token_type_embeddings = None
+
+ if self.embedding_size != config.hidden_size:
+ self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
+ else:
+ self.embed_proj = None
+
+ self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.config = config
+
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+ self.register_buffer(
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+ )
+
+ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None):
+ if input_ids is not None:
+ input_shape = input_ids.size()
+ else:
+ input_shape = inputs_embeds.size()[:-1]
+
+ seq_length = input_shape[1]
+
+ if position_ids is None:
+ position_ids = self.position_ids[:, :seq_length]
+
+ if token_type_ids is None:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+
+ if self.position_embeddings is not None:
+ position_embeddings = self.position_embeddings(position_ids.long())
+ else:
+ position_embeddings = torch.zeros_like(inputs_embeds)
+
+ embeddings = inputs_embeds
+ if self.position_biased_input:
+ embeddings = embeddings + position_embeddings
+ if self.token_type_embeddings is not None:
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+ embeddings = embeddings + token_type_embeddings
+
+ if self.embed_proj is not None:
+ embeddings = self.embed_proj(embeddings)
+
+ embeddings = self.LayerNorm(embeddings)
+
+ if mask is not None:
+ if mask.dim() != embeddings.dim():
+ if mask.dim() == 4:
+ mask = mask.squeeze(1).squeeze(1)
+ mask = mask.unsqueeze(2)
+ mask = mask.to(embeddings.dtype)
+
+ embeddings = embeddings * mask
+
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+class DebertaV2Encoder(nn.Module):
+ """Modified BertEncoder with relative position bias support"""
+
+ def __init__(self, config):
+ super().__init__()
+
+ self.layer = nn.ModuleList([DebertaV2Layer(config) for _ in range(config.num_hidden_layers)])
+ self.relative_attention = getattr(config, "relative_attention", False)
+
+ if self.relative_attention:
+ self.max_relative_positions = getattr(config, "max_relative_positions", -1)
+ if self.max_relative_positions < 1:
+ self.max_relative_positions = config.max_position_embeddings
+
+ self.position_buckets = getattr(config, "position_buckets", -1)
+ pos_ebd_size = self.max_relative_positions * 2
+
+ if self.position_buckets > 0:
+ pos_ebd_size = self.position_buckets * 2
+
+ self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size)
+
+ self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")]
+
+ if "layer_norm" in self.norm_rel_ebd:
+ self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True)
+
+ self.conv = ConvLayer(config) if getattr(config, "conv_kernel_size", 0) > 0 else None
+ self.gradient_checkpointing = False
+
+ def get_rel_embedding(self):
+ rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
+ if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd):
+ rel_embeddings = self.LayerNorm(rel_embeddings)
+ return rel_embeddings
+
+ def get_attention_mask(self, attention_mask):
+ if attention_mask.dim() <= 2:
+ extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+ attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1)
+ elif attention_mask.dim() == 3:
+ attention_mask = attention_mask.unsqueeze(1)
+
+ return attention_mask
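+
+ # Shape note (illustrative): a 2-D padding mask of shape (batch, seq_len) is expanded here to
+ # (batch, 1, seq_len, seq_len), where entry [b, 0, i, j] is 1 only if both token i and token j are real
+ # (non-padding) tokens, so padded positions can neither attend nor be attended to.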
+
+ def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
+ if self.relative_attention and relative_pos is None:
+ if query_states is not None:
+ relative_pos = build_relative_position(
+ query_states,
+ hidden_states,
+ bucket_size=self.position_buckets,
+ max_position=self.max_relative_positions,
+ )
+ else:
+ relative_pos = build_relative_position(
+ hidden_states,
+ hidden_states,
+ bucket_size=self.position_buckets,
+ max_position=self.max_relative_positions,
+ )
+ return relative_pos
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask,
+ output_hidden_states=True,
+ output_attentions=False,
+ query_states=None,
+ relative_pos=None,
+ return_dict=True,
+ ):
+ if attention_mask.dim() <= 2:
+ input_mask = attention_mask
+ else:
+ input_mask = attention_mask.sum(-2) > 0
+ attention_mask = self.get_attention_mask(attention_mask)
+ relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
+
+ all_hidden_states: Optional[Tuple[torch.Tensor]] = (hidden_states,) if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ next_kv = hidden_states
+ rel_embeddings = self.get_rel_embedding()
+ for i, layer_module in enumerate(self.layer):
+ if self.gradient_checkpointing and self.training:
+ output_states, attn_weights = self._gradient_checkpointing_func(
+ layer_module.__call__,
+ next_kv,
+ attention_mask,
+ query_states,
+ relative_pos,
+ rel_embeddings,
+ output_attentions,
+ )
+ else:
+ output_states, attn_weights = layer_module(
+ next_kv,
+ attention_mask,
+ query_states=query_states,
+ relative_pos=relative_pos,
+ rel_embeddings=rel_embeddings,
+ output_attentions=output_attentions,
+ )
+
+ if output_attentions:
+ all_attentions = all_attentions + (attn_weights,)
+
+ if i == 0 and self.conv is not None:
+ output_states = self.conv(hidden_states, output_states, input_mask)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (output_states,)
+
+ if query_states is not None:
+ query_states = output_states
+ if isinstance(hidden_states, Sequence):
+ next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None
+ else:
+ next_kv = output_states
+
+ if not return_dict:
+ return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions
+ )
+
+
+class DebertaV2PreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = DebertaV2Config
+ base_model_prefix = "deberta"
+ _keys_to_ignore_on_load_unexpected = ["position_embeddings"]
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ """Initialize the weights."""
+ if isinstance(module, nn.Linear):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.weight.data.fill_(1.0)
+ module.bias.data.zero_()
+ elif isinstance(module, (LegacyDebertaV2LMPredictionHead, DebertaV2LMPredictionHead)):
+ module.bias.data.zero_()
+
+
+DEBERTA_START_DOCSTRING = r"""
+ The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
+ Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built
+ on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and an enhanced mask decoder. With those two
+ improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB of pretraining data.
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+
+ Parameters:
+ config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DEBERTA_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.",
+ DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2
+class DebertaV2Model(DebertaV2PreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.embeddings = DebertaV2Embeddings(config)
+ self.encoder = DebertaV2Encoder(config)
+ self.z_steps = 0
+ self.config = config
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, new_embeddings):
+ self.embeddings.word_embeddings = new_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
+ class `PreTrainedModel`.
+ """
+ raise NotImplementedError("The prune function is not implemented in DeBERTa model.")
+
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+ input_shape = input_ids.size()
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+ if attention_mask is None:
+ attention_mask = torch.ones(input_shape, device=device)
+ if token_type_ids is None:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ )
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask,
+ output_hidden_states=True,
+ output_attentions=output_attentions,
+ return_dict=return_dict,
+ )
+ encoded_layers = encoder_outputs[1]
+
+ if self.z_steps > 1:
+ hidden_states = encoded_layers[-2]
+ layers = [self.encoder.layer[-1] for _ in range(self.z_steps)]
+ query_states = encoded_layers[-1]
+ rel_embeddings = self.encoder.get_rel_embedding()
+ attention_mask = self.encoder.get_attention_mask(attention_mask)
+ rel_pos = self.encoder.get_rel_pos(embedding_output)
+ for layer in layers[1:]:
+ query_states = layer(
+ hidden_states,
+ attention_mask,
+ output_attentions=False,
+ query_states=query_states,
+ relative_pos=rel_pos,
+ rel_embeddings=rel_embeddings,
+ )
+ encoded_layers.append(query_states)
+
+ sequence_output = encoded_layers[-1]
+
+ if not return_dict:
+ return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :]
+
+ return BaseModelOutput(
+ last_hidden_state=sequence_output,
+ hidden_states=encoder_outputs.hidden_states if output_hidden_states else None,
+ attentions=encoder_outputs.attentions,
+ )
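+
+ # Illustrative note: `z_steps` is fixed to 0 in __init__, so the enhanced-mask-decoder loop above (which
+ # would re-run the last encoder layer with the previous output as query states) only executes if
+ # `z_steps` is changed on the model instance after construction.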
+
+
+# Copied from transformers.models.deberta.modeling_deberta.LegacyDebertaPredictionHeadTransform with Deberta->DebertaV2
+class LegacyDebertaV2PredictionHeadTransform(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
+
+ self.dense = nn.Linear(config.hidden_size, self.embedding_size)
+ if isinstance(config.hidden_act, str):
+ self.transform_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.transform_act_fn = config.hidden_act
+ self.LayerNorm = nn.LayerNorm(self.embedding_size, eps=config.layer_norm_eps)
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+ return hidden_states
+
+
+class LegacyDebertaV2LMPredictionHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.transform = LegacyDebertaV2PredictionHeadTransform(config)
+
+ self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
+ self.decoder = nn.Linear(self.embedding_size, config.vocab_size, bias=False)
+
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+ self.decoder.bias = self.bias
+
+ def _tie_weights(self):
+ self.decoder.bias = self.bias
+
+ def forward(self, hidden_states):
+ hidden_states = self.transform(hidden_states)
+ hidden_states = self.decoder(hidden_states)
+ return hidden_states
+
+
+class LegacyDebertaV2OnlyMLMHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.predictions = LegacyDebertaV2LMPredictionHead(config)
+
+ def forward(self, sequence_output):
+ prediction_scores = self.predictions(sequence_output)
+ return prediction_scores
+
+
+class DebertaV2LMPredictionHead(nn.Module):
+ """https://github.com/microsoft/DeBERTa/blob/master/DeBERTa/deberta/bert.py#L270"""
+
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+
+ if isinstance(config.hidden_act, str):
+ self.transform_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.transform_act_fn = config.hidden_act
+
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=True)
+
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+ # note that the input embeddings must be passed as an argument
+ def forward(self, hidden_states, word_embeddings):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+ hidden_states = torch.matmul(hidden_states, word_embeddings.weight.t()) + self.bias
+ return hidden_states
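+
+ # Illustrative note: unlike the legacy head, this prediction head owns no decoder weight matrix; the
+ # word embedding module is passed in at call time and reused through the matmul above, leaving only the
+ # bias as a separate parameter.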
+
+
+class DebertaV2OnlyMLMHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.lm_head = DebertaV2LMPredictionHead(config)
+
+ # note that the input embeddings must be passed as an argument
+ def forward(self, sequence_output, word_embeddings):
+ prediction_scores = self.lm_head(sequence_output, word_embeddings)
+ return prediction_scores
+
+
+@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
+class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
+ _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
+ _keys_to_ignore_on_load_unexpected = r"mask_predictions.*"
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.legacy = config.legacy
+ self.deberta = DebertaV2Model(config)
+ if self.legacy:
+ self.cls = LegacyDebertaV2OnlyMLMHead(config)
+ else:
+ self._tied_weights_keys = ["lm_predictions.lm_head.weight", "deberta.embeddings.word_embeddings.weight"]
+ self.lm_predictions = DebertaV2OnlyMLMHead(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_output_embeddings(self):
+ if self.legacy:
+ return self.cls.predictions.decoder
+ else:
+ return self.lm_predictions.lm_head.dense
+
+ def set_output_embeddings(self, new_embeddings):
+ if self.legacy:
+ self.cls.predictions.decoder = new_embeddings
+ self.cls.predictions.bias = new_embeddings.bias
+ else:
+ self.lm_predictions.lm_head.dense = new_embeddings
+ self.lm_predictions.lm_head.bias = new_embeddings.bias
+
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=MaskedLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ mask="[MASK]",
+ )
+ # Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM.forward with Deberta->DebertaV2
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, MaskedLMOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked); the
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ """
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deberta(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+ if self.legacy:
+ prediction_scores = self.cls(sequence_output)
+ else:
+ prediction_scores = self.lm_predictions(sequence_output, self.deberta.embeddings.word_embeddings)
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[1:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return MaskedLMOutput(
+ loss=masked_lm_loss,
+ logits=prediction_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+# Copied from transformers.models.deberta.modeling_deberta.ContextPooler
+class ContextPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
+ self.dropout = nn.Dropout(config.pooler_dropout)
+ self.config = config
+
+ def forward(self, hidden_states):
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+
+ context_token = hidden_states[:, 0]
+ context_token = self.dropout(context_token)
+ pooled_output = self.dense(context_token)
+ pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output)
+ return pooled_output
+
+ @property
+ def output_dim(self):
+ return self.config.hidden_size
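+
+ # Shape note (illustrative): ContextPooler maps encoder output of shape (batch, seq_len, hidden_size) to
+ # a single (batch, pooler_hidden_size) vector by taking the first token, applying dropout, a dense
+ # projection and the configured pooler activation.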
+
+
+@add_start_docstrings(
+ """
+ DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+ pooled output) e.g. for GLUE tasks.
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ num_labels = getattr(config, "num_labels", 2)
+ self.num_labels = num_labels
+
+ self.deberta = DebertaV2Model(config)
+ self.pooler = ContextPooler(config)
+ output_dim = self.pooler.output_dim
+
+ self.classifier = nn.Linear(output_dim, num_labels)
+ drop_out = getattr(config, "cls_dropout", None)
+ drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
+ self.dropout = nn.Dropout(drop_out)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.deberta.get_input_embeddings()
+
+ def set_input_embeddings(self, new_embeddings):
+ self.deberta.set_input_embeddings(new_embeddings)
+
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=SequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ # Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification.forward with Deberta->DebertaV2
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deberta(
+ input_ids,
+ token_type_ids=token_type_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ encoder_layer = outputs[0]
+ pooled_output = self.pooler(encoder_layer)
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+
+ loss = None
+ if labels is not None:
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ # regression task
+ loss_fn = nn.MSELoss()
+ logits = logits.view(-1).to(labels.dtype)
+ loss = loss_fn(logits, labels.view(-1))
+ elif labels.dim() == 1 or labels.size(-1) == 1:
+ label_index = (labels >= 0).nonzero()
+ labels = labels.long()
+ if label_index.size(0) > 0:
+ labeled_logits = torch.gather(
+ logits, 0, label_index.expand(label_index.size(0), logits.size(1))
+ )
+ labels = torch.gather(labels, 0, label_index.view(-1))
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1))
+ else:
+ loss = torch.tensor(0).to(logits)
+ else:
+ log_softmax = nn.LogSoftmax(-1)
+ loss = -((log_softmax(logits) * labels).sum(-1)).mean()
+ elif self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+ )
+
+
+@add_start_docstrings(
+ """
+ DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+ Named-Entity-Recognition (NER) tasks.
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2
+class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.deberta = DebertaV2Model(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, TokenClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deberta(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ sequence_output = self.dropout(sequence_output)
+ logits = self.classifier(sequence_output)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TokenClassifierOutput(
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+ )
+
+
+@add_start_docstrings(
+ """
+ DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+ layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.deberta = DebertaV2Model(config)
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=QuestionAnsweringModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ qa_target_start_index=_QA_TARGET_START_INDEX,
+ qa_target_end_index=_QA_TARGET_END_INDEX,
+ )
+ # Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering.forward with Deberta->DebertaV2
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ start_positions: Optional[torch.Tensor] = None,
+ end_positions: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
+ r"""
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deberta(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ logits = self.qa_outputs(sequence_output)
+ start_logits, end_logits = logits.split(1, dim=-1)
+ start_logits = start_logits.squeeze(-1).contiguous()
+ end_logits = end_logits.squeeze(-1).contiguous()
+
+ total_loss = None
+ if start_positions is not None and end_positions is not None:
+ # If we are on multi-GPU, the split adds an extra dimension; squeeze it away
+ if len(start_positions.size()) > 1:
+ start_positions = start_positions.squeeze(-1)
+ if len(end_positions.size()) > 1:
+ end_positions = end_positions.squeeze(-1)
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
+ ignored_index = start_logits.size(1)
+ start_positions = start_positions.clamp(0, ignored_index)
+ end_positions = end_positions.clamp(0, ignored_index)
+
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+ start_loss = loss_fct(start_logits, start_positions)
+ end_loss = loss_fct(end_logits, end_positions)
+ total_loss = (start_loss + end_loss) / 2
+
+ if not return_dict:
+ output = (start_logits, end_logits) + outputs[1:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return QuestionAnsweringModelOutput(
+ loss=total_loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+ softmax) e.g. for RocStories/SWAG tasks.
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ num_labels = getattr(config, "num_labels", 2)
+ self.num_labels = num_labels
+
+ self.deberta = DebertaV2Model(config)
+ self.pooler = ContextPooler(config)
+ output_dim = self.pooler.output_dim
+
+ self.classifier = nn.Linear(output_dim, 1)
+ drop_out = getattr(config, "cls_dropout", None)
+ drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
+ self.dropout = nn.Dropout(drop_out)
+
+ self.init_weights()
+
+ def get_input_embeddings(self):
+ return self.deberta.get_input_embeddings()
+
+ def set_input_embeddings(self, new_embeddings):
+ self.deberta.set_input_embeddings(new_embeddings)
+
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=MultipleChoiceModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, MultipleChoiceModelOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+ flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+ flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+ flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+ flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+ flat_inputs_embeds = (
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+ if inputs_embeds is not None
+ else None
+ )
+
+ outputs = self.deberta(
+ flat_input_ids,
+ position_ids=flat_position_ids,
+ token_type_ids=flat_token_type_ids,
+ attention_mask=flat_attention_mask,
+ inputs_embeds=flat_inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ encoder_layer = outputs[0]
+ pooled_output = self.pooler(encoder_layer)
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+ reshaped_logits = logits.view(-1, num_choices)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(reshaped_logits, labels)
+
+ if not return_dict:
+ output = (reshaped_logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return MultipleChoiceModelOutput(
+ loss=loss,
+ logits=reshaped_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
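+ # Shape sketch for DebertaV2ForMultipleChoice above (illustrative sizes only): input_ids of shape
+ # (batch_size=2, num_choices=4, seq_len=128) are flattened to (8, 128) before the encoder; the
+ # per-choice logits of shape (8, 1) are reshaped back to (2, 4) so that CrossEntropyLoss can compare
+ # them against `labels` with values in [0, 3].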
+
+
+__all__ = [
+ "DebertaV2ForMaskedLM",
+ "DebertaV2ForMultipleChoice",
+ "DebertaV2ForQuestionAnswering",
+ "DebertaV2ForSequenceClassification",
+ "DebertaV2ForTokenClassification",
+ "DebertaV2Model",
+ "DebertaV2PreTrainedModel",
+]
diff --git a/docs/transformers/build/lib/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/docs/transformers/build/lib/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..899564eef05d3f8cf88a329d7b7d1f4a15893083
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
@@ -0,0 +1,1881 @@
+# coding=utf-8
+# Copyright 2021 Microsoft and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 DeBERTa-v2 model."""
+
+from __future__ import annotations
+
+from typing import Dict, Optional, Tuple, Union
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+ TFBaseModelOutput,
+ TFMaskedLMOutput,
+ TFMultipleChoiceModelOutput,
+ TFQuestionAnsweringModelOutput,
+ TFSequenceClassifierOutput,
+ TFTokenClassifierOutput,
+)
+from ...modeling_tf_utils import (
+ TFMaskedLanguageModelingLoss,
+ TFModelInputType,
+ TFMultipleChoiceLoss,
+ TFPreTrainedModel,
+ TFQuestionAnsweringLoss,
+ TFSequenceClassificationLoss,
+ TFTokenClassificationLoss,
+ get_initializer,
+ keras,
+ unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_deberta_v2 import DebertaV2Config
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "DebertaV2Config"
+_CHECKPOINT_FOR_DOC = "kamalkraj/deberta-v2-xlarge"
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaContextPooler with Deberta->DebertaV2
+class TFDebertaV2ContextPooler(keras.layers.Layer):
+ def __init__(self, config: DebertaV2Config, **kwargs):
+ super().__init__(**kwargs)
+ self.dense = keras.layers.Dense(config.pooler_hidden_size, name="dense")
+ self.dropout = TFDebertaV2StableDropout(config.pooler_dropout, name="dropout")
+ self.config = config
+
+ def call(self, hidden_states, training: bool = False):
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ context_token = hidden_states[:, 0]
+ context_token = self.dropout(context_token, training=training)
+ pooled_output = self.dense(context_token)
+ pooled_output = get_tf_activation(self.config.pooler_hidden_act)(pooled_output)
+ return pooled_output
+
+ @property
+ def output_dim(self) -> int:
+ return self.config.hidden_size
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.pooler_hidden_size])
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaXSoftmax with Deberta->DebertaV2
+class TFDebertaV2XSoftmax(keras.layers.Layer):
+ """
+ Masked Softmax which is optimized for saving memory
+
+ Args:
+ input (`tf.Tensor`): The input tensor that will apply softmax.
+ mask (`tf.Tensor`): The mask matrix where 0 indicates that the element will be ignored in the softmax calculation.
+ dim (int): The dimension along which softmax is applied
+ """
+
+ def __init__(self, axis=-1, **kwargs):
+ super().__init__(**kwargs)
+ self.axis = axis
+
+ def call(self, inputs: tf.Tensor, mask: tf.Tensor):
+ rmask = tf.logical_not(tf.cast(mask, tf.bool))
+ output = tf.where(rmask, tf.cast(float("-inf"), dtype=self.compute_dtype), inputs)
+ output = stable_softmax(tf.cast(output, dtype=tf.float32), self.axis)
+ output = tf.where(rmask, 0.0, output)
+ return output
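+ # Behaviour sketch (illustrative): positions whose mask entry is 0 are set to -inf before the softmax
+ # and forced back to exactly 0.0 afterwards, so e.g. inputs=[2.0, 1.0, 3.0] with mask=[1, 1, 0]
+ # gives weights ~[0.73, 0.27, 0.0] -- masked tokens receive no attention weight.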
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaStableDropout with Deberta->DebertaV2
+class TFDebertaV2StableDropout(keras.layers.Layer):
+ """
+ Optimized dropout module for stabilizing the training
+
+ Args:
+ drop_prob (float): the dropout probability
+ """
+
+ def __init__(self, drop_prob, **kwargs):
+ super().__init__(**kwargs)
+ self.drop_prob = drop_prob
+
+ @tf.custom_gradient
+ def xdropout(self, inputs):
+ """
+ Applies dropout to the inputs, as vanilla dropout, but also scales the remaining elements up by 1/(1 - drop_prob).
+ """
+ mask = tf.cast(
+ 1
+ - tf.compat.v1.distributions.Bernoulli(probs=1.0 - self.drop_prob).sample(sample_shape=shape_list(inputs)),
+ tf.bool,
+ )
+ scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=self.compute_dtype)
+ if self.drop_prob > 0:
+ inputs = tf.where(mask, tf.cast(0.0, dtype=self.compute_dtype), inputs) * scale
+
+ def grad(upstream):
+ if self.drop_prob > 0:
+ return tf.where(mask, tf.cast(0.0, dtype=self.compute_dtype), upstream) * scale
+ else:
+ return upstream
+
+ return inputs, grad
+
+ def call(self, inputs: tf.Tensor, training: tf.Tensor = False):
+ if training:
+ return self.xdropout(inputs)
+ return inputs
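+ # Numerical sketch (illustrative): with drop_prob=0.1, roughly 10% of elements are zeroed and the
+ # survivors are multiplied by 1 / (1 - 0.1) ~= 1.111, so the expected activation magnitude is the
+ # same at training time (dropout on) and at inference time (dropout off).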
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaSelfOutput with Deberta->DebertaV2
+class TFDebertaV2SelfOutput(keras.layers.Layer):
+ def __init__(self, config: DebertaV2Config, **kwargs):
+ super().__init__(**kwargs)
+ self.dense = keras.layers.Dense(config.hidden_size, name="dense")
+ self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+ self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
+ self.config = config
+
+ def call(self, hidden_states, input_tensor, training: bool = False):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states, training=training)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.hidden_size])
+ if getattr(self, "LayerNorm", None) is not None:
+ with tf.name_scope(self.LayerNorm.name):
+ self.LayerNorm.build([None, None, self.config.hidden_size])
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaAttention with Deberta->DebertaV2
+class TFDebertaV2Attention(keras.layers.Layer):
+ def __init__(self, config: DebertaV2Config, **kwargs):
+ super().__init__(**kwargs)
+ self.self = TFDebertaV2DisentangledSelfAttention(config, name="self")
+ self.dense_output = TFDebertaV2SelfOutput(config, name="output")
+ self.config = config
+
+ def call(
+ self,
+ input_tensor: tf.Tensor,
+ attention_mask: tf.Tensor,
+ query_states: Optional[tf.Tensor] = None,
+ relative_pos: Optional[tf.Tensor] = None,
+ rel_embeddings: Optional[tf.Tensor] = None,
+ output_attentions: bool = False,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ self_outputs = self.self(
+ hidden_states=input_tensor,
+ attention_mask=attention_mask,
+ query_states=query_states,
+ relative_pos=relative_pos,
+ rel_embeddings=rel_embeddings,
+ output_attentions=output_attentions,
+ training=training,
+ )
+ if query_states is None:
+ query_states = input_tensor
+ attention_output = self.dense_output(
+ hidden_states=self_outputs[0], input_tensor=query_states, training=training
+ )
+
+ output = (attention_output,) + self_outputs[1:]
+
+ return output
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "self", None) is not None:
+ with tf.name_scope(self.self.name):
+ self.self.build(None)
+ if getattr(self, "dense_output", None) is not None:
+ with tf.name_scope(self.dense_output.name):
+ self.dense_output.build(None)
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaIntermediate with Deberta->DebertaV2
+class TFDebertaV2Intermediate(keras.layers.Layer):
+ def __init__(self, config: DebertaV2Config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = keras.layers.Dense(
+ units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+ else:
+ self.intermediate_act_fn = config.hidden_act
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaOutput with Deberta->DebertaV2
+class TFDebertaV2Output(keras.layers.Layer):
+ def __init__(self, config: DebertaV2Config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = keras.layers.Dense(
+ units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+ self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+ self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.dropout(hidden_states, training=training)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.intermediate_size])
+ if getattr(self, "LayerNorm", None) is not None:
+ with tf.name_scope(self.LayerNorm.name):
+ self.LayerNorm.build([None, None, self.config.hidden_size])
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLayer with Deberta->DebertaV2
+class TFDebertaV2Layer(keras.layers.Layer):
+ def __init__(self, config: DebertaV2Config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.attention = TFDebertaV2Attention(config, name="attention")
+ self.intermediate = TFDebertaV2Intermediate(config, name="intermediate")
+ self.bert_output = TFDebertaV2Output(config, name="output")
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ attention_mask: tf.Tensor,
+ query_states: Optional[tf.Tensor] = None,
+ relative_pos: Optional[tf.Tensor] = None,
+ rel_embeddings: Optional[tf.Tensor] = None,
+ output_attentions: bool = False,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ attention_outputs = self.attention(
+ input_tensor=hidden_states,
+ attention_mask=attention_mask,
+ query_states=query_states,
+ relative_pos=relative_pos,
+ rel_embeddings=rel_embeddings,
+ output_attentions=output_attentions,
+ training=training,
+ )
+ attention_output = attention_outputs[0]
+ intermediate_output = self.intermediate(hidden_states=attention_output)
+ layer_output = self.bert_output(
+ hidden_states=intermediate_output, input_tensor=attention_output, training=training
+ )
+ outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them
+
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "attention", None) is not None:
+ with tf.name_scope(self.attention.name):
+ self.attention.build(None)
+ if getattr(self, "intermediate", None) is not None:
+ with tf.name_scope(self.intermediate.name):
+ self.intermediate.build(None)
+ if getattr(self, "bert_output", None) is not None:
+ with tf.name_scope(self.bert_output.name):
+ self.bert_output.build(None)
+
+
+class TFDebertaV2ConvLayer(keras.layers.Layer):
+ def __init__(self, config: DebertaV2Config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.kernel_size = getattr(config, "conv_kernel_size", 3)
+ # groups = getattr(config, "conv_groups", 1)
+ self.conv_act = get_tf_activation(getattr(config, "conv_act", "tanh"))
+ self.padding = (self.kernel_size - 1) // 2
+ self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+ self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
+ self.config = config
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ with tf.name_scope("conv"):
+ self.conv_kernel = self.add_weight(
+ name="kernel",
+ shape=[self.kernel_size, self.config.hidden_size, self.config.hidden_size],
+ initializer=get_initializer(self.config.initializer_range),
+ )
+ self.conv_bias = self.add_weight(
+ name="bias", shape=[self.config.hidden_size], initializer=tf.zeros_initializer()
+ )
+ if getattr(self, "LayerNorm", None) is not None:
+ with tf.name_scope(self.LayerNorm.name):
+ self.LayerNorm.build([None, None, self.config.hidden_size])
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+
+ def call(
+ self, hidden_states: tf.Tensor, residual_states: tf.Tensor, input_mask: tf.Tensor, training: bool = False
+ ) -> tf.Tensor:
+ out = tf.nn.conv2d(
+ tf.expand_dims(hidden_states, 1),
+ tf.expand_dims(self.conv_kernel, 0),
+ strides=1,
+ padding=[[0, 0], [0, 0], [self.padding, self.padding], [0, 0]],
+ )
+ out = tf.squeeze(tf.nn.bias_add(out, self.conv_bias), 1)
+ rmask = tf.cast(1 - input_mask, tf.bool)
+ out = tf.where(tf.broadcast_to(tf.expand_dims(rmask, -1), shape_list(out)), 0.0, out)
+ out = self.dropout(out, training=training)
+ out = self.conv_act(out)
+
+ layer_norm_input = residual_states + out
+ output = self.LayerNorm(layer_norm_input)
+
+ if input_mask is None:
+ output_states = output
+ else:
+ if len(shape_list(input_mask)) != len(shape_list(layer_norm_input)):
+ if len(shape_list(input_mask)) == 4:
+ input_mask = tf.squeeze(tf.squeeze(input_mask, axis=1), axis=1)
+ input_mask = tf.cast(tf.expand_dims(input_mask, axis=2), dtype=self.compute_dtype)
+
+ output_states = output * input_mask
+
+ return output_states
+
+
+class TFDebertaV2Encoder(keras.layers.Layer):
+ def __init__(self, config: DebertaV2Config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.layer = [TFDebertaV2Layer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
+ self.relative_attention = getattr(config, "relative_attention", False)
+ self.config = config
+ if self.relative_attention:
+ self.max_relative_positions = getattr(config, "max_relative_positions", -1)
+ if self.max_relative_positions < 1:
+ self.max_relative_positions = config.max_position_embeddings
+
+ self.position_buckets = getattr(config, "position_buckets", -1)
+ self.pos_ebd_size = self.max_relative_positions * 2
+
+ if self.position_buckets > 0:
+ self.pos_ebd_size = self.position_buckets * 2
+
+ self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")]
+
+ if "layer_norm" in self.norm_rel_ebd:
+ self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+
+ self.conv = TFDebertaV2ConvLayer(config, name="conv") if getattr(config, "conv_kernel_size", 0) > 0 else None
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if self.relative_attention:
+ self.rel_embeddings = self.add_weight(
+ name="rel_embeddings.weight",
+ shape=[self.pos_ebd_size, self.config.hidden_size],
+ initializer=get_initializer(self.config.initializer_range),
+ )
+ if getattr(self, "conv", None) is not None:
+ with tf.name_scope(self.conv.name):
+ self.conv.build(None)
+ if getattr(self, "LayerNorm", None) is not None:
+ with tf.name_scope(self.LayerNorm.name):
+ self.LayerNorm.build([None, self.config.hidden_size])
+ if getattr(self, "layer", None) is not None:
+ for layer in self.layer:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+ def get_rel_embedding(self):
+ rel_embeddings = self.rel_embeddings if self.relative_attention else None
+ if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd):
+ rel_embeddings = self.LayerNorm(rel_embeddings)
+ return rel_embeddings
+
+ def get_attention_mask(self, attention_mask):
+ if len(shape_list(attention_mask)) <= 2:
+ extended_attention_mask = tf.expand_dims(tf.expand_dims(attention_mask, 1), 2)
+ attention_mask = extended_attention_mask * tf.expand_dims(tf.squeeze(extended_attention_mask, -2), -1)
+ attention_mask = tf.cast(attention_mask, tf.uint8)
+ elif len(shape_list(attention_mask)) == 3:
+ attention_mask = tf.expand_dims(attention_mask, 1)
+
+ return attention_mask
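+ # Shape sketch (illustrative): a 2D padding mask of shape [batch, seq], e.g. [1, 1, 1, 0], is
+ # expanded to a 4D mask of shape [batch, 1, seq, seq] whose entry [b, 0, i, j] is 1 only when both
+ # token i and token j are real (non-padding), i.e. a 3x3 block of ones for this example.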
+
+ def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
+ if self.relative_attention and relative_pos is None:
+ q = shape_list(query_states)[-2] if query_states is not None else shape_list(hidden_states)[-2]
+ relative_pos = build_relative_position(
+ q,
+ shape_list(hidden_states)[-2],
+ bucket_size=self.position_buckets,
+ max_position=self.max_relative_positions,
+ )
+ return relative_pos
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ attention_mask: tf.Tensor,
+ query_states: Optional[tf.Tensor] = None,
+ relative_pos: Optional[tf.Tensor] = None,
+ output_attentions: bool = False,
+ output_hidden_states: bool = False,
+ return_dict: bool = True,
+ training: bool = False,
+ ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
+ if len(shape_list(attention_mask)) <= 2:
+ input_mask = attention_mask
+ else:
+ input_mask = tf.cast(tf.math.reduce_sum(attention_mask, axis=-2) > 0, dtype=tf.uint8)
+
+ all_hidden_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ attention_mask = self.get_attention_mask(attention_mask)
+ relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
+
+ next_kv = hidden_states
+
+ rel_embeddings = self.get_rel_embedding()
+ output_states = next_kv
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (output_states,)
+
+ layer_outputs = layer_module(
+ hidden_states=next_kv,
+ attention_mask=attention_mask,
+ query_states=query_states,
+ relative_pos=relative_pos,
+ rel_embeddings=rel_embeddings,
+ output_attentions=output_attentions,
+ training=training,
+ )
+ output_states = layer_outputs[0]
+
+ if i == 0 and self.conv is not None:
+ output_states = self.conv(hidden_states, output_states, input_mask)
+
+ next_kv = output_states
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ # Add last layer
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (output_states,)
+
+ if not return_dict:
+ return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None)
+
+ return TFBaseModelOutput(
+ last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions
+ )
+
+
+def make_log_bucket_position(relative_pos, bucket_size, max_position):
+ sign = tf.math.sign(relative_pos)
+ mid = bucket_size // 2
+ abs_pos = tf.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, tf.math.abs(relative_pos))
+ log_pos = tf.math.ceil(
+ tf.cast(tf.math.log(abs_pos / mid), tf.float32)
+ / tf.cast(tf.math.log((max_position - 1) / mid), tf.float32)
+ * tf.cast(mid - 1, tf.float32) # in graph mode
+ ) + tf.cast(mid, tf.float32)
+ bucket_pos = tf.cast(
+ tf.where(abs_pos <= mid, tf.cast(relative_pos, tf.float32), log_pos * tf.cast(sign, tf.float32)), tf.int32
+ )
+ return bucket_pos
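+ # Bucketing sketch (illustrative): relative distances with |d| <= bucket_size // 2 keep their exact
+ # value, while larger distances are compressed logarithmically so that the bucket index never
+ # exceeds bucket_size - 1 in either direction, regardless of max_position.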
+
+
+def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1):
+ """
+ Build relative position according to the query and key
+
+ We assume the absolute position of the query \\(P_q\\) ranges over (0, query_size) and the absolute position of
+ the key \\(P_k\\) ranges over (0, key_size). The relative position from query to key is \\(R_{q \\rightarrow k} =
+ P_q - P_k\\)
+
+ Args:
+ query_size (int): the length of query
+ key_size (int): the length of key
+ bucket_size (int): the size of position bucket
+ max_position (int): the maximum allowed absolute position
+
+ Return:
+ `tf.Tensor`: A tensor with shape [1, query_size, key_size]
+
+ """
+ q_ids = tf.range(query_size, dtype=tf.int32)
+ k_ids = tf.range(key_size, dtype=tf.int32)
+ rel_pos_ids = q_ids[:, None] - tf.tile(tf.expand_dims(k_ids, axis=0), [shape_list(q_ids)[0], 1])
+ if bucket_size > 0 and max_position > 0:
+ rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position)
+ rel_pos_ids = rel_pos_ids[:query_size, :]
+ rel_pos_ids = tf.expand_dims(rel_pos_ids, axis=0)
+ return tf.cast(rel_pos_ids, tf.int64)
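+ # Worked example (illustrative): build_relative_position(3, 3) returns a tensor of shape [1, 3, 3]
+ # equal to [[[0, -1, -2], [1, 0, -1], [2, 1, 0]]], i.e. entry [0, q, k] = q - k; when bucket_size and
+ # max_position are set, these raw offsets are first passed through make_log_bucket_position above.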
+
+
+def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
+ shapes = [
+ shape_list(query_layer)[0],
+ shape_list(query_layer)[1],
+ shape_list(query_layer)[2],
+ shape_list(relative_pos)[-1],
+ ]
+ return tf.broadcast_to(c2p_pos, shapes)
+
+
+def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
+ shapes = [
+ shape_list(query_layer)[0],
+ shape_list(query_layer)[1],
+ shape_list(key_layer)[-2],
+ shape_list(key_layer)[-2],
+ ]
+ return tf.broadcast_to(c2p_pos, shapes)
+
+
+def pos_dynamic_expand(pos_index, p2c_att, key_layer):
+ shapes = shape_list(p2c_att)[:2] + [shape_list(pos_index)[-2], shape_list(key_layer)[-2]]
+ return tf.broadcast_to(pos_index, shapes)
+
+
+def take_along_axis(x, indices):
+ # Only a valid port of np.take_along_axis when the gather axis is -1
+
+ # TPU + gathers and reshapes don't go along well -- see https://github.com/huggingface/transformers/issues/18239
+ if isinstance(tf.distribute.get_strategy(), tf.distribute.TPUStrategy):
+ # [B, S, P] -> [B, S, P, D]
+ one_hot_indices = tf.one_hot(indices, depth=x.shape[-1], dtype=x.dtype)
+
+ # if we ignore the first two dims, this is equivalent to multiplying a matrix (one hot) by a vector (x)
+ # grossly abusing notation: [B, S, P, D] . [B, S, D] = [B, S, P]
+ gathered = tf.einsum("ijkl,ijl->ijk", one_hot_indices, x)
+
+ # GPUs, on the other hand, prefer gathers instead of large one-hot+matmuls
+ else:
+ gathered = tf.gather(x, indices, batch_dims=2)
+
+ return gathered
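+ # Shape sketch (illustrative): for x of shape [B, S, D] and indices of shape [B, S, P], the result
+ # has shape [B, S, P] with result[b, s, p] = x[b, s, indices[b, s, p]]; the one-hot einsum branch
+ # (TPU) and the tf.gather(..., batch_dims=2) branch (GPU/CPU) are equivalent.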
+
+
+class TFDebertaV2DisentangledSelfAttention(keras.layers.Layer):
+ """
+ Disentangled self-attention module
+
+ Parameters:
+ config (`DebertaV2Config`):
+ A model config class instance with the configuration to build a new model. The schema is similar to
+ *BertConfig*; for more details, please refer to [`DebertaV2Config`]
+
+ """
+
+ def __init__(self, config: DebertaV2Config, **kwargs):
+ super().__init__(**kwargs)
+ if config.hidden_size % config.num_attention_heads != 0:
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+ f"heads ({config.num_attention_heads})"
+ )
+ self.num_attention_heads = config.num_attention_heads
+ _attention_head_size = config.hidden_size // config.num_attention_heads
+ self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+ self.query_proj = keras.layers.Dense(
+ self.all_head_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="query_proj",
+ use_bias=True,
+ )
+ self.key_proj = keras.layers.Dense(
+ self.all_head_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="key_proj",
+ use_bias=True,
+ )
+ self.value_proj = keras.layers.Dense(
+ self.all_head_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="value_proj",
+ use_bias=True,
+ )
+
+ self.share_att_key = getattr(config, "share_att_key", False)
+ self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []
+ self.relative_attention = getattr(config, "relative_attention", False)
+
+ if self.relative_attention:
+ self.position_buckets = getattr(config, "position_buckets", -1)
+ self.max_relative_positions = getattr(config, "max_relative_positions", -1)
+ if self.max_relative_positions < 1:
+ self.max_relative_positions = config.max_position_embeddings
+ self.pos_ebd_size = self.max_relative_positions
+ if self.position_buckets > 0:
+ self.pos_ebd_size = self.position_buckets
+
+ self.pos_dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="pos_dropout")
+
+ if not self.share_att_key:
+ if "c2p" in self.pos_att_type:
+ self.pos_key_proj = keras.layers.Dense(
+ self.all_head_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="pos_proj",
+ use_bias=True,
+ )
+ if "p2c" in self.pos_att_type:
+ self.pos_query_proj = keras.layers.Dense(
+ self.all_head_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="pos_q_proj",
+ )
+ self.softmax = TFDebertaV2XSoftmax(axis=-1)
+ self.dropout = TFDebertaV2StableDropout(config.attention_probs_dropout_prob, name="dropout")
+ self.config = config
+
+ def transpose_for_scores(self, tensor: tf.Tensor, attention_heads: int) -> tf.Tensor:
+ tensor_shape = shape_list(tensor)
+ # In graph mode, we can't reshape with -1 as the final dimension if the first dimension (batch size) is None
+ shape = tensor_shape[:-1] + [attention_heads, tensor_shape[-1] // attention_heads]
+ # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
+ tensor = tf.reshape(tensor=tensor, shape=shape)
+ tensor = tf.transpose(tensor, perm=[0, 2, 1, 3])
+ x_shape = shape_list(tensor)
+ tensor = tf.reshape(tensor, shape=[-1, x_shape[-2], x_shape[-1]])
+ return tensor
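+ # Shape sketch (illustrative sizes): with batch=2, seq=5, heads=8, head_size=64 the input of shape
+ # (2, 5, 512) becomes (2, 5, 8, 64) -> (2, 8, 5, 64) -> (16, 5, 64), i.e. the batch and head
+ # dimensions are folded together for the batched matmuls in `call` below.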
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ attention_mask: tf.Tensor,
+ query_states: Optional[tf.Tensor] = None,
+ relative_pos: Optional[tf.Tensor] = None,
+ rel_embeddings: Optional[tf.Tensor] = None,
+ output_attentions: bool = False,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ """
+ Call the module
+
+ Args:
+ hidden_states (`tf.Tensor`):
+ Input states to the module, usually the output from the previous layer; it will be the Q, K and V in
+ *Attention(Q,K,V)*
+
+ attention_mask (`tf.Tensor`):
+ An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
+ sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
+ th token.
+
+ output_attentions (`bool`, *optional*):
+ Whether to return the attention matrix.
+
+ query_states (`tf.Tensor`, *optional*):
+ The *Q* state in *Attention(Q,K,V)*.
+
+ relative_pos (`tf.Tensor`):
+ The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
+ values ranging in [*-max_relative_positions*, *max_relative_positions*].
+
+ rel_embeddings (`tf.Tensor`):
+ The embedding of relative distances. It's a tensor of shape [\\(2 \\times
+ \\text{max_relative_positions}\\), *hidden_size*].
+
+
+ """
+ if query_states is None:
+ query_states = hidden_states
+ query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads)
+ key_layer = self.transpose_for_scores(self.key_proj(hidden_states), self.num_attention_heads)
+ value_layer = self.transpose_for_scores(self.value_proj(hidden_states), self.num_attention_heads)
+
+ rel_att = None
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ scale_factor = 1
+ if "c2p" in self.pos_att_type:
+ scale_factor += 1
+ if "p2c" in self.pos_att_type:
+ scale_factor += 1
+ scale = tf.math.sqrt(tf.cast(shape_list(query_layer)[-1] * scale_factor, dtype=self.compute_dtype))
+ attention_scores = tf.matmul(query_layer, tf.transpose(key_layer, [0, 2, 1]) / scale)
+ if self.relative_attention:
+ rel_embeddings = self.pos_dropout(rel_embeddings)
+ rel_att = self.disentangled_att_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor)
+
+ if rel_att is not None:
+ attention_scores = attention_scores + rel_att
+ attention_scores = tf.reshape(
+ attention_scores,
+ (-1, self.num_attention_heads, shape_list(attention_scores)[-2], shape_list(attention_scores)[-1]),
+ )
+
+ # bsz x height x length x dimension
+ attention_probs = self.softmax(attention_scores, attention_mask)
+ attention_probs = self.dropout(attention_probs, training=training)
+ context_layer = tf.matmul(
+ tf.reshape(attention_probs, [-1, shape_list(attention_probs)[-2], shape_list(attention_probs)[-1]]),
+ value_layer,
+ )
+ context_layer = tf.transpose(
+ tf.reshape(
+ context_layer,
+ [-1, self.num_attention_heads, shape_list(context_layer)[-2], shape_list(context_layer)[-1]],
+ ),
+ [0, 2, 1, 3],
+ )
+ # Set the final dimension here explicitly.
+ # Calling tf.reshape(context_layer, (*context_layer_shape[:-2], -1)) raises an error when executing
+ # the model in graph mode as context_layer is reshaped to (None, 7, None) and Dense layer in TFDebertaV2SelfOutput
+ # requires final input dimension to be defined
+ context_layer_shape = shape_list(context_layer)
+ new_context_layer_shape = context_layer_shape[:-2] + [context_layer_shape[-2] * context_layer_shape[-1]]
+ context_layer = tf.reshape(context_layer, new_context_layer_shape)
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+ return outputs
+
+ def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor):
+ if relative_pos is None:
+ q = shape_list(query_layer)[-2]
+ relative_pos = build_relative_position(
+ q,
+ shape_list(key_layer)[-2],
+ bucket_size=self.position_buckets,
+ max_position=self.max_relative_positions,
+ )
+ shape_list_pos = shape_list(relative_pos)
+ if len(shape_list_pos) == 2:
+ relative_pos = tf.expand_dims(tf.expand_dims(relative_pos, 0), 0)
+ elif len(shape_list_pos) == 3:
+ relative_pos = tf.expand_dims(relative_pos, 1)
+ # bsz x height x query x key
+ elif len(shape_list_pos) != 4:
+ raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {len(shape_list_pos)}")
+
+ att_span = self.pos_ebd_size
+ rel_embeddings = tf.expand_dims(
+ rel_embeddings[self.pos_ebd_size - att_span : self.pos_ebd_size + att_span, :], 0
+ )
+ if self.share_att_key:
+ pos_query_layer = tf.tile(
+ self.transpose_for_scores(self.query_proj(rel_embeddings), self.num_attention_heads),
+ [shape_list(query_layer)[0] // self.num_attention_heads, 1, 1],
+ )
+ pos_key_layer = tf.tile(
+ self.transpose_for_scores(self.key_proj(rel_embeddings), self.num_attention_heads),
+ [shape_list(query_layer)[0] // self.num_attention_heads, 1, 1],
+ )
+ else:
+ if "c2p" in self.pos_att_type:
+ pos_key_layer = tf.tile(
+ self.transpose_for_scores(self.pos_key_proj(rel_embeddings), self.num_attention_heads),
+ [shape_list(query_layer)[0] // self.num_attention_heads, 1, 1],
+ ) # .split(self.all_head_size, dim=-1)
+ if "p2c" in self.pos_att_type:
+ pos_query_layer = tf.tile(
+ self.transpose_for_scores(self.pos_query_proj(rel_embeddings), self.num_attention_heads),
+ [shape_list(query_layer)[0] // self.num_attention_heads, 1, 1],
+ ) # .split(self.all_head_size, dim=-1)
+
+ score = 0
+ # content->position
+ if "c2p" in self.pos_att_type:
+ scale = tf.math.sqrt(tf.cast(shape_list(pos_key_layer)[-1] * scale_factor, dtype=self.compute_dtype))
+ c2p_att = tf.matmul(query_layer, tf.transpose(pos_key_layer, [0, 2, 1]))
+ c2p_pos = tf.clip_by_value(relative_pos + att_span, 0, att_span * 2 - 1)
+ c2p_att = take_along_axis(
+ c2p_att,
+ tf.broadcast_to(
+ tf.squeeze(c2p_pos, 0),
+ [shape_list(query_layer)[0], shape_list(query_layer)[1], shape_list(relative_pos)[-1]],
+ ),
+ )
+ score += c2p_att / scale
+
+ # position->content
+ if "p2c" in self.pos_att_type:
+ scale = tf.math.sqrt(tf.cast(shape_list(pos_query_layer)[-1] * scale_factor, dtype=self.compute_dtype))
+ if shape_list(key_layer)[-2] != shape_list(query_layer)[-2]:
+ r_pos = build_relative_position(
+ shape_list(key_layer)[-2],
+ shape_list(key_layer)[-2],
+ bucket_size=self.position_buckets,
+ max_position=self.max_relative_positions,
+ )
+ r_pos = tf.expand_dims(r_pos, 0)
+ else:
+ r_pos = relative_pos
+
+ p2c_pos = tf.clip_by_value(-r_pos + att_span, 0, att_span * 2 - 1)
+
+ p2c_att = tf.matmul(key_layer, tf.transpose(pos_query_layer, [0, 2, 1]))
+ p2c_att = tf.transpose(
+ take_along_axis(
+ p2c_att,
+ tf.broadcast_to(
+ tf.squeeze(p2c_pos, 0),
+ [shape_list(query_layer)[0], shape_list(key_layer)[-2], shape_list(key_layer)[-2]],
+ ),
+ ),
+ [0, 2, 1],
+ )
+ score += p2c_att / scale
+
+ return score
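+ # Scoring sketch (illustrative): the final attention logits are (Q.K^T + c2p + p2c) / sqrt(d * scale_factor),
+ # where the c2p term gathers content->position scores at the bucketed (query -> key) relative offsets
+ # and the p2c term gathers position->content scores at the mirrored (key -> query) offsets.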
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "query_proj", None) is not None:
+ with tf.name_scope(self.query_proj.name):
+ self.query_proj.build([None, None, self.config.hidden_size])
+ if getattr(self, "key_proj", None) is not None:
+ with tf.name_scope(self.key_proj.name):
+ self.key_proj.build([None, None, self.config.hidden_size])
+ if getattr(self, "value_proj", None) is not None:
+ with tf.name_scope(self.value_proj.name):
+ self.value_proj.build([None, None, self.config.hidden_size])
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+ if getattr(self, "pos_dropout", None) is not None:
+ with tf.name_scope(self.pos_dropout.name):
+ self.pos_dropout.build(None)
+ if getattr(self, "pos_key_proj", None) is not None:
+ with tf.name_scope(self.pos_key_proj.name):
+ self.pos_key_proj.build([None, None, self.config.hidden_size])
+ if getattr(self, "pos_query_proj", None) is not None:
+ with tf.name_scope(self.pos_query_proj.name):
+ self.pos_query_proj.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaEmbeddings Deberta->DebertaV2
+class TFDebertaV2Embeddings(keras.layers.Layer):
+ """Construct the embeddings from word, position and token_type embeddings."""
+
+ def __init__(self, config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.config = config
+ self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
+ self.hidden_size = config.hidden_size
+ self.max_position_embeddings = config.max_position_embeddings
+ self.position_biased_input = getattr(config, "position_biased_input", True)
+ self.initializer_range = config.initializer_range
+ if self.embedding_size != config.hidden_size:
+ self.embed_proj = keras.layers.Dense(
+ config.hidden_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="embed_proj",
+ use_bias=False,
+ )
+ self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+ self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
+
+ def build(self, input_shape=None):
+ with tf.name_scope("word_embeddings"):
+ self.weight = self.add_weight(
+ name="weight",
+ shape=[self.config.vocab_size, self.embedding_size],
+ initializer=get_initializer(self.initializer_range),
+ )
+
+ with tf.name_scope("token_type_embeddings"):
+ if self.config.type_vocab_size > 0:
+ self.token_type_embeddings = self.add_weight(
+ name="embeddings",
+ shape=[self.config.type_vocab_size, self.embedding_size],
+ initializer=get_initializer(self.initializer_range),
+ )
+ else:
+ self.token_type_embeddings = None
+
+ with tf.name_scope("position_embeddings"):
+ if self.position_biased_input:
+ self.position_embeddings = self.add_weight(
+ name="embeddings",
+ shape=[self.max_position_embeddings, self.hidden_size],
+ initializer=get_initializer(self.initializer_range),
+ )
+ else:
+ self.position_embeddings = None
+
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "LayerNorm", None) is not None:
+ with tf.name_scope(self.LayerNorm.name):
+ self.LayerNorm.build([None, None, self.config.hidden_size])
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+ if getattr(self, "embed_proj", None) is not None:
+ with tf.name_scope(self.embed_proj.name):
+ self.embed_proj.build([None, None, self.embedding_size])
+
+ def call(
+ self,
+ input_ids: Optional[tf.Tensor] = None,
+ position_ids: Optional[tf.Tensor] = None,
+ token_type_ids: Optional[tf.Tensor] = None,
+ inputs_embeds: Optional[tf.Tensor] = None,
+ mask: Optional[tf.Tensor] = None,
+ training: bool = False,
+ ) -> tf.Tensor:
+ """
+ Applies the embedding lookup to the input tensors.
+
+ Returns:
+ final_embeddings (`tf.Tensor`): output embedding tensor.
+ """
+ if input_ids is None and inputs_embeds is None:
+ raise ValueError("Need to provide either `input_ids` or `input_embeds`.")
+
+ if input_ids is not None:
+ check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+ inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
+
+ input_shape = shape_list(inputs_embeds)[:-1]
+
+ if token_type_ids is None:
+ token_type_ids = tf.fill(dims=input_shape, value=0)
+
+ if position_ids is None:
+ position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
+
+ final_embeddings = inputs_embeds
+ if self.position_biased_input:
+ position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
+ final_embeddings += position_embeds
+ if self.config.type_vocab_size > 0:
+ token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
+ final_embeddings += token_type_embeds
+
+ if self.embedding_size != self.hidden_size:
+ final_embeddings = self.embed_proj(final_embeddings)
+
+ final_embeddings = self.LayerNorm(final_embeddings)
+
+ if mask is not None:
+ if len(shape_list(mask)) != len(shape_list(final_embeddings)):
+ if len(shape_list(mask)) == 4:
+ mask = tf.squeeze(tf.squeeze(mask, axis=1), axis=1)
+ mask = tf.cast(tf.expand_dims(mask, axis=2), dtype=self.compute_dtype)
+
+ final_embeddings = final_embeddings * mask
+
+ final_embeddings = self.dropout(final_embeddings, training=training)
+
+ return final_embeddings
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaPredictionHeadTransform with Deberta->DebertaV2
+class TFDebertaV2PredictionHeadTransform(keras.layers.Layer):
+ def __init__(self, config: DebertaV2Config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
+
+ self.dense = keras.layers.Dense(
+ units=self.embedding_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="dense",
+ )
+
+ if isinstance(config.hidden_act, str):
+ self.transform_act_fn = get_tf_activation(config.hidden_act)
+ else:
+ self.transform_act_fn = config.hidden_act
+ self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.hidden_size])
+ if getattr(self, "LayerNorm", None) is not None:
+ with tf.name_scope(self.LayerNorm.name):
+ self.LayerNorm.build([None, None, self.embedding_size])
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLMPredictionHead with Deberta->DebertaV2
+class TFDebertaV2LMPredictionHead(keras.layers.Layer):
+ def __init__(self, config: DebertaV2Config, input_embeddings: keras.layers.Layer, **kwargs):
+ super().__init__(**kwargs)
+
+ self.config = config
+ self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
+
+ self.transform = TFDebertaV2PredictionHeadTransform(config, name="transform")
+
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
+ self.input_embeddings = input_embeddings
+
+ def build(self, input_shape=None):
+ self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
+
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "transform", None) is not None:
+ with tf.name_scope(self.transform.name):
+ self.transform.build(None)
+
+ def get_output_embeddings(self) -> keras.layers.Layer:
+ return self.input_embeddings
+
+ def set_output_embeddings(self, value: tf.Variable):
+ self.input_embeddings.weight = value
+ self.input_embeddings.vocab_size = shape_list(value)[0]
+
+ def get_bias(self) -> Dict[str, tf.Variable]:
+ return {"bias": self.bias}
+
+ def set_bias(self, value: tf.Variable):
+ self.bias = value["bias"]
+ self.config.vocab_size = shape_list(value["bias"])[0]
+
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+ hidden_states = self.transform(hidden_states=hidden_states)
+ seq_length = shape_list(hidden_states)[1]
+ hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
+ hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
+ hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
+ hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
+
+ return hidden_states
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaOnlyMLMHead with Deberta->DebertaV2
+class TFDebertaV2OnlyMLMHead(keras.layers.Layer):
+ def __init__(self, config: DebertaV2Config, input_embeddings: keras.layers.Layer, **kwargs):
+ super().__init__(**kwargs)
+ self.predictions = TFDebertaV2LMPredictionHead(config, input_embeddings, name="predictions")
+
+ def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
+ prediction_scores = self.predictions(hidden_states=sequence_output)
+
+ return prediction_scores
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "predictions", None) is not None:
+ with tf.name_scope(self.predictions.name):
+ self.predictions.build(None)
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaMainLayer with Deberta->DebertaV2
+class TFDebertaV2MainLayer(keras.layers.Layer):
+ config_class = DebertaV2Config
+
+ def __init__(self, config: DebertaV2Config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.config = config
+
+ self.embeddings = TFDebertaV2Embeddings(config, name="embeddings")
+ self.encoder = TFDebertaV2Encoder(config, name="encoder")
+
+ def get_input_embeddings(self) -> keras.layers.Layer:
+ return self.embeddings
+
+ def set_input_embeddings(self, value: tf.Variable):
+ self.embeddings.weight = value
+ self.embeddings.vocab_size = shape_list(value)[0]
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ raise NotImplementedError
+
+ @unpack_inputs
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ input_shape = shape_list(input_ids)
+ elif inputs_embeds is not None:
+ input_shape = shape_list(inputs_embeds)[:-1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if attention_mask is None:
+ attention_mask = tf.fill(dims=input_shape, value=1)
+
+ if token_type_ids is None:
+ token_type_ids = tf.fill(dims=input_shape, value=0)
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ token_type_ids=token_type_ids,
+ inputs_embeds=inputs_embeds,
+ mask=attention_mask,
+ training=training,
+ )
+
+ encoder_outputs = self.encoder(
+ hidden_states=embedding_output,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ sequence_output = encoder_outputs[0]
+
+ if not return_dict:
+ return (sequence_output,) + encoder_outputs[1:]
+
+ return TFBaseModelOutput(
+ last_hidden_state=sequence_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "embeddings", None) is not None:
+ with tf.name_scope(self.embeddings.name):
+ self.embeddings.build(None)
+ if getattr(self, "encoder", None) is not None:
+ with tf.name_scope(self.encoder.name):
+ self.encoder.build(None)
+
+
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaPreTrainedModel with Deberta->DebertaV2
+class TFDebertaV2PreTrainedModel(TFPreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = DebertaV2Config
+ base_model_prefix = "deberta"
+
+
+DEBERTA_START_DOCSTRING = r"""
+ The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
+ Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built
+ on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and an enhanced mask decoder. With those
+ two improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB of pretraining data.
+
+ This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+ as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+ behavior.
+
+
+
+ TensorFlow models and layers in `transformers` accept two formats as input:
+
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional argument.
+
+ The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+ and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+ pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+ format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+ the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+ positional argument:
+
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+ Note that when creating models and layers with
+ [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+ about any of this, as you can just pass inputs like you would to any other Python function!
+
+
+
+ Parameters:
+ config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DEBERTA_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.",
+ DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaModel with Deberta->DebertaV2
+class TFDebertaV2Model(TFDebertaV2PreTrainedModel):
+ def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.deberta = TFDebertaV2MainLayer(config, name="deberta")
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFBaseModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
+ outputs = self.deberta(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deberta", None) is not None:
+ with tf.name_scope(self.deberta.name):
+ self.deberta.build(None)
+
+
+@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaForMaskedLM with Deberta->DebertaV2
+class TFDebertaV2ForMaskedLM(TFDebertaV2PreTrainedModel, TFMaskedLanguageModelingLoss):
+ def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ if config.is_decoder:
+ logger.warning(
+ "If you want to use `TFDebertaV2ForMaskedLM` make sure `config.is_decoder=False` for "
+ "bi-directional self-attention."
+ )
+
+ self.deberta = TFDebertaV2MainLayer(config, name="deberta")
+ self.mlm = TFDebertaV2OnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls")
+
+ def get_lm_head(self) -> keras.layers.Layer:
+ return self.mlm.predictions
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFMaskedLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
+ r"""
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
+ the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ """
+ outputs = self.deberta(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ sequence_output = outputs[0]
+ prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFMaskedLMOutput(
+ loss=loss,
+ logits=prediction_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deberta", None) is not None:
+ with tf.name_scope(self.deberta.name):
+ self.deberta.build(None)
+ if getattr(self, "mlm", None) is not None:
+ with tf.name_scope(self.mlm.name):
+ self.mlm.build(None)
+
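+# A hedged sketch of mask filling with this head (the checkpoint name is illustrative;
+# any DeBERTa-v2 checkpoint with an MLM head and TF weights will do):
+#
+#     import tensorflow as tf
+#     from transformers import AutoTokenizer, TFDebertaV2ForMaskedLM
+#
+#     tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
+#     model = TFDebertaV2ForMaskedLM.from_pretrained("microsoft/deberta-v3-base")
+#     inputs = tokenizer("The capital of France is [MASK].", return_tensors="tf")
+#     logits = model(**inputs).logits
+#     mask_pos = int(tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0, 0])
+#     predicted_id = int(tf.argmax(logits[0, mask_pos]))
+#     print(tokenizer.decode([predicted_id]))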
+
+@add_start_docstrings(
+ """
+ DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+ pooled output) e.g. for GLUE tasks.
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaForSequenceClassification with Deberta->DebertaV2
+class TFDebertaV2ForSequenceClassification(TFDebertaV2PreTrainedModel, TFSequenceClassificationLoss):
+ def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.num_labels = config.num_labels
+
+ self.deberta = TFDebertaV2MainLayer(config, name="deberta")
+ self.pooler = TFDebertaV2ContextPooler(config, name="pooler")
+
+ drop_out = getattr(config, "cls_dropout", None)
+ drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
+ self.dropout = TFDebertaV2StableDropout(drop_out, name="cls_dropout")
+ self.classifier = keras.layers.Dense(
+ units=config.num_labels,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="classifier",
+ )
+ self.output_dim = self.pooler.output_dim
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFSequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
+ r"""
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ outputs = self.deberta(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ sequence_output = outputs[0]
+ pooled_output = self.pooler(sequence_output, training=training)
+ pooled_output = self.dropout(pooled_output, training=training)
+ logits = self.classifier(pooled_output)
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+
+ return ((loss,) + output) if loss is not None else output
+
+ return TFSequenceClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deberta", None) is not None:
+ with tf.name_scope(self.deberta.name):
+ self.deberta.build(None)
+ if getattr(self, "pooler", None) is not None:
+ with tf.name_scope(self.pooler.name):
+ self.pooler.build(None)
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+ if getattr(self, "classifier", None) is not None:
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, self.output_dim])
+
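+# Illustrative note on the `labels` contract above: `config.num_labels == 1` trains the head
+# as a regressor (mean-squared error), anything larger as a classifier (cross-entropy). A
+# randomly initialized sketch, for instance:
+#
+#     from transformers import DebertaV2Config, TFDebertaV2ForSequenceClassification
+#
+#     config = DebertaV2Config(num_labels=3)                 # 3-way classification head
+#     model = TFDebertaV2ForSequenceClassification(config)   # random weights, for illustration
+#     # for regression, use DebertaV2Config(num_labels=1) and float-valued labels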
+
+@add_start_docstrings(
+ """
+ DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+ Named-Entity-Recognition (NER) tasks.
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaForTokenClassification with Deberta->DebertaV2
+class TFDebertaV2ForTokenClassification(TFDebertaV2PreTrainedModel, TFTokenClassificationLoss):
+ def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.num_labels = config.num_labels
+
+ self.deberta = TFDebertaV2MainLayer(config, name="deberta")
+ self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+ self.classifier = keras.layers.Dense(
+ units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+ )
+ self.config = config
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFTokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
+ r"""
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+ """
+ outputs = self.deberta(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ sequence_output = outputs[0]
+ sequence_output = self.dropout(sequence_output, training=training)
+ logits = self.classifier(inputs=sequence_output)
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFTokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deberta", None) is not None:
+ with tf.name_scope(self.deberta.name):
+ self.deberta.build(None)
+ if getattr(self, "classifier", None) is not None:
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+ """
+ DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+ layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaForQuestionAnswering with Deberta->DebertaV2
+class TFDebertaV2ForQuestionAnswering(TFDebertaV2PreTrainedModel, TFQuestionAnsweringLoss):
+ def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.num_labels = config.num_labels
+
+ self.deberta = TFDebertaV2MainLayer(config, name="deberta")
+ self.qa_outputs = keras.layers.Dense(
+ units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
+ )
+ self.config = config
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFQuestionAnsweringModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ start_positions: np.ndarray | tf.Tensor | None = None,
+ end_positions: np.ndarray | tf.Tensor | None = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
+ r"""
+ start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ """
+ outputs = self.deberta(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ sequence_output = outputs[0]
+ logits = self.qa_outputs(inputs=sequence_output)
+ start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
+ start_logits = tf.squeeze(input=start_logits, axis=-1)
+ end_logits = tf.squeeze(input=end_logits, axis=-1)
+ loss = None
+
+ if start_positions is not None and end_positions is not None:
+ labels = {"start_position": start_positions}
+ labels["end_position"] = end_positions
+ loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))
+
+ if not return_dict:
+ output = (start_logits, end_logits) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFQuestionAnsweringModelOutput(
+ loss=loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deberta", None) is not None:
+ with tf.name_scope(self.deberta.name):
+ self.deberta.build(None)
+ if getattr(self, "qa_outputs", None) is not None:
+ with tf.name_scope(self.qa_outputs.name):
+ self.qa_outputs.build([None, None, self.config.hidden_size])
+
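+# A rough sketch of turning the start/end logits into an answer span. The checkpoint name,
+# question, and context below are placeholders:
+#
+#     import tensorflow as tf
+#     from transformers import AutoTokenizer, TFDebertaV2ForQuestionAnswering
+#
+#     tokenizer = AutoTokenizer.from_pretrained("a-deberta-v2-checkpoint-finetuned-on-squad")
+#     model = TFDebertaV2ForQuestionAnswering.from_pretrained("a-deberta-v2-checkpoint-finetuned-on-squad")
+#     inputs = tokenizer("Who wrote it?", "The report was written by Ada.", return_tensors="tf")
+#     outputs = model(**inputs)
+#     start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
+#     end = int(tf.argmax(outputs.end_logits, axis=-1)[0])
+#     answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])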
+
+@add_start_docstrings(
+ """
+ DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+ softmax) e.g. for RocStories/SWAG tasks.
+ """,
+ DEBERTA_START_DOCSTRING,
+)
+class TFDebertaV2ForMultipleChoice(TFDebertaV2PreTrainedModel, TFMultipleChoiceLoss):
+ # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
+ # _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"]
+ # _keys_to_ignore_on_load_missing = [r"dropout"]
+
+ def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.deberta = TFDebertaV2MainLayer(config, name="deberta")
+ self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+ self.pooler = TFDebertaV2ContextPooler(config, name="pooler")
+ self.classifier = keras.layers.Dense(
+ units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+ )
+ self.output_dim = self.pooler.output_dim
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFMultipleChoiceModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: Optional[bool] = False,
+ ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
+ r"""
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
+ where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
+ """
+ if input_ids is not None:
+ num_choices = shape_list(input_ids)[1]
+ seq_length = shape_list(input_ids)[2]
+ else:
+ num_choices = shape_list(inputs_embeds)[1]
+ seq_length = shape_list(inputs_embeds)[2]
+
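+ # Flatten the choice dimension so each (example, choice) pair is encoded as its own sequence
+ # of shape (batch_size * num_choices, seq_length); the per-choice logits are reshaped back to
+ # (batch_size, num_choices) below before computing the loss.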
+ flat_input_ids = tf.reshape(tensor=input_ids, shape=(-1, seq_length)) if input_ids is not None else None
+ flat_attention_mask = (
+ tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None
+ )
+ flat_token_type_ids = (
+ tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None
+ )
+ flat_position_ids = (
+ tf.reshape(tensor=position_ids, shape=(-1, seq_length)) if position_ids is not None else None
+ )
+ flat_inputs_embeds = (
+ tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3]))
+ if inputs_embeds is not None
+ else None
+ )
+ outputs = self.deberta(
+ input_ids=flat_input_ids,
+ attention_mask=flat_attention_mask,
+ token_type_ids=flat_token_type_ids,
+ position_ids=flat_position_ids,
+ inputs_embeds=flat_inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ sequence_output = outputs[0]
+ pooled_output = self.pooler(sequence_output, training=training)
+ pooled_output = self.dropout(pooled_output, training=training)
+ logits = self.classifier(pooled_output)
+ reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits)
+
+ if not return_dict:
+ output = (reshaped_logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFMultipleChoiceModelOutput(
+ loss=loss,
+ logits=reshaped_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deberta", None) is not None:
+ with tf.name_scope(self.deberta.name):
+ self.deberta.build(None)
+ if getattr(self, "pooler", None) is not None:
+ with tf.name_scope(self.pooler.name):
+ self.pooler.build(None)
+ if getattr(self, "classifier", None) is not None:
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, self.output_dim])
+
+
+__all__ = [
+ "TFDebertaV2ForMaskedLM",
+ "TFDebertaV2ForQuestionAnswering",
+ "TFDebertaV2ForMultipleChoice",
+ "TFDebertaV2ForSequenceClassification",
+ "TFDebertaV2ForTokenClassification",
+ "TFDebertaV2Model",
+ "TFDebertaV2PreTrainedModel",
+]
diff --git a/docs/transformers/build/lib/transformers/models/deberta_v2/tokenization_deberta_v2.py b/docs/transformers/build/lib/transformers/models/deberta_v2/tokenization_deberta_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8b23a962d420cc241ee970e830accca4716d0c2
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deberta_v2/tokenization_deberta_v2.py
@@ -0,0 +1,526 @@
+# coding=utf-8
+# Copyright 2020 Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for model DeBERTa."""
+
+import os
+import unicodedata
+from typing import Any, Dict, List, Optional, Tuple
+
+import sentencepiece as sp
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+from ...utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+
+
+VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
+
+
+@requires(backends=("sentencepiece",))
+class DebertaV2Tokenizer(PreTrainedTokenizer):
+ r"""
+ Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+
+ Args:
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+ contains the vocabulary necessary to instantiate a tokenizer.
+ do_lower_case (`bool`, *optional*, defaults to `False`):
+ Whether or not to lowercase the input when tokenizing.
+ bos_token (`string`, *optional*, defaults to `"[CLS]"`):
+ The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+ eos_token (`string`, *optional*, defaults to `"[SEP]"`):
+ The end of sequence token. When building a sequence using special tokens, this is not the token that is
+ used for the end of sequence. The token used is the `sep_token`.
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+ sequence classification or for a text and a question for question answering. It is also used as the last
+ token of a sequence built with special tokens.
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+ The token used for padding, for example when batching sequences of different lengths.
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+ SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+ to set:
+
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
+ using the forward-filtering-and-backward-sampling algorithm.
+
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ BPE-dropout.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+
+ def __init__(
+ self,
+ vocab_file,
+ do_lower_case=False,
+ split_by_punct=False,
+ bos_token="[CLS]",
+ eos_token="[SEP]",
+ unk_token="[UNK]",
+ sep_token="[SEP]",
+ pad_token="[PAD]",
+ cls_token="[CLS]",
+ mask_token="[MASK]",
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ **kwargs,
+ ) -> None:
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+ if not os.path.isfile(vocab_file):
+ raise ValueError(
+ f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+ " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+ )
+ self.do_lower_case = do_lower_case
+ self.split_by_punct = split_by_punct
+ self.vocab_file = vocab_file
+ self._tokenizer = SPMTokenizer(
+ vocab_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
+ )
+ unk_token = AddedToken(unk_token, normalized=True, special=True) if isinstance(unk_token, str) else unk_token
+ super().__init__(
+ do_lower_case=do_lower_case,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ split_by_punct=split_by_punct,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+ self._tokenizer.special_tokens = self.all_special_tokens
+
+ @property
+ def vocab_size(self):
+ return len(self.vocab)
+
+ @property
+ def vocab(self):
+ return self._tokenizer.vocab
+
+ def get_vocab(self):
+ vocab = self.vocab.copy()
+ vocab.update(self.get_added_vocab())
+ return vocab
+
+ def _tokenize(self, text: str) -> List[str]:
+ """Take as input a string and return a list of strings (tokens) for words/sub-words"""
+ if self.do_lower_case:
+ text = text.lower()
+ return self._tokenizer.tokenize(text)
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self._tokenizer.spm.PieceToId(token)
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self._tokenizer.spm.IdToPiece(index) if index < self.vocab_size else self.unk_token
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ return self._tokenizer.decode(tokens)
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ """
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+ adding special tokens. A DeBERTa sequence has the following format:
+
+ - single sequence: [CLS] X [SEP]
+ - pair of sequences: [CLS] A [SEP] B [SEP]
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+
+ if token_ids_1 is None:
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+ cls = [self.cls_token_id]
+ sep = [self.sep_token_id]
+ return cls + token_ids_0 + sep + token_ids_1 + sep
+
+ def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+ """
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ if token_ids_1 is not None:
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+ return [1] + ([0] * len(token_ids_0)) + [1]
+
+ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+ """
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
+ sequence pair mask has the following format:
+
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
+
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+ """
+ sep = [self.sep_token_id]
+ cls = [self.cls_token_id]
+ if token_ids_1 is None:
+ return len(cls + token_ids_0 + sep) * [0]
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+ add_prefix_space = kwargs.pop("add_prefix_space", False)
+ if is_split_into_words or add_prefix_space:
+ text = " " + text
+ return (text, kwargs)
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ return self._tokenizer.save_pretrained(save_directory, filename_prefix=filename_prefix)
+
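+# A small, illustrative sketch of the special-token layout documented above (the "spm.model"
+# path is a placeholder for any DeBERTa-v2 SentencePiece vocabulary file):
+#
+#     tokenizer = DebertaV2Tokenizer("spm.model")
+#     single = tokenizer.build_inputs_with_special_tokens([5, 6])              # [CLS] 5 6 [SEP]
+#     pair = tokenizer.build_inputs_with_special_tokens([5, 6], [7, 8])        # [CLS] 5 6 [SEP] 7 8 [SEP]
+#     types = tokenizer.create_token_type_ids_from_sequences([5, 6], [7, 8])   # [0, 0, 0, 0, 1, 1, 1]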
+
+class SPMTokenizer:
+ r"""
+ Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).
+
+ Args:
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+ contains the vocabulary necessary to instantiate a tokenizer.
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+ SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+ to set:
+
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
+ using the forward-filtering-and-backward-sampling algorithm.
+
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ BPE-dropout.
+ """
+
+ def __init__(
+ self, vocab_file, special_tokens, split_by_punct=False, sp_model_kwargs: Optional[Dict[str, Any]] = None
+ ):
+ self.split_by_punct = split_by_punct
+ self.vocab_file = vocab_file
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
+ if not os.path.exists(vocab_file):
+ raise FileNotFoundError(f"{vocab_file} does not exist!")
+ spm.load(vocab_file)
+ bpe_vocab_size = spm.GetPieceSize()
+ # Token map
+ # <unk> 0+1
+ # <s> 1+1
+ # </s> 2+1
+ self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}
+ self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
+ # self.vocab['[PAD]'] = 0
+ # self.vocab['[CLS]'] = 1
+ # self.vocab['[SEP]'] = 2
+ # self.vocab['[UNK]'] = 3
+
+ self.spm = spm
+ self.special_tokens = special_tokens
+
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ state["spm"] = None
+ return state
+
+ def __setstate__(self, d):
+ self.__dict__ = d
+
+ # for backward compatibility
+ if not hasattr(self, "sp_model_kwargs"):
+ self.sp_model_kwargs = {}
+
+ self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.spm.Load(self.vocab_file)
+
+ def tokenize(self, text):
+ return self._encode_as_pieces(text)
+
+ def convert_ids_to_tokens(self, ids):
+ tokens = []
+ for i in ids:
+ tokens.append(self.ids_to_tokens[i])
+ return tokens
+
+ def decode(self, tokens, start=-1, end=-1, raw_text=None):
+ if raw_text is None:
+ current_sub_tokens = []
+ out_string = ""
+ prev_is_special = False
+ for token in tokens:
+ # make sure that special tokens are not decoded using sentencepiece model
+ if token in self.special_tokens:
+ if not prev_is_special:
+ out_string += " "
+ out_string += self.spm.decode_pieces(current_sub_tokens) + token
+ prev_is_special = True
+ current_sub_tokens = []
+ else:
+ current_sub_tokens.append(token)
+ prev_is_special = False
+ out_string += self.spm.decode_pieces(current_sub_tokens)
+ return out_string.strip()
+ else:
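+ # With raw text available, map each token index back to the word it came from and return
+ # the span of the original text covered by tokens[start:end].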
+ words = self.split_to_words(raw_text)
+ word_tokens = [self.tokenize(w) for w in words]
+ token2words = [0] * len(tokens)
+ tid = 0
+ for i, w in enumerate(word_tokens):
+ for k, t in enumerate(w):
+ token2words[tid] = i
+ tid += 1
+ word_start = token2words[start]
+ word_end = token2words[end] if end < len(tokens) else len(words)
+ text = "".join(words[word_start:word_end])
+ return text
+
+ # TODO add a deprecation cycle as this can have different behaviour from our API
+ def add_special_token(self, token):
+ if token not in self.special_tokens:
+ self.special_tokens.append(token)
+ if token not in self.vocab:
+ self.vocab[token] = len(self.vocab) - 1
+ self.ids_to_tokens.append(token)
+ return self.id(token)
+
+ def part_of_whole_word(self, token, is_bos=False):
+ logger.warning_once(
+ "The `DebertaTokenizer.part_of_whole_word` method is deprecated and will be removed in `transformers==4.35`"
+ )
+ if is_bos:
+ return True
+ if (
+ len(token) == 1
+ and (_is_whitespace(list(token)[0]) or _is_control(list(token)[0]) or _is_punctuation(list(token)[0]))
+ ) or token in self.special_tokens:
+ return False
+
+ word_start = b"\xe2\x96\x81".decode("utf-8")
+ return not token.startswith(word_start)
+
+ def pad(self):
+ return "[PAD]"
+
+ def bos(self):
+ return "[CLS]"
+
+ def eos(self):
+ return "[SEP]"
+
+ def unk(self):
+ return "[UNK]"
+
+ def mask(self):
+ return "[MASK]"
+
+ def sym(self, id):
+ return self.ids_to_tokens[id]
+
+ def id(self, sym):
+ logger.warning_once(
+ "The `DebertaTokenizer.id` method is deprecated and will be removed in `transformers==4.35`"
+ )
+ return self.vocab[sym] if sym in self.vocab else 1
+
+ def _encode_as_pieces(self, text):
+ text = convert_to_unicode(text)
+ if self.split_by_punct:
+ words = self._run_split_on_punc(text)
+ pieces = [self.spm.encode(w, out_type=str) for w in words]
+ return [p for w in pieces for p in w]
+ else:
+ return self.spm.encode(text, out_type=str)
+
+ def split_to_words(self, text):
+ pieces = self._encode_as_pieces(text)
+ word_start = b"\xe2\x96\x81".decode("utf-8")
+ words = []
+ offset = 0
+ prev_end = 0
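+ # Align the SentencePiece pieces with character offsets in the original text; pieces that
+ # begin with the "▁" marker start a new word.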
+ for i, p in enumerate(pieces):
+ if p.startswith(word_start):
+ if offset > prev_end:
+ words.append(text[prev_end:offset])
+ prev_end = offset
+ w = p.replace(word_start, "")
+ else:
+ w = p
+ try:
+ s = text.index(w, offset)
+ pn = ""
+ k = i + 1
+ while k < len(pieces):
+ pn = pieces[k].replace(word_start, "")
+ if len(pn) > 0:
+ break
+ k += 1
+
+ if len(pn) > 0 and pn in text[offset:s]:
+ offset = offset + 1
+ else:
+ offset = s + len(w)
+ except Exception:
+ offset = offset + 1
+
+ if prev_end < offset:
+ words.append(text[prev_end:offset])
+
+ return words
+
+ def _run_split_on_punc(self, text):
+ """Splits punctuation on a piece of text."""
+ chars = list(text)
+ i = 0
+ start_new_word = True
+ output = []
+ while i < len(chars):
+ char = chars[i]
+ if _is_punctuation(char):
+ output.append([char])
+ start_new_word = True
+ else:
+ if start_new_word:
+ output.append([])
+ start_new_word = False
+ output[-1].append(char)
+ i += 1
+
+ return ["".join(x) for x in output]
+
+ def save_pretrained(self, path: str, filename_prefix: Optional[str] = None):
+ filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
+ if filename_prefix is not None:
+ filename = filename_prefix + "-" + filename
+ full_path = os.path.join(path, filename)
+ with open(full_path, "wb") as fs:
+ fs.write(self.spm.serialized_model_proto())
+ return (full_path,)
+
+
+def _is_whitespace(char):
+ """Checks whether `chars` is a whitespace character."""
+ # \t, \n, and \r are technically control characters but we treat them
+ # as whitespace since they are generally considered as such.
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
+ return True
+ cat = unicodedata.category(char)
+ if cat == "Zs":
+ return True
+ return False
+
+
+def _is_control(char):
+ """Checks whether `chars` is a control character."""
+ # These are technically control characters but we count them as whitespace
+ # characters.
+ if char == "\t" or char == "\n" or char == "\r":
+ return False
+ cat = unicodedata.category(char)
+ if cat.startswith("C"):
+ return True
+ return False
+
+
+def _is_punctuation(char):
+ """Checks whether `chars` is a punctuation character."""
+ cp = ord(char)
+ # We treat all non-letter/number ASCII as punctuation.
+ # Characters such as "^", "$", and "`" are not in the Unicode
+ # Punctuation class but we treat them as punctuation anyways, for
+ # consistency.
+ if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
+ return True
+ cat = unicodedata.category(char)
+ if cat.startswith("P"):
+ return True
+ return False
+
+
+def convert_to_unicode(text):
+ """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, bytes):
+ return text.decode("utf-8", "ignore")
+ else:
+ raise TypeError(f"Unsupported string type: {type(text)}")
+
+
+__all__ = ["DebertaV2Tokenizer"]
diff --git a/docs/transformers/build/lib/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py b/docs/transformers/build/lib/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..784e82995419029d1b560aafaacab1aa87ddd0a9
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py
@@ -0,0 +1,223 @@
+# coding=utf-8
+# Copyright 2020 Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Tokenization class for model DeBERTa."""
+
+import os
+from shutil import copyfile
+from typing import Optional, Tuple
+
+from ...file_utils import is_sentencepiece_available
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+
+
+if is_sentencepiece_available():
+ from .tokenization_deberta_v2 import DebertaV2Tokenizer
+else:
+ DebertaV2Tokenizer = None
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "spm.model", "tokenizer_file": "tokenizer.json"}
+
+
+class DebertaV2TokenizerFast(PreTrainedTokenizerFast):
+ r"""
+ Constructs a DeBERTa-v2 fast tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+
+ Args:
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+ contains the vocabulary necessary to instantiate a tokenizer.
+ do_lower_case (`bool`, *optional*, defaults to `False`):
+ Whether or not to lowercase the input when tokenizing.
+ bos_token (`string`, *optional*, defaults to `"[CLS]"`):
+ The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+ eos_token (`string`, *optional*, defaults to `"[SEP]"`):
+ The end of sequence token. When building a sequence using special tokens, this is not the token that is
+ used for the end of sequence. The token used is the `sep_token`.
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+ sequence classification or for a text and a question for question answering. It is also used as the last
+ token of a sequence built with special tokens.
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+ The token used for padding, for example when batching sequences of different lengths.
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+ SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+ to set:
+
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
+ using the forward-filtering-and-backward-sampling algorithm.
+
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ BPE-dropout.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ slow_tokenizer_class = DebertaV2Tokenizer
+
+ def __init__(
+ self,
+ vocab_file=None,
+ tokenizer_file=None,
+ do_lower_case=False,
+ split_by_punct=False,
+ bos_token="[CLS]",
+ eos_token="[SEP]",
+ unk_token="[UNK]",
+ sep_token="[SEP]",
+ pad_token="[PAD]",
+ cls_token="[CLS]",
+ mask_token="[MASK]",
+ **kwargs,
+ ) -> None:
+ super().__init__(
+ vocab_file,
+ tokenizer_file=tokenizer_file,
+ do_lower_case=do_lower_case,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ split_by_punct=split_by_punct,
+ **kwargs,
+ )
+
+ self.do_lower_case = do_lower_case
+ self.split_by_punct = split_by_punct
+ self.vocab_file = vocab_file
+
+ @property
+ def can_save_slow_tokenizer(self) -> bool:
+ return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ """
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+ adding special tokens. A DeBERTa sequence has the following format:
+
+ - single sequence: [CLS] X [SEP]
+ - pair of sequences: [CLS] A [SEP] B [SEP]
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+
+ if token_ids_1 is None:
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+ cls = [self.cls_token_id]
+ sep = [self.sep_token_id]
+ return cls + token_ids_0 + sep + token_ids_1 + sep
+
+ def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+ """
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ if token_ids_1 is not None:
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+ return [1] + ([0] * len(token_ids_0)) + [1]
+
+ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+ """
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
+ sequence pair mask has the following format:
+
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
+
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+ """
+ sep = [self.sep_token_id]
+ cls = [self.cls_token_id]
+ if token_ids_1 is None:
+ return len(cls + token_ids_0 + sep) * [0]
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not self.can_save_slow_tokenizer:
+ raise ValueError(
+ "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+ "tokenizer."
+ )
+
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+
+ return (out_vocab_file,)
+
+
+__all__ = ["DebertaV2TokenizerFast"]
diff --git a/docs/transformers/build/lib/transformers/models/decision_transformer/__init__.py b/docs/transformers/build/lib/transformers/models/decision_transformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..455f7ffec5dee0567039cfd844588cb991c15622
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/decision_transformer/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_decision_transformer import *
+ from .modeling_decision_transformer import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/decision_transformer/configuration_decision_transformer.py b/docs/transformers/build/lib/transformers/models/decision_transformer/configuration_decision_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e677206aa089ce6befe5255f37868267f5b5bbc2
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/decision_transformer/configuration_decision_transformer.py
@@ -0,0 +1,157 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Decision Transformer model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DecisionTransformerConfig(PretrainedConfig):
+ """
+ This is the configuration class to store the configuration of a [`DecisionTransformerModel`]. It is used to
+ instantiate a Decision Transformer model according to the specified arguments, defining the model architecture.
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the standard
+ DecisionTransformer architecture. Many of the config options are used to instantiate the GPT2 model that is used as
+ part of the architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ state_dim (`int`, *optional*, defaults to 17):
+ The state size for the RL environment
+ act_dim (`int`, *optional*, defaults to 4):
+ The size of the output action space
+ hidden_size (`int`, *optional*, defaults to 128):
+ The size of the hidden layers
+ max_ep_len (`int`, *optional*, defaults to 4096):
+ The maximum length of an episode in the environment
+ action_tanh (`bool`, *optional*, defaults to True):
+ Whether to use a tanh activation on action prediction
+ vocab_size (`int`, *optional*, defaults to 1):
+ Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
+ `input_ids` passed when calling [`DecisionTransformerModel`].
+ n_positions (`int`, *optional*, defaults to 1024):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ n_layer (`int`, *optional*, defaults to 3):
+ Number of hidden layers in the Transformer encoder.
+ n_head (`int`, *optional*, defaults to 1):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ n_inner (`int`, *optional*):
+ Dimensionality of the inner feed-forward layers. If unset, will default to 4 times `n_embd`.
+ activation_function (`str`, *optional*, defaults to `"gelu"`):
+ Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+ resid_pdrop (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ embd_pdrop (`int`, *optional*, defaults to 0.1):
+ The dropout ratio for the embeddings.
+ attn_pdrop (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention.
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+ The epsilon to use in the layer normalization layers.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ scale_attn_weights (`bool`, *optional*, defaults to `True`):
+ Scale attention weights by dividing by sqrt(hidden_size).
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models).
+ scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+ Whether to additionally scale attention weights by `1 / layer_idx + 1`.
+ reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+ Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
+ dot-product/softmax to float() when training with mixed precision.
+
+ Example:
+
+ ```python
+ >>> from transformers import DecisionTransformerConfig, DecisionTransformerModel
+
+ >>> # Initializing a DecisionTransformer configuration
+ >>> configuration = DecisionTransformerConfig()
+
+ >>> # Initializing a model (with random weights) from the configuration
+ >>> model = DecisionTransformerModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "decision_transformer"
+ keys_to_ignore_at_inference = ["past_key_values"]
+ attribute_map = {
+ "max_position_embeddings": "n_positions",
+ "num_attention_heads": "n_head",
+ "num_hidden_layers": "n_layer",
+ }
+
+ def __init__(
+ self,
+ state_dim=17,
+ act_dim=4,
+ hidden_size=128,
+ max_ep_len=4096,
+ action_tanh=True,
+ vocab_size=1,
+ n_positions=1024,
+ n_layer=3,
+ n_head=1,
+ n_inner=None,
+ activation_function="relu",
+ resid_pdrop=0.1,
+ embd_pdrop=0.1,
+ attn_pdrop=0.1,
+ layer_norm_epsilon=1e-5,
+ initializer_range=0.02,
+ scale_attn_weights=True,
+ use_cache=True,
+ bos_token_id=50256,
+ eos_token_id=50256,
+ scale_attn_by_inverse_layer_idx=False,
+ reorder_and_upcast_attn=False,
+ **kwargs,
+ ):
+ self.state_dim = state_dim
+ self.act_dim = act_dim
+ self.hidden_size = hidden_size
+ self.max_ep_len = max_ep_len
+ self.action_tanh = action_tanh
+ self.vocab_size = vocab_size
+ self.n_positions = n_positions
+ self.n_layer = n_layer
+ self.n_head = n_head
+ self.n_inner = n_inner
+ self.activation_function = activation_function
+ self.resid_pdrop = resid_pdrop
+ self.embd_pdrop = embd_pdrop
+ self.attn_pdrop = attn_pdrop
+ self.layer_norm_epsilon = layer_norm_epsilon
+ self.initializer_range = initializer_range
+ self.scale_attn_weights = scale_attn_weights
+ self.use_cache = use_cache
+ self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+ self.reorder_and_upcast_attn = reorder_and_upcast_attn
+
+ self.bos_token_id = bos_token_id
+ self.eos_token_id = eos_token_id
+
+ super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+
+__all__ = ["DecisionTransformerConfig"]
diff --git a/docs/transformers/build/lib/transformers/models/decision_transformer/modeling_decision_transformer.py b/docs/transformers/build/lib/transformers/models/decision_transformer/modeling_decision_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..54000b8f2438490f26aa6faf3a6ab006bd77d6a9
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/decision_transformer/modeling_decision_transformer.py
@@ -0,0 +1,988 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DecisionTransformer model."""
+
+import math
+import os
+from dataclasses import dataclass
+from typing import Callable, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
+from ...utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_decision_transformer import DecisionTransformerConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "edbeeching/decision-transformer-gym-hopper-medium"
+_CONFIG_FOR_DOC = "DecisionTransformerConfig"
+
+
+# Copied from transformers.models.gpt2.modeling_gpt2.load_tf_weights_in_gpt2
+def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
+ """Load tf checkpoints in a pytorch model"""
+ try:
+ import re
+
+ import tensorflow as tf
+ except ImportError:
+ logger.error(
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+ "https://www.tensorflow.org/install/ for installation instructions."
+ )
+ raise
+ tf_path = os.path.abspath(gpt2_checkpoint_path)
+ logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+ # Load weights from TF model
+ init_vars = tf.train.list_variables(tf_path)
+ names = []
+ arrays = []
+ for name, shape in init_vars:
+ logger.info(f"Loading TF weight {name} with shape {shape}")
+ array = tf.train.load_variable(tf_path, name)
+ names.append(name)
+ arrays.append(array.squeeze())
+
+ for name, array in zip(names, arrays):
+ name = name[6:] # skip "model/"
+ name = name.split("/")
+ pointer = model
+ for m_name in name:
+ if re.fullmatch(r"[A-Za-z]+\d+", m_name):
+ scope_names = re.split(r"(\d+)", m_name)
+ else:
+ scope_names = [m_name]
+ if scope_names[0] == "w" or scope_names[0] == "g":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "b":
+ pointer = getattr(pointer, "bias")
+ elif scope_names[0] == "wpe" or scope_names[0] == "wte":
+ pointer = getattr(pointer, scope_names[0])
+ pointer = getattr(pointer, "weight")
+ else:
+ pointer = getattr(pointer, scope_names[0])
+ if len(scope_names) >= 2:
+ num = int(scope_names[1])
+ pointer = pointer[num]
+ try:
+ if pointer.shape != array.shape:
+ raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+ except ValueError as e:
+ e.args += (pointer.shape, array.shape)
+ raise
+ logger.info(f"Initialize PyTorch weight {name}")
+ pointer.data = torch.from_numpy(array)
+ return model
+
+
+# Copied from transformers.models.gpt2.modeling_gpt2.eager_attention_forward
+def eager_attention_forward(module, query, key, value, attention_mask, head_mask=None, **kwargs):
+ attn_weights = torch.matmul(query, key.transpose(-1, -2))
+
+ if module.scale_attn_weights:
+ attn_weights = attn_weights / torch.full(
+ [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
+ )
+
+ # Layer-wise attention scaling
+ if module.scale_attn_by_inverse_layer_idx:
+ attn_weights = attn_weights / float(module.layer_idx + 1)
+
+ if not module.is_cross_attention:
+ # if only "normal" attention layer implements causal mask
+ query_length, key_length = query.size(-2), key.size(-2)
+ causal_mask = module.bias[:, :, key_length - query_length : key_length, :key_length]
+ mask_value = torch.finfo(attn_weights.dtype).min
+ # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+ mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
+ attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)
+
+ if attention_mask is not None:
+ # Apply the attention mask
+ causal_mask = attention_mask[:, :, :, : key.shape[-2]]
+ attn_weights = attn_weights + causal_mask
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+ # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
+ attn_weights = attn_weights.type(value.dtype)
+ attn_weights = module.attn_dropout(attn_weights)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attn_weights = attn_weights * head_mask
+
+ attn_output = torch.matmul(attn_weights, value)
+ attn_output = attn_output.transpose(1, 2)
+
+ return attn_output, attn_weights
+
+
+# Copied from transformers.models.gpt2.modeling_gpt2.GPT2Attention with GPT2->DecisionTransformerGPT2
+class DecisionTransformerGPT2Attention(nn.Module):
+ def __init__(self, config, is_cross_attention=False, layer_idx=None):
+ super().__init__()
+ self.config = config
+ max_positions = config.max_position_embeddings
+ self.register_buffer(
+ "bias",
+ torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
+ 1, 1, max_positions, max_positions
+ ),
+ persistent=False,
+ )
+ self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
+
+ self.embed_dim = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.embed_dim // self.num_heads
+ self.split_size = self.embed_dim
+ if self.head_dim * self.num_heads != self.embed_dim:
+ raise ValueError(
+ f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+ f" {self.num_heads})."
+ )
+
+ self.scale_attn_weights = config.scale_attn_weights
+ self.is_cross_attention = is_cross_attention
+
+ # Layer-wise attention scaling, reordering, and upcasting
+ self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
+ self.layer_idx = layer_idx
+ self.reorder_and_upcast_attn = config.reorder_and_upcast_attn
+
+ if self.is_cross_attention:
+ self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
+ self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
+ else:
+ self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
+ self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
+
+ self.attn_dropout = nn.Dropout(config.attn_pdrop)
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
+ self.is_causal = True
+
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
+ index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
+
+ # Prune conv1d layers
+ self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
+ self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
+
+ # Update hyper params
+ self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
+ self.num_heads = self.num_heads - len(heads)
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
+ # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM)
+ bsz, num_heads, q_seq_len, dk = query.size()
+ _, _, k_seq_len, _ = key.size()
+
+ # Preallocate attn_weights for `baddbmm`
+ attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)
+
+ # Compute Scale Factor
+ scale_factor = 1.0
+ if self.scale_attn_weights:
+ scale_factor /= float(value.size(-1)) ** 0.5
+
+ if self.scale_attn_by_inverse_layer_idx:
+ scale_factor /= float(self.layer_idx + 1)
+
+ # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
+ with torch.amp.autocast(query.device.type, enabled=False):
+ q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
+ attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
+ attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
+
+ if not self.is_cross_attention:
+ # if only "normal" attention layer implements causal mask
+ query_length, key_length = query.size(-2), key.size(-2)
+ causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
+ mask_value = torch.finfo(attn_weights.dtype).min
+ # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+ mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
+ attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+
+ if attention_mask is not None:
+ # Apply the attention mask
+ attn_weights = attn_weights + attention_mask
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+ # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
+ if attn_weights.dtype != torch.float32:
+ raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
+ attn_weights = attn_weights.type(value.dtype)
+ attn_weights = self.attn_dropout(attn_weights)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attn_weights = attn_weights * head_mask
+
+ attn_output = torch.matmul(attn_weights, value)
+ attn_output = attn_output.transpose(1, 2)
+
+ return attn_output, attn_weights
+
+ @deprecate_kwarg("layer_past", new_name="past_key_value", version="4.53.0", raise_if_both_names=True)
+ def forward(
+ self,
+ hidden_states: Optional[Tuple[torch.FloatTensor]],
+ past_key_value: Optional[Cache] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = False,
+ **kwargs,
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
+ is_cross_attention = encoder_hidden_states is not None
+ if is_cross_attention:
+ if not hasattr(self, "q_attn"):
+ raise ValueError(
+ "If class is used as cross attention, the weights `q_attn` have to be defined. "
+ "Please make sure to instantiate class with `DecisionTransformerGPT2Attention(..., is_cross_attention=True)`."
+ )
+
+ query_states = self.q_attn(hidden_states)
+ key_states, value_states = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
+ attention_mask = encoder_attention_mask
+ else:
+ query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
+
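+ # split the hidden dimension into heads: (batch, seq_len, embed_dim) -> (batch, num_heads, seq_len, head_dim)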
+ shape_q = (*query_states.shape[:-1], -1, self.head_dim)
+ shape_kv = (*key_states.shape[:-1], -1, self.head_dim)
+
+ query_states = query_states.view(shape_q).transpose(1, 2)
+ key_states = key_states.view(shape_kv).transpose(1, 2)
+ value_states = value_states.view(shape_kv).transpose(1, 2)
+
+ if past_key_value is not None:
+ if isinstance(past_key_value, EncoderDecoderCache):
+ if is_cross_attention:
+ past_key_value = past_key_value.cross_attention_cache
+ else:
+ past_key_value = past_key_value.self_attention_cache
+ cache_kwargs = {"cache_position": cache_position}
+ key_states, value_states = past_key_value.update(
+ key_states, value_states, self.layer_idx, cache_kwargs=cache_kwargs
+ )
+
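+ # only request an implicit causal mask from the attention backend when no explicit mask is given,
+ # the query spans more than one position, and this is self-attention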
+ is_causal = attention_mask is None and query_states.shape[-2] > 1 and not is_cross_attention
+
+ using_eager = self.config._attn_implementation == "eager"
+ attention_interface: Callable = eager_attention_forward
+ if self.config._attn_implementation != "eager":
+ if self.config._attn_implementation == "sdpa" and (output_attentions or head_mask is not None):
+ using_eager = True
+ logger.warning_once(
+ "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+ 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ else:
+ # Attention functions are consistent with previous equivalent attention classes, however they do not support some options
+ # (e.g. layer scaling, head mask) that eager supports. These implementations are thus equivalent to previous code, but
+ # not necessarily to eager (if mentioned options are provided).
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+ if using_eager and self.reorder_and_upcast_attn:
+ attn_output, attn_weights = self._upcast_and_reordered_attn(
+ query_states, key_states, value_states, attention_mask, head_mask
+ )
+ else:
+ attn_output, attn_weights = attention_interface(
+ self,
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ head_mask=head_mask,
+ dropout=self.attn_dropout.p if self.training else 0.0,
+ is_causal=is_causal,
+ **kwargs,
+ )
+
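+ # merge the attention heads back together: (batch, seq_len, num_heads, head_dim) -> (batch, seq_len, embed_dim)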
+ attn_output = attn_output.reshape(*attn_output.shape[:-2], -1).contiguous()
+ attn_output = self.c_proj(attn_output)
+ attn_output = self.resid_dropout(attn_output)
+
+ return attn_output, attn_weights
+
+
+# Copied from transformers.models.gpt2.modeling_gpt2.GPT2MLP with GPT2->DecisionTransformerGPT2
+class DecisionTransformerGPT2MLP(nn.Module):
+ def __init__(self, intermediate_size, config):
+ super().__init__()
+ embed_dim = config.hidden_size
+ self.c_fc = Conv1D(intermediate_size, embed_dim)
+ self.c_proj = Conv1D(embed_dim, intermediate_size)
+ self.act = ACT2FN[config.activation_function]
+ self.dropout = nn.Dropout(config.resid_pdrop)
+
+ def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+ hidden_states = self.c_fc(hidden_states)
+ hidden_states = self.act(hidden_states)
+ hidden_states = self.c_proj(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ return hidden_states
+
+
+# Copied from transformers.models.gpt2.modeling_gpt2.GPT2Block with GPT2->DecisionTransformerGPT2
+class DecisionTransformerGPT2Block(nn.Module):
+ # Ignore copy
+ def __init__(self, config, layer_idx=None):
+ super().__init__()
+ hidden_size = config.hidden_size
+ inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+
+ self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+ self.attn = DecisionTransformerGPT2Attention(config, layer_idx=layer_idx)
+ self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
+ if config.add_cross_attention:
+ self.crossattention = DecisionTransformerGPT2Attention(
+ config, is_cross_attention=True, layer_idx=layer_idx
+ )
+ self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
+ self.mlp = DecisionTransformerGPT2MLP(inner_dim, config)
+
+ @deprecate_kwarg("layer_past", new_name="past_key_value", version="4.53.0", raise_if_both_names=True)
+ def forward(
+ self,
+ hidden_states: Optional[Tuple[torch.FloatTensor]],
+ past_key_value: Optional[Cache] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = False,
+ output_attentions: Optional[bool] = False,
+ ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
+ residual = hidden_states
+ hidden_states = self.ln_1(hidden_states)
+ attn_output, self_attn_weights = self.attn(
+ hidden_states,
+ past_key_value=past_key_value,
+ cache_position=cache_position,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ )
+ # residual connection
+ hidden_states = attn_output + residual
+
+ if encoder_hidden_states is not None:
+ # add one self-attention block for cross-attention
+ if not hasattr(self, "crossattention"):
+ raise ValueError(
+ f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
+ "cross-attention layers by setting `config.add_cross_attention=True`"
+ )
+ residual = hidden_states
+ hidden_states = self.ln_cross_attn(hidden_states)
+ cross_attn_output, cross_attn_weights = self.crossattention(
+ hidden_states,
+ past_key_value=past_key_value,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ )
+ # residual connection
+ hidden_states = residual + cross_attn_output
+
+ residual = hidden_states
+ hidden_states = self.ln_2(hidden_states)
+ feed_forward_hidden_states = self.mlp(hidden_states)
+ # residual connection
+ hidden_states = residual + feed_forward_hidden_states
+
+ outputs = (hidden_states,)
+ if output_attentions:
+ outputs += (self_attn_weights,)
+ if encoder_hidden_states is not None:
+ outputs += (cross_attn_weights,)
+
+ return outputs
+
+
+class DecisionTransformerGPT2PreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = DecisionTransformerConfig
+ load_tf_weights = load_tf_weights_in_gpt2
+ base_model_prefix = "transformer"
+ is_parallelizable = True
+ supports_gradient_checkpointing = True
+ _supports_cache_class = True
+ _supports_static_cache = False
+
+ def __init__(self, *inputs, **kwargs):
+ super().__init__(*inputs, **kwargs)
+
+ def _init_weights(self, module):
+ """Initialize the weights."""
+ if isinstance(module, (nn.Linear, Conv1D)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+ # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+ # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+ # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
+ #
+ # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+ for name, p in module.named_parameters():
+ if "c_proj" in name and "weight" in name:
+ # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+ p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
+
+
+class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.embed_dim = config.hidden_size
+
+ self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+ self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+
+ self.drop = nn.Dropout(config.embd_pdrop)
+ self.h = nn.ModuleList(
+ [DecisionTransformerGPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
+ )
+ self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+ # Model parallel
+ self.model_parallel = False
+ self.device_map = None
+ self.gradient_checkpointing = False
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.wte
+
+ def set_input_embeddings(self, new_embeddings):
+ self.wte = new_embeddings
+
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+ batch_size = input_ids.shape[0]
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ batch_size = inputs_embeds.shape[0]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+ if token_type_ids is not None:
+ token_type_ids = token_type_ids.view(-1, input_shape[-1])
+
+ # based on pattern from src/transformers/models/whisper/modeling_whisper.py::WhisperDecoder and similar addition in GPT2Model
+ return_legacy_cache = False
+ if use_cache:
+ if past_key_values is None:
+ return_legacy_cache = True
+ past_key_values = DynamicCache()
+ elif not isinstance(past_key_values, Cache):
+ return_legacy_cache = True
+ logger.warning_once(
+ "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.53.0. "
+ "You should pass an instance of `Cache` instead, e.g. "
+ "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`."
+ )
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+
+ if self.config.add_cross_attention and not isinstance(past_key_values, EncoderDecoderCache):
+ past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())
+
+ if cache_position is None:
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+ cache_position = torch.arange(
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+ )
+ if position_ids is None:
+ position_ids = cache_position.unsqueeze(0)
+
+ # Attention mask.
+ if attention_mask is not None:
+ if batch_size <= 0:
+ raise ValueError("batch_size has to be defined and > 0")
+ attention_mask = attention_mask.view(batch_size, -1)
+ # We create a 3D attention mask from a 2D tensor mask.
+ # Sizes are [batch_size, 1, 1, to_seq_length]
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+ # this attention mask is more simple than the triangular masking of causal attention
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+ attention_mask = attention_mask[:, None, None, :]
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and the dtype's smallest value for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility
+ attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+
+ # If a 2D or 3D attention mask is provided for the cross-attention
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if self.config.add_cross_attention and encoder_hidden_states is not None:
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+ if encoder_attention_mask is None:
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+ encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+ else:
+ encoder_attention_mask = None
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # head_mask has shape n_layer x batch x n_heads x N x N
+ head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.wte(input_ids)
+ position_embeds = self.wpe(position_ids)
+ hidden_states = inputs_embeds + position_embeds
+
+ if token_type_ids is not None:
+ token_type_embeds = self.wte(token_type_ids)
+ hidden_states = hidden_states + token_type_embeds
+
+ hidden_states = self.drop(hidden_states)
+
+ output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ all_self_attentions = () if output_attentions else None
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+ all_hidden_states = () if output_hidden_states else None
+ for i, block in enumerate(self.h):
+ # Model parallel
+ if self.model_parallel:
+ torch.cuda.set_device(hidden_states.device)
+ # Ensure that attention_mask is always on the same device as hidden_states
+ if attention_mask is not None:
+ attention_mask = attention_mask.to(hidden_states.device)
+ if isinstance(head_mask, torch.Tensor):
+ head_mask = head_mask.to(hidden_states.device)
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+ outputs = self._gradient_checkpointing_func(
+ block.__call__,
+ hidden_states,
+ None,
+ None,
+ attention_mask,
+ head_mask[i],
+ encoder_hidden_states,
+ encoder_attention_mask,
+ use_cache,
+ output_attentions,
+ )
+ else:
+ outputs = block(
+ hidden_states,
+ past_key_value=past_key_values,
+ cache_position=cache_position,
+ attention_mask=attention_mask,
+ head_mask=head_mask[i],
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = outputs[0]
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (outputs[1],)
+ if self.config.add_cross_attention:
+ all_cross_attentions = all_cross_attentions + (outputs[2],)
+
+ # Model Parallel: If it's the last layer for that device, put things on the next device
+ if self.model_parallel:
+ for k, v in self.device_map.items():
+ if i == v[-1] and "cuda:" + str(k) != self.last_device:
+ hidden_states = hidden_states.to("cuda:" + str(k + 1))
+
+ hidden_states = self.ln_f(hidden_states)
+
+ hidden_states = hidden_states.view(output_shape)
+ # Add last hidden state
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ past_key_values = past_key_values if use_cache else None
+ if return_legacy_cache:
+ past_key_values = (
+ past_key_values.self_attention_cache.to_legacy_cache()
+ if self.config.add_cross_attention
+ else past_key_values.to_legacy_cache()
+ )
+ if not return_dict:
+ return tuple(
+ v
+ for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions]
+ if v is not None
+ )
+
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=past_key_values,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+@dataclass
+class DecisionTransformerOutput(ModelOutput):
+ """
+ Output class for DecisionTransformerModel, containing the last hidden state together with the state, action, and return predictions.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ state_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, state_dim)`):
+ Environment state predictions
+ action_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, action_dim)`):
+ Model action predictions
+ return_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, 1)`):
+ Predicted returns for each state
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ state_preds: Optional[torch.FloatTensor] = None
+ action_preds: Optional[torch.FloatTensor] = None
+ return_preds: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[torch.FloatTensor] = None
+ attentions: Optional[torch.FloatTensor] = None
+ last_hidden_state: Optional[torch.FloatTensor] = None
+
+
+class DecisionTransformerPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = DecisionTransformerConfig
+ base_model_prefix = "decision_transformer"
+ main_input_name = "states"
+ supports_gradient_checkpointing = False
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, nn.Linear):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+DECISION_TRANSFORMER_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`~DecisionTransformerConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DECISION_TRANSFORMER_INPUTS_DOCSTRING = r"""
+ Args:
+ states (`torch.FloatTensor` of shape `(batch_size, episode_length, state_dim)`):
+ The states for each step in the trajectory
+ actions (`torch.FloatTensor` of shape `(batch_size, episode_length, act_dim)`):
+ The actions taken by the "expert" policy for the current state; these are masked for autoregressive
+ prediction
+ rewards (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`):
+ The rewards for each state-action pair
+ returns_to_go (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`):
+ The returns for each state in the trajectory
+ timesteps (`torch.LongTensor` of shape `(batch_size, episode_length)`):
+ The timestep for each step in the trajectory
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, episode_length)`):
+ Masking, used to mask the actions when performing autoregressive prediction
+"""
+
+
+@add_start_docstrings("The Decision Transformer Model", DECISION_TRANSFORMER_START_DOCSTRING)
+class DecisionTransformerModel(DecisionTransformerPreTrainedModel):
+ """
+
+ The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL
+ setting. Refer to the paper for more details: https://arxiv.org/abs/2106.01345
+
+ """
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+ self.hidden_size = config.hidden_size
+ # note: the only difference between this GPT2Model and the default Huggingface version
+ # is that the positional embeddings are removed (since we'll add those ourselves)
+ self.encoder = DecisionTransformerGPT2Model(config)
+
+ self.embed_timestep = nn.Embedding(config.max_ep_len, config.hidden_size)
+ self.embed_return = torch.nn.Linear(1, config.hidden_size)
+ self.embed_state = torch.nn.Linear(config.state_dim, config.hidden_size)
+ self.embed_action = torch.nn.Linear(config.act_dim, config.hidden_size)
+
+ self.embed_ln = nn.LayerNorm(config.hidden_size)
+
+ # note: we don't predict states or returns for the paper
+ self.predict_state = torch.nn.Linear(config.hidden_size, config.state_dim)
+ self.predict_action = nn.Sequential(
+ *([nn.Linear(config.hidden_size, config.act_dim)] + ([nn.Tanh()] if config.action_tanh else []))
+ )
+ self.predict_return = torch.nn.Linear(config.hidden_size, 1)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DECISION_TRANSFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @replace_return_docstrings(output_type=DecisionTransformerOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ states: Optional[torch.FloatTensor] = None,
+ actions: Optional[torch.FloatTensor] = None,
+ rewards: Optional[torch.FloatTensor] = None,
+ returns_to_go: Optional[torch.FloatTensor] = None,
+ timesteps: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.FloatTensor], DecisionTransformerOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> import gym
+ >>> import torch
+ >>> from transformers import DecisionTransformerModel
+
+ >>> model = DecisionTransformerModel.from_pretrained("edbeeching/decision-transformer-gym-hopper-medium")
+ >>> # evaluation
+ >>> device = "cuda" if torch.cuda.is_available() else "cpu"
+ >>> model = model.to(device)
+ >>> model.eval()
+
+ >>> env = gym.make("Hopper-v3")
+ >>> state_dim = env.observation_space.shape[0]
+ >>> act_dim = env.action_space.shape[0]
+
+ >>> state = env.reset()
+ >>> states = torch.from_numpy(state).reshape(1, 1, state_dim).to(device=device, dtype=torch.float32)
+ >>> actions = torch.zeros((1, 1, act_dim), device=device, dtype=torch.float32)
+ >>> rewards = torch.zeros(1, 1, device=device, dtype=torch.float32)
+ >>> TARGET_RETURN = 3.6  # desired (scaled) episode return; the exact value here is only illustrative
+ >>> target_return = torch.tensor(TARGET_RETURN, device=device, dtype=torch.float32).reshape(1, 1)
+ >>> timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1)
+ >>> attention_mask = torch.zeros(1, 1, device=device, dtype=torch.float32)
+
+ >>> # forward pass
+ >>> with torch.no_grad():
+ ... state_preds, action_preds, return_preds = model(
+ ... states=states,
+ ... actions=actions,
+ ... rewards=rewards,
+ ... returns_to_go=target_return,
+ ... timesteps=timesteps,
+ ... attention_mask=attention_mask,
+ ... return_dict=False,
+ ... )
+ ```"""
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ batch_size, seq_length = states.shape[0], states.shape[1]
+
+ if attention_mask is None:
+ # attention mask for GPT: 1 if can be attended to, 0 if not
+ attention_mask = torch.ones((batch_size, seq_length), dtype=torch.long)
+
+ # embed each modality with a different head
+ state_embeddings = self.embed_state(states)
+ action_embeddings = self.embed_action(actions)
+ returns_embeddings = self.embed_return(returns_to_go)
+ time_embeddings = self.embed_timestep(timesteps)
+
+ # time embeddings are treated similar to positional embeddings
+ state_embeddings = state_embeddings + time_embeddings
+ action_embeddings = action_embeddings + time_embeddings
+ returns_embeddings = returns_embeddings + time_embeddings
+
+ # this makes the sequence look like (R_1, s_1, a_1, R_2, s_2, a_2, ...)
+ # which works nicely in an autoregressive sense since states predict actions
+ stacked_inputs = (
+ torch.stack((returns_embeddings, state_embeddings, action_embeddings), dim=1)
+ .permute(0, 2, 1, 3)
+ .reshape(batch_size, 3 * seq_length, self.hidden_size)
+ )
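+ # stacked_inputs has shape (batch_size, 3 * seq_length, hidden_size), ordered (R_t, s_t, a_t) per timestep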
+ stacked_inputs = self.embed_ln(stacked_inputs)
+
+ # to make the attention mask fit the stacked inputs, have to stack it as well
+ stacked_attention_mask = (
+ torch.stack((attention_mask, attention_mask, attention_mask), dim=1)
+ .permute(0, 2, 1)
+ .reshape(batch_size, 3 * seq_length)
+ )
+ device = stacked_inputs.device
+ # we feed in the input embeddings (not word indices as in NLP) to the model
+ encoder_outputs = self.encoder(
+ inputs_embeds=stacked_inputs,
+ attention_mask=stacked_attention_mask,
+ position_ids=torch.zeros(stacked_attention_mask.shape, device=device, dtype=torch.long),
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ x = encoder_outputs[0]
+
+ # reshape x so that the second dimension corresponds to the original
+ # returns (0), states (1), or actions (2); i.e. x[:,1,t] is the token for s_t
+ x = x.reshape(batch_size, seq_length, 3, self.hidden_size).permute(0, 2, 1, 3)
+
+ # get predictions
+ return_preds = self.predict_return(x[:, 2]) # predict next return given state and action
+ state_preds = self.predict_state(x[:, 2]) # predict next state given state and action
+ action_preds = self.predict_action(x[:, 1]) # predict next action given state
+ if not return_dict:
+ return (state_preds, action_preds, return_preds)
+
+ return DecisionTransformerOutput(
+ last_hidden_state=encoder_outputs.last_hidden_state,
+ state_preds=state_preds,
+ action_preds=action_preds,
+ return_preds=return_preds,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+__all__ = [
+ "DecisionTransformerGPT2Model",
+ "DecisionTransformerGPT2PreTrainedModel",
+ "DecisionTransformerModel",
+ "DecisionTransformerPreTrainedModel",
+]
diff --git a/docs/transformers/build/lib/transformers/models/deepseek_v3/__init__.py b/docs/transformers/build/lib/transformers/models/deepseek_v3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..298f4c968375e6d90728a3d4e78c5046e4591c01
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deepseek_v3/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_deepseek_v3 import *
+ from .modeling_deepseek_v3 import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/docs/transformers/build/lib/transformers/models/deepseek_v3/configuration_deepseek_v3.py
new file mode 100644
index 0000000000000000000000000000000000000000..82b8701cb577975c4fba982f60a0f92ba7e66ea1
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deepseek_v3/configuration_deepseek_v3.py
@@ -0,0 +1,253 @@
+# coding=utf-8
+# Copyright 2025 bzantium and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on the DeepSeekV3 implementations from the DeepSeek AI team. (https://huggingface.co/deepseek-ai/DeepSeek-V3)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DeepSeekV3 model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+
+
+DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class DeepseekV3Config(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the DeepSeek-V3.
+ e.g. [bzantium/tiny-deepseek-v3](https://huggingface.co/bzantium/tiny-deepseek-v3)
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 129280):
+ Vocabulary size of the DeepseekV3 model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`DeepseekV3Model`]
+ hidden_size (`int`, *optional*, defaults to 7168):
+ Dimension of the hidden representations.
+ intermediate_size (`int`, *optional*, defaults to 18432):
+ Dimension of the MLP representations.
+ moe_intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimension of the MoE representations.
+ num_hidden_layers (`int`, *optional*, defaults to 61):
+ Number of hidden layers in the Transformer decoder.
+ num_attention_heads (`int`, *optional*, defaults to 128):
+ Number of attention heads for each attention layer in the Transformer decoder.
+ num_key_value_heads (`int`, *optional*, defaults to 128):
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+ `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+ by meanpooling all the original heads within that group. For more details checkout [this
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+ `num_attention_heads`.
+ n_shared_experts (`int`, *optional*, defaults to 1):
+ Number of shared experts.
+ n_routed_experts (`int`, *optional*, defaults to 256):
+ Number of routed experts.
+ routed_scaling_factor (`float`, *optional*, defaults to 2.5):
+ Scaling factor for routed experts.
+ kv_lora_rank (`int`, *optional*, defaults to 512):
+ Rank of the LoRA matrices for key and value projections.
+ q_lora_rank (`int`, *optional*, defaults to 1536):
+ Rank of the LoRA matrices for query projections.
+ qk_rope_head_dim (`int`, *optional*, defaults to 64):
+ Dimension of the query/key heads that use rotary position embeddings.
+ v_head_dim (`int`, *optional*, defaults to 128):
+ Dimension of the value heads.
+ qk_nope_head_dim (`int`, *optional*, defaults to 128):
+ Dimension of the query/key heads that don't use rotary position embeddings.
+ n_group (`int`, *optional*, defaults to 8):
+ Number of groups for routed experts.
+ topk_group (`int`, *optional*, defaults to 4):
+ Number of selected groups for each token (ensuring that the selected experts are only within `topk_group` groups).
+ num_experts_per_tok (`int`, *optional*, defaults to 8):
+ Number of selected experts; `None` means a dense model.
+ first_k_dense_replace (`int`, *optional*, defaults to 3):
+ Number of dense layers in the shallow part of the network (embed -> k dense layers -> moe -> ... -> lm_head); the first `first_k_dense_replace` layers use a dense MLP instead of MoE.
+ norm_topk_prob (`bool`, *optional*, defaults to `True`):
+ Whether to normalize the weights of the routed experts.
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+ The non-linear activation function (function or string) in the decoder.
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
+ The maximum sequence length that this model might ever be used with.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+ The epsilon used by the rms normalization layers.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ pad_token_id (`int`, *optional*):
+ Padding token id.
+ bos_token_id (`int`, *optional*, defaults to 0):
+ Beginning of stream token id.
+ eos_token_id (`int`, *optional*, defaults to 1):
+ End of stream token id.
+ pretraining_tp (`int`, *optional*, defaults to 1):
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+ document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+ necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+ issue](https://github.com/pytorch/pytorch/issues/76232).
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+ Whether to tie weight embeddings
+ rope_theta (`float`, *optional*, defaults to 10000.0):
+ The base period of the RoPE embeddings.
+ rope_scaling (`Dict`, *optional*):
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+ `max_position_embeddings` to the expected new maximum.
+ rope_interleave (`bool`, *optional*, defaults to `True`):
+ Whether to interleave the rotary position embeddings.
+ attention_bias (`bool`, *optional*, defaults to `False`):
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+
+ ```python
+ >>> from transformers import DeepseekV3Model, DeepseekV3Config
+
+ >>> # Initializing a Deepseek-V3 style configuration
+ >>> configuration = DeepseekV3Config()
+
+ >>> # Initializing a model from the configuration
+ >>> model = DeepseekV3Model(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "deepseek_v3"
+ keys_to_ignore_at_inference = ["past_key_values"]
+ base_model_tp_plan = { # TODO: only replicate attention layers when > first_k_dense_replace
+ "layers.*.mlp.experts.*.gate_proj": "local_colwise",
+ "layers.*.mlp.experts.*.up_proj": "local_colwise",
+ "layers.*.mlp.experts.*.down_proj": "local_rowwise",
+ "layers.*.mlp.experts.*": "local", # each expert is wrapped in a module list
+ "layers.*.mlp.shared_experts.gate_proj": "local_colwise",
+ "layers.*.mlp.shared_experts.up_proj": "local_colwise",
+ "layers.*.mlp.shared_experts.down_proj": "local_rowwise",
+ "layers.*.mlp.shared_experts": "local",
+ "layers.*.mlp.gate_proj": "local_colwise",
+ "layers.*.mlp.up_proj": "local_colwise",
+ "layers.*.mlp.down_proj": "local_rowwise",
+ "layers.*.mlp": "gather", # This is the only moment where results are gathered
+ }
+ base_model_pp_plan = {
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+ "norm": (["hidden_states"], ["hidden_states"]),
+ }
+
+ def __init__(
+ self,
+ vocab_size=129280,
+ hidden_size=7168,
+ intermediate_size=18432,
+ moe_intermediate_size=2048,
+ num_hidden_layers=61,
+ num_attention_heads=128,
+ num_key_value_heads=128,
+ n_shared_experts=1,
+ n_routed_experts=256,
+ routed_scaling_factor=2.5,
+ kv_lora_rank=512,
+ q_lora_rank=1536,
+ qk_rope_head_dim=64,
+ v_head_dim=128,
+ qk_nope_head_dim=128,
+ n_group=8,
+ topk_group=4,
+ num_experts_per_tok=8,
+ first_k_dense_replace=3,
+ norm_topk_prob=True,
+ hidden_act="silu",
+ max_position_embeddings=4096,
+ initializer_range=0.02,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=None,
+ bos_token_id=0,
+ eos_token_id=1,
+ pretraining_tp=1,
+ tie_word_embeddings=False,
+ rope_theta=10000.0,
+ rope_scaling=None,
+ rope_interleave=True,
+ attention_bias=False,
+ attention_dropout=0.0,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.moe_intermediate_size = moe_intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.n_shared_experts = n_shared_experts
+ self.n_routed_experts = n_routed_experts
+ self.routed_scaling_factor = routed_scaling_factor
+ self.kv_lora_rank = kv_lora_rank
+ self.q_lora_rank = q_lora_rank
+ self.qk_rope_head_dim = qk_rope_head_dim
+ self.v_head_dim = v_head_dim
+ self.qk_nope_head_dim = qk_nope_head_dim
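+ # total query/key head dim: the non-RoPE part (qk_nope_head_dim) is concatenated with the RoPE part (qk_rope_head_dim)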
+ self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+ self.head_dim = qk_rope_head_dim
+ self.n_group = n_group
+ self.topk_group = topk_group
+ self.num_experts_per_tok = num_experts_per_tok
+ self.first_k_dense_replace = first_k_dense_replace
+ self.norm_topk_prob = norm_topk_prob
+ self.rope_interleave = rope_interleave
+
+ # for backward compatibility
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.pretraining_tp = pretraining_tp
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self.attention_bias = attention_bias
+ self.attention_dropout = attention_dropout
+ # Validate the correctness of rotary position embeddings parameters
+ # BC: if there is a 'type' field, copy it to 'rope_type'.
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+
+ if self.rope_scaling is not None:
+ for key in ["beta_fast", "beta_slow", "factor"]:
+ if key in self.rope_scaling:
+ self.rope_scaling[key] = float(self.rope_scaling[key])
+
+ rope_config_validation(self)
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+
+__all__ = ["DeepseekV3Config"]
diff --git a/docs/transformers/build/lib/transformers/models/deepseek_v3/modeling_deepseek_v3.py b/docs/transformers/build/lib/transformers/models/deepseek_v3/modeling_deepseek_v3.py
new file mode 100644
index 0000000000000000000000000000000000000000..511c401f7e39a1b6ed44106245e79ad52e7cd8ea
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deepseek_v3/modeling_deepseek_v3.py
@@ -0,0 +1,1016 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/deepseek_v3/modular_deepseek_v3.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_deepseek_v3.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import math
+from typing import Callable, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+ LossKwargs,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ can_return_tuple,
+ is_torch_flex_attn_available,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_deepseek_v3 import DeepseekV3Config
+
+
+if is_torch_flex_attn_available():
+ from torch.nn.attention.flex_attention import BlockMask
+
+ from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+_CONFIG_FOR_DOC = "DeepseekV3Config"
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class DeepseekV3RMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ DeepseekV3RMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
+
+ def extra_repr(self):
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class DeepseekV3RotaryEmbedding(nn.Module):
+ def __init__(self, config: DeepseekV3Config, device=None):
+ super().__init__()
+ # BC: "rope_type" was originally "type"
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+ else:
+ self.rope_type = "default"
+ self.max_seq_len_cached = config.max_position_embeddings
+ self.original_max_seq_len = config.max_position_embeddings
+
+ self.config = config
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+ self.original_inv_freq = self.inv_freq
+
+ @torch.no_grad()
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
+ def forward(self, x, position_ids):
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+ position_ids_expanded = position_ids[:, None, :].float()
+
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ cos = emb.cos() * self.attention_scaling
+ sin = emb.sin() * self.attention_scaling
+
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class DeepseekV3MLP(nn.Module):
+ def __init__(self, config, hidden_size=None, intermediate_size=None):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
+ self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size
+
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+ return down_proj
+
+
+class DeepseekV3TopkRouter(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.top_k = config.num_experts_per_tok
+ self.n_routed_experts = config.n_routed_experts
+ self.routed_scaling_factor = config.routed_scaling_factor
+ self.n_group = config.n_group
+ self.topk_group = config.topk_group
+ self.norm_topk_prob = config.norm_topk_prob
+
+ self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size)))
+ self.register_buffer("e_score_correction_bias", torch.zeros((self.n_routed_experts)))
+
+ @torch.no_grad()
+ def get_topk_indices(self, scores):
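+ # Group-limited routing:
+ # 1. add the per-expert score-correction bias,
+ # 2. score each group by the sum of its top-2 expert scores,
+ # 3. keep only the `topk_group` best groups and mask out the experts of all other groups,
+ # 4. pick the `top_k` experts among the remaining scores.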
+ scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0)
+ group_scores = (
+ scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group)
+ .topk(2, dim=-1)[0]
+ .sum(dim=-1)
+ )
+ group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]
+ group_mask = torch.zeros_like(group_scores)
+ group_mask.scatter_(1, group_idx, 1)
+ score_mask = (
+ group_mask.unsqueeze(-1)
+ .expand(-1, self.n_group, self.n_routed_experts // self.n_group)
+ .reshape(-1, self.n_routed_experts)
+ )
+ scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
+ topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1]
+ return topk_indices
+
+ def forward(self, hidden_states):
+ hidden_states = hidden_states.view(-1, self.config.hidden_size)
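+ # routing logits are computed in float32; scores use a sigmoid rather than a softmax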
+ router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32))
+ scores = router_logits.sigmoid()
+ topk_indices = self.get_topk_indices(scores)
+ topk_weights = scores.gather(1, topk_indices)
+ if self.norm_topk_prob:
+ denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20
+ topk_weights /= denominator
+ topk_weights = topk_weights * self.routed_scaling_factor
+ return topk_indices, topk_weights
+
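+
+# Illustrative sketch (hypothetical helper, not part of the model): the router flattens
+# the hidden states to (num_tokens, hidden_size) and returns, for each token, the ids of
+# its top_k selected experts together with their sigmoid gate weights (optionally
+# normalized, then scaled by `routed_scaling_factor`).
+def _router_shapes_example(router: "DeepseekV3TopkRouter", hidden_states: torch.Tensor):
+    topk_indices, topk_weights = router(hidden_states)
+    # topk_indices: (num_tokens, top_k) integer expert ids in [0, n_routed_experts)
+    # topk_weights: (num_tokens, top_k) float routing weights
+    return topk_indices.shape, topk_weights.shape
+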
+
+class DeepseekV3MoE(nn.Module):
+ """
+ A mixed expert module containing shared experts.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.experts = nn.ModuleList(
+ [
+ DeepseekV3MLP(config, intermediate_size=config.moe_intermediate_size)
+ for _ in range(config.n_routed_experts)
+ ]
+ )
+ self.gate = DeepseekV3TopkRouter(config)
+ self.shared_experts = DeepseekV3MLP(
+ config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts
+ )
+
+ def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
+ r"""
+ CALL FOR CONTRIBUTION! The expert weights should be fused so that the per-expert Python loop below can be
+ removed (DeepSeek-V3 has 256 routed experts, so the loop is costly).
+ """
+ final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
+ expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
+ expert_mask = expert_mask.permute(2, 0, 1)
+
+ for expert_idx in range(len(self.experts)):
+ expert = self.experts[expert_idx]
+ mask = expert_mask[expert_idx]
+ token_indices, weight_indices = torch.where(mask)
+
+ if token_indices.numel() > 0:
+ expert_weights = topk_weights[token_indices, weight_indices]
+ expert_input = hidden_states[token_indices]
+ expert_output = expert(expert_input)
+ weighted_output = expert_output * expert_weights.unsqueeze(-1)
+ final_hidden_states.index_add_(0, token_indices, weighted_output)
+
+ # in the original DeepSeek implementation, the outputs of the experts are gathered once we leave this module,
+ # so the MoE module is itself an IsolatedParallel module,
+ # and all experts are "local", meaning we shard but do not gather
+ return final_hidden_states.type(hidden_states.dtype)
+
+ def forward(self, hidden_states):
+ residuals = hidden_states
+ orig_shape = hidden_states.shape
+ topk_indices, topk_weights = self.gate(hidden_states)
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+ hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
+ hidden_states = hidden_states + self.shared_experts(residuals)
+ return hidden_states
+
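+
+# Illustrative sketch (hypothetical helper, not part of the model): the `moe` loop above
+# dispatches each token to its selected experts via a one-hot mask and accumulates the
+# weighted expert outputs with index_add_. This toy version uses identity "experts".
+def _moe_dispatch_example():
+    num_tokens, top_k, num_experts, hidden = 3, 2, 4, 5
+    hidden_states = torch.randn(num_tokens, hidden)
+    topk_indices = torch.randint(0, num_experts, (num_tokens, top_k))
+    topk_weights = torch.full((num_tokens, top_k), 0.5)
+    out = torch.zeros_like(hidden_states)
+    expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=num_experts).permute(2, 0, 1)
+    for expert_idx in range(num_experts):
+        token_indices, weight_indices = torch.where(expert_mask[expert_idx])
+        if token_indices.numel() > 0:
+            weights = topk_weights[token_indices, weight_indices].unsqueeze(-1)
+            out.index_add_(0, token_indices, hidden_states[token_indices] * weights)
+    return out  # each token is the weighted sum of its top_k (identity) expert outputs
+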
+
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
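+
+# Illustrative sketch (hypothetical helper, not part of the model): rotate_half maps
+# [x1, x2, x3, x4] to [-x3, -x4, x1, x2], the half-swap-and-negate that the RoPE
+# rotation formula below relies on.
+def _rotate_half_example():
+    x = torch.tensor([1.0, 2.0, 3.0, 4.0])
+    return rotate_half(x)  # tensor([-3., -4., 1., 2.])
+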
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+ """Applies Rotary Position Embedding to the query and key tensors.
+
+ Args:
+ q (`torch.Tensor`): The query tensor.
+ k (`torch.Tensor`): The key tensor.
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
+ position_ids (`torch.Tensor`, *optional*):
+ Deprecated and unused.
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+ Returns:
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+ """
+ cos = cos.unsqueeze(unsqueeze_dim)
+ sin = sin.unsqueeze(unsqueeze_dim)
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
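+
+# Illustrative sketch (hypothetical helper, not part of the model): repeat_kv expands the
+# grouped key/value heads so they line up with the query heads, matching
+# torch.repeat_interleave along the head dimension.
+def _repeat_kv_example():
+    kv = torch.randn(2, 4, 7, 16)  # (batch, num_key_value_heads, seq_len, head_dim)
+    expanded = repeat_kv(kv, n_rep=3)  # (2, 12, 7, 16)
+    reference = torch.repeat_interleave(kv, repeats=3, dim=1)
+    return torch.equal(expanded, reference)  # True
+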
+
+def eager_attention_forward(
+ module: nn.Module,
+ query: torch.Tensor,
+ key: torch.Tensor,
+ value: torch.Tensor,
+ attention_mask: Optional[torch.Tensor],
+ scaling: float,
+ dropout: float = 0.0,
+ **kwargs,
+):
+ key_states = repeat_kv(key, module.num_key_value_groups)
+ value_states = repeat_kv(value, module.num_key_value_groups)
+
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+ if attention_mask is not None:
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+ attn_weights = attn_weights + causal_mask
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+ attn_output = torch.matmul(attn_weights, value_states)
+ attn_output = attn_output.transpose(1, 2).contiguous()
+
+ return attn_output, attn_weights
+
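+
+# Illustrative sketch (hypothetical helper, not part of the model): eager_attention_forward
+# computes softmax(Q @ K^T * scaling + mask) @ V. With no mask and no dropout it matches
+# torch.nn.functional.scaled_dot_product_attention up to numerical precision (the eager
+# path returns its output transposed to (batch, seq_len, heads, head_dim)).
+def _eager_attention_example():
+    class _Dummy(nn.Module):
+        num_key_value_groups = 1
+
+    q, k, v = (torch.randn(1, 2, 5, 8) for _ in range(3))
+    out, _ = eager_attention_forward(_Dummy(), q, k, v, attention_mask=None, scaling=8 ** -0.5)
+    ref = F.scaled_dot_product_attention(q, k, v, scale=8 ** -0.5)
+    return torch.allclose(out, ref.transpose(1, 2), atol=1e-5)  # True
+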
+
+def apply_rotary_pos_emb_interleave(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+ r"""
+ TODO: use the original freqs_cis computation so the view + transpose + reshape below can be avoided.
+ This path is not optimized!
+ Applies Rotary Position Embedding to the query and key tensors.
+
+ Args:
+ q (`torch.Tensor`): The query tensor.
+ k (`torch.Tensor`): The key tensor.
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
+ position_ids (`torch.Tensor`):
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
+ used to pass offsetted position ids when working with a KV-cache.
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+ Returns:
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+ """
+ cos = cos.unsqueeze(unsqueeze_dim)
+ sin = sin.unsqueeze(unsqueeze_dim)
+
+ b, h, s, d = q.shape
+ q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
+
+ b, h, s, d = k.shape
+ k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
+
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+def yarn_get_mscale(scale=1, mscale=1):
+ if scale <= 1:
+ return 1.0
+ return 0.1 * mscale * math.log(scale) + 1.0
+
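+
+# Illustrative sketch (hypothetical numbers, not part of the model): with a YaRN scaling
+# factor of 40 and mscale_all_dim = 1.0, yarn_get_mscale returns 0.1 * ln(40) + 1.0 ≈ 1.369,
+# and the attention `scaling` set up below is multiplied by mscale ** 2 ≈ 1.874.
+def _yarn_mscale_example():
+    return yarn_get_mscale(scale=40, mscale=1.0)  # ≈ 1.369
+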
+
+class DeepseekV3Attention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: DeepseekV3Config, layer_idx: int):
+ super().__init__()
+ self.config = config
+ self.layer_idx = layer_idx
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+ self.attention_dropout = config.attention_dropout
+ self.num_heads = config.num_attention_heads
+ self.rope_theta = config.rope_theta
+ self.q_lora_rank = config.q_lora_rank
+ self.qk_rope_head_dim = config.qk_rope_head_dim
+ self.kv_lora_rank = config.kv_lora_rank
+ self.v_head_dim = config.v_head_dim
+ self.qk_nope_head_dim = config.qk_nope_head_dim
+ self.qk_head_dim = config.qk_head_dim
+
+ self.is_causal = True
+ self.q_a_proj = nn.Linear(config.hidden_size, config.q_lora_rank, bias=config.attention_bias)
+ self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank)
+ self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False)
+
+ self.kv_a_proj_with_mqa = nn.Linear(
+ config.hidden_size,
+ self.kv_lora_rank + self.qk_rope_head_dim,
+ bias=config.attention_bias,
+ )
+ self.kv_a_layernorm = DeepseekV3RMSNorm(self.kv_lora_rank)
+ self.kv_b_proj = nn.Linear(
+ self.kv_lora_rank,
+ self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+ bias=False,
+ )
+
+ self.o_proj = nn.Linear(
+ self.num_heads * self.v_head_dim,
+ config.hidden_size,
+ bias=config.attention_bias,
+ )
+
+ self.scaling = self.qk_head_dim ** (-0.5)
+ if self.config.rope_scaling is not None:
+ mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
+ scaling_factor = self.config.rope_scaling["factor"]
+ if mscale_all_dim:
+ mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
+ self.scaling = self.scaling * mscale * mscale
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+ attention_mask: Optional[torch.Tensor],
+ past_key_value: Optional[Cache] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs: Unpack[FlashAttentionKwargs],
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ batch_size, seq_length = hidden_states.shape[:-1]
+ query_shape = (batch_size, seq_length, -1, self.qk_head_dim)
+ key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim)
+
+ q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))).view(query_shape).transpose(1, 2)
+ q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+
+ compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
+ k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+
+ k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)).view(key_shape).transpose(1, 2)
+ k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+
+ k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim)
+
+ cos, sin = position_embeddings
+ if self.config.rope_interleave: # support using interleaved weights for efficiency
+ q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, sin)
+ else:
+ q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin)
+ k_rot = k_rot.expand(*k_pass.shape[:-1], -1)
+
+ query_states = torch.cat((q_pass, q_rot), dim=-1)
+ key_states = torch.cat((k_pass, k_rot), dim=-1)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+ value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim])
+
+ attention_interface: Callable = eager_attention_forward
+ if self.config._attn_implementation != "eager":
+ if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
+ logger.warning_once(
+ "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+ 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ else:
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+ attn_output, attn_weights = attention_interface(
+ self,
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ dropout=0.0 if not self.training else self.attention_dropout,
+ scaling=self.scaling,
+ **kwargs,
+ )
+
+ if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+ attn_output = attn_output[:, :, :, : self.v_head_dim]
+
+ attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
+ attn_output = self.o_proj(attn_output)
+ return attn_output, attn_weights
+
+
+class DeepseekV3DecoderLayer(GradientCheckpointingLayer):
+ def __init__(self, config: DeepseekV3Config, layer_idx: int):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+
+ self.self_attn = DeepseekV3Attention(config=config, layer_idx=layer_idx)
+
+ if layer_idx >= config.first_k_dense_replace:
+ self.mlp = DeepseekV3MoE(config)
+ else:
+ self.mlp = DeepseekV3MLP(config)
+
+ self.input_layernorm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
+ **kwargs: Unpack[FlashAttentionKwargs],
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ residual = hidden_states
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ position_embeddings=position_embeddings,
+ **kwargs,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ return outputs
+
+
+DEEPSEEK_V3_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`DeepseekV3Config`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+ "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
+ DEEPSEEK_V3_START_DOCSTRING,
+)
+class DeepseekV3PreTrainedModel(PreTrainedModel):
+ config_class = DeepseekV3Config
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["DeepseekV3DecoderLayer"]
+ _skip_keys_device_placement = ["past_key_values"]
+ _supports_flash_attn_2 = True
+ _supports_sdpa = True
+ _supports_flex_attn = True
+ _supports_cache_class = True
+ _supports_quantized_cache = True
+ _supports_static_cache = True
+ _supports_attention_backend = True
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, DeepseekV3RMSNorm):
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, DeepseekV3TopkRouter):
+ module.weight.data.normal_(mean=0.0, std=std)
+
+
+DEEPSEEK_V3_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)` or `BlockMask`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
+ but you can also pass a `BlockMask` object directly here.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+ `past_key_values`).
+
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`Cache`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+ of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+ the complete sequence length.
+"""
+
+
+@add_start_docstrings(
+ "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
+ DEEPSEEK_V3_START_DOCSTRING,
+)
+class DeepseekV3Model(DeepseekV3PreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV3DecoderLayer`]
+
+ Args:
+ config: DeepseekV3Config
+ """
+
+ _keys_to_ignore_on_load_unexpected = [r"model\.layers\.61.*"]
+
+ def __init__(self, config: DeepseekV3Config):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ self.layers = nn.ModuleList(
+ [DeepseekV3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+ )
+ self.norm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.rotary_emb = DeepseekV3RotaryEmbedding(config=config)
+ self.gradient_checkpointing = False
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ @can_return_tuple
+ @add_start_docstrings_to_model_forward(DEEPSEEK_V3_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
+ ) -> BaseModelOutputWithPast:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ if (input_ids is None) ^ (inputs_embeds is not None):
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+ if self.gradient_checkpointing and self.training and use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+ )
+ use_cache = False
+
+ # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+ if not isinstance(past_key_values, (type(None), Cache)):
+ raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+
+ if use_cache and past_key_values is None:
+ past_key_values = DynamicCache()
+
+ if cache_position is None:
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+ cache_position = torch.arange(
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+ )
+
+ if position_ids is None:
+ position_ids = cache_position.unsqueeze(0)
+
+ causal_mask = self._update_causal_mask(
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+ )
+
+ hidden_states = inputs_embeds
+
+ # create position embeddings to be shared across the decoder layers
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=causal_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_values,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ position_embeddings=position_embeddings,
+ **flash_attn_kwargs,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=past_key_values if use_cache else None,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+ def _update_causal_mask(
+ self,
+ attention_mask: Union[torch.Tensor, "BlockMask"],
+ input_tensor: torch.Tensor,
+ cache_position: torch.Tensor,
+ past_key_values: Cache,
+ output_attentions: bool = False,
+ ):
+ if self.config._attn_implementation == "flash_attention_2":
+ if attention_mask is not None and (attention_mask == 0.0).any():
+ return attention_mask
+ return None
+ if self.config._attn_implementation == "flex_attention":
+ if isinstance(attention_mask, torch.Tensor):
+ attention_mask = make_flex_block_causal_mask(attention_mask)
+ return attention_mask
+
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+ # to infer the attention mask.
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+ using_static_cache = isinstance(past_key_values, StaticCache)
+
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+ if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
+ attention_mask,
+ inputs_embeds=input_tensor,
+ past_key_values_length=past_seen_tokens,
+ is_training=self.training,
+ ):
+ return None
+
+ dtype, device = input_tensor.dtype, input_tensor.device
+ sequence_length = input_tensor.shape[1]
+ if using_static_cache:
+ target_length = past_key_values.get_max_cache_shape()
+ else:
+ target_length = (
+ attention_mask.shape[-1]
+ if isinstance(attention_mask, torch.Tensor)
+ else past_seen_tokens + sequence_length + 1
+ )
+
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask,
+ sequence_length=sequence_length,
+ target_length=target_length,
+ dtype=dtype,
+ device=device,
+ cache_position=cache_position,
+ batch_size=input_tensor.shape[0],
+ )
+
+ if (
+ self.config._attn_implementation == "sdpa"
+ and attention_mask is not None
+ and attention_mask.device.type in ["cuda", "xpu", "npu"]
+ and not output_attentions
+ ):
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+ # Details: https://github.com/pytorch/pytorch/issues/110213
+ min_dtype = torch.finfo(dtype).min
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+ return causal_mask
+
+ @staticmethod
+ def _prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask: torch.Tensor,
+ sequence_length: int,
+ target_length: int,
+ dtype: torch.dtype,
+ device: torch.device,
+ cache_position: torch.Tensor,
+ batch_size: int,
+ **kwargs,
+ ):
+ """
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+ Args:
+ attention_mask (`torch.Tensor`):
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+ `(batch_size, 1, query_length, key_value_length)`.
+ sequence_length (`int`):
+ The sequence length being processed.
+ target_length (`int`):
+ The target length: when generating with static cache, the mask should be as long as the static cache,
+ to account for the zero padding, i.e. the part of the cache that is not yet filled.
+ dtype (`torch.dtype`):
+ The dtype to use for the 4D attention mask.
+ device (`torch.device`):
+ The device to place the 4D attention mask on.
+ cache_position (`torch.Tensor`):
+ Indices depicting the position of the input sequence tokens in the sequence.
+ batch_size (`int`):
+ Batch size.
+ """
+ if attention_mask is not None and attention_mask.dim() == 4:
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+ causal_mask = attention_mask
+ else:
+ min_dtype = torch.finfo(dtype).min
+ causal_mask = torch.full(
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+ )
+ if sequence_length != 1:
+ causal_mask = torch.triu(causal_mask, diagonal=1)
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+ if attention_mask is not None:
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
+ mask_length = attention_mask.shape[-1]
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+ causal_mask.device
+ )
+ padding_mask = padding_mask == 0
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+ padding_mask, min_dtype
+ )
+
+ return causal_mask
+
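+
+# Illustrative sketch (hypothetical helper, not part of the model): building the additive
+# 4D mask for a batch of one, three new tokens, no padding and no cache. Allowed positions
+# hold 0.0 and masked (future) positions hold the most negative finite value of the dtype.
+def _causal_mask_example():
+    mask = DeepseekV3Model._prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask=torch.ones(1, 3),
+        sequence_length=3,
+        target_length=3,
+        dtype=torch.float32,
+        device=torch.device("cpu"),
+        cache_position=torch.arange(3),
+        batch_size=1,
+    )
+    return mask.shape  # torch.Size([1, 1, 3, 3]); the strict upper triangle is masked
+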
+
+class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
+
+
+class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel, GenerationMixin):
+ _tied_weights_keys = ["lm_head.weight"]
+ _tp_plan = {"lm_head": "colwise_rep"}
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = DeepseekV3Model(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ @can_return_tuple
+ @add_start_docstrings_to_model_forward(DEEPSEEK_V3_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ logits_to_keep: Union[int, torch.Tensor] = 0,
+ **kwargs: Unpack[KwargsForCausalLM],
+ ) -> CausalLMOutputWithPast:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
+ If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+ If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+ This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, DeepseekV3ForCausalLM
+
+ >>> model = DeepseekV3ForCausalLM.from_pretrained("meta-deepseek_v3/DeepseekV3-2-7b-hf")
+ >>> tokenizer = AutoTokenizer.from_pretrained("meta-deepseek_v3/DeepseekV3-2-7b-hf")
+
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs: BaseModelOutputWithPast = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ cache_position=cache_position,
+ **kwargs,
+ )
+
+ hidden_states = outputs.last_hidden_state
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+ loss = None
+ if labels is not None:
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = ["DeepseekV3PreTrainedModel", "DeepseekV3Model", "DeepseekV3ForCausalLM"]
diff --git a/docs/transformers/build/lib/transformers/models/deepseek_v3/modular_deepseek_v3.py b/docs/transformers/build/lib/transformers/models/deepseek_v3/modular_deepseek_v3.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4905c62011adc387eb03e890cb0a3378bc445c0
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deepseek_v3/modular_deepseek_v3.py
@@ -0,0 +1,368 @@
+import math
+from typing import Callable, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import logging
+from ..llama.modeling_llama import (
+ LlamaDecoderLayer,
+ LlamaForCausalLM,
+ LlamaModel,
+ LlamaPreTrainedModel,
+ LlamaRMSNorm,
+ LlamaRotaryEmbedding,
+ apply_rotary_pos_emb,
+ eager_attention_forward,
+ rotate_half,
+)
+from .configuration_deepseek_v3 import DeepseekV3Config
+
+
+logger = logging.get_logger(__name__)
+
+
+class DeepseekV3RMSNorm(LlamaRMSNorm):
+ pass
+
+
+class DeepseekV3RotaryEmbedding(LlamaRotaryEmbedding):
+ pass
+
+
+def apply_rotary_pos_emb_interleave(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+ r"""
+ TODO: use the original freqs_cis computation so the view + transpose + reshape below can be avoided.
+ This path is not optimized!
+ Applies Rotary Position Embedding to the query and key tensors.
+
+ Args:
+ q (`torch.Tensor`): The query tensor.
+ k (`torch.Tensor`): The key tensor.
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
+ position_ids (`torch.Tensor`):
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
+ used to pass offsetted position ids when working with a KV-cache.
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+ Returns:
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+ """
+ cos = cos.unsqueeze(unsqueeze_dim)
+ sin = sin.unsqueeze(unsqueeze_dim)
+
+ b, h, s, d = q.shape
+ q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
+
+ b, h, s, d = k.shape
+ k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
+
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+def yarn_get_mscale(scale=1, mscale=1):
+ if scale <= 1:
+ return 1.0
+ return 0.1 * mscale * math.log(scale) + 1.0
+
+
+class DeepseekV3MLP(nn.Module):
+ def __init__(self, config, hidden_size=None, intermediate_size=None):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
+ self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size
+
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+ return down_proj
+
+
+class DeepseekV3TopkRouter(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.top_k = config.num_experts_per_tok
+ self.n_routed_experts = config.n_routed_experts
+ self.routed_scaling_factor = config.routed_scaling_factor
+ self.n_group = config.n_group
+ self.topk_group = config.topk_group
+ self.norm_topk_prob = config.norm_topk_prob
+
+ self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size)))
+ self.register_buffer("e_score_correction_bias", torch.zeros((self.n_routed_experts)))
+
+ @torch.no_grad()
+ def get_topk_indices(self, scores):
+ scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0)
+ group_scores = (
+ scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group)
+ .topk(2, dim=-1)[0]
+ .sum(dim=-1)
+ )
+ group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]
+ group_mask = torch.zeros_like(group_scores)
+ group_mask.scatter_(1, group_idx, 1)
+ score_mask = (
+ group_mask.unsqueeze(-1)
+ .expand(-1, self.n_group, self.n_routed_experts // self.n_group)
+ .reshape(-1, self.n_routed_experts)
+ )
+ scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
+ topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1]
+ return topk_indices
+
+ def forward(self, hidden_states):
+ hidden_states = hidden_states.view(-1, self.config.hidden_size)
+ router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32))
+ scores = router_logits.sigmoid()
+ topk_indices = self.get_topk_indices(scores)
+ topk_weights = scores.gather(1, topk_indices)
+ if self.norm_topk_prob:
+ denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20
+ topk_weights /= denominator
+ topk_weights = topk_weights * self.routed_scaling_factor
+ return topk_indices, topk_weights
+
+
+class DeepseekV3MoE(nn.Module):
+ """
+ A mixed expert module containing shared experts.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.experts = nn.ModuleList(
+ [
+ DeepseekV3MLP(config, intermediate_size=config.moe_intermediate_size)
+ for _ in range(config.n_routed_experts)
+ ]
+ )
+ self.gate = DeepseekV3TopkRouter(config)
+ self.shared_experts = DeepseekV3MLP(
+ config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts
+ )
+
+ def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
+ r"""
+ CALL FOR CONTRIBUTION! The expert weights should be fused so that the per-expert Python loop below can be
+ removed (DeepSeek-V3 has 256 routed experts, so the loop is costly).
+ """
+ final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
+ expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
+ expert_mask = expert_mask.permute(2, 0, 1)
+
+ for expert_idx in range(len(self.experts)):
+ expert = self.experts[expert_idx]
+ mask = expert_mask[expert_idx]
+ token_indices, weight_indices = torch.where(mask)
+
+ if token_indices.numel() > 0:
+ expert_weights = topk_weights[token_indices, weight_indices]
+ expert_input = hidden_states[token_indices]
+ expert_output = expert(expert_input)
+ weighted_output = expert_output * expert_weights.unsqueeze(-1)
+ final_hidden_states.index_add_(0, token_indices, weighted_output)
+
+ # in the original DeepSeek implementation, the outputs of the experts are gathered once we leave this module,
+ # so the MoE module is itself an IsolatedParallel module,
+ # and all experts are "local", meaning we shard but do not gather
+ return final_hidden_states.type(hidden_states.dtype)
+
+ def forward(self, hidden_states):
+ residuals = hidden_states
+ orig_shape = hidden_states.shape
+ topk_indices, topk_weights = self.gate(hidden_states)
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+ hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
+ hidden_states = hidden_states + self.shared_experts(residuals)
+ return hidden_states
+
+
+class DeepseekV3Attention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: DeepseekV3Config, layer_idx: int):
+ super().__init__()
+ self.config = config
+ self.layer_idx = layer_idx
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+ self.attention_dropout = config.attention_dropout
+ self.num_heads = config.num_attention_heads
+ self.rope_theta = config.rope_theta
+ self.q_lora_rank = config.q_lora_rank
+ self.qk_rope_head_dim = config.qk_rope_head_dim
+ self.kv_lora_rank = config.kv_lora_rank
+ self.v_head_dim = config.v_head_dim
+ self.qk_nope_head_dim = config.qk_nope_head_dim
+ self.qk_head_dim = config.qk_head_dim
+
+ self.is_causal = True
+ self.q_a_proj = nn.Linear(config.hidden_size, config.q_lora_rank, bias=config.attention_bias)
+ self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank)
+ self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False)
+
+ self.kv_a_proj_with_mqa = nn.Linear(
+ config.hidden_size,
+ self.kv_lora_rank + self.qk_rope_head_dim,
+ bias=config.attention_bias,
+ )
+ self.kv_a_layernorm = DeepseekV3RMSNorm(self.kv_lora_rank)
+ self.kv_b_proj = nn.Linear(
+ self.kv_lora_rank,
+ self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+ bias=False,
+ )
+
+ self.o_proj = nn.Linear(
+ self.num_heads * self.v_head_dim,
+ config.hidden_size,
+ bias=config.attention_bias,
+ )
+
+ self.scaling = self.qk_head_dim ** (-0.5)
+ if self.config.rope_scaling is not None:
+ mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
+ scaling_factor = self.config.rope_scaling["factor"]
+ if mscale_all_dim:
+ mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
+ self.scaling = self.scaling * mscale * mscale
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+ attention_mask: Optional[torch.Tensor],
+ past_key_value: Optional[Cache] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs: Unpack[FlashAttentionKwargs],
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ batch_size, seq_length = hidden_states.shape[:-1]
+ query_shape = (batch_size, seq_length, -1, self.qk_head_dim)
+ key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim)
+
+ q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))).view(query_shape).transpose(1, 2)
+ q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+
+ compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
+ k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+
+ k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)).view(key_shape).transpose(1, 2)
+ k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+
+ k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim)
+
+ cos, sin = position_embeddings
+ if self.config.rope_interleave: # support using interleaved weights for efficiency
+ q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, sin)
+ else:
+ q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin)
+ k_rot = k_rot.expand(*k_pass.shape[:-1], -1)
+
+ query_states = torch.cat((q_pass, q_rot), dim=-1)
+ key_states = torch.cat((k_pass, k_rot), dim=-1)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+ value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim])
+
+ attention_interface: Callable = eager_attention_forward
+ if self.config._attn_implementation != "eager":
+ if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
+ logger.warning_once(
+ "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+ 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ else:
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+ attn_output, attn_weights = attention_interface(
+ self,
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ dropout=0.0 if not self.training else self.attention_dropout,
+ scaling=self.scaling,
+ **kwargs,
+ )
+
+ if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+ attn_output = attn_output[:, :, :, : self.v_head_dim]
+
+ attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
+ attn_output = self.o_proj(attn_output)
+ return attn_output, attn_weights
+
+
+class DeepseekV3DecoderLayer(LlamaDecoderLayer, nn.Module):
+ def __init__(self, config: DeepseekV3Config, layer_idx: int):
+ nn.Module.__init__(self)  # initialize nn.Module directly, bypassing LlamaDecoderLayer.__init__
+ self.hidden_size = config.hidden_size
+
+ self.self_attn = DeepseekV3Attention(config=config, layer_idx=layer_idx)
+
+ if layer_idx >= config.first_k_dense_replace:
+ self.mlp = DeepseekV3MoE(config)
+ else:
+ self.mlp = DeepseekV3MLP(config)
+
+ self.input_layernorm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+
+class DeepseekV3PreTrainedModel(LlamaPreTrainedModel):
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, DeepseekV3RMSNorm):
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, DeepseekV3TopkRouter):
+ module.weight.data.normal_(mean=0.0, std=std)
+
+
+class DeepseekV3Model(LlamaModel):
+ _keys_to_ignore_on_load_unexpected = [r"model\.layers\.61.*"]
+
+
+class DeepseekV3ForCausalLM(LlamaForCausalLM):
+ pass
+
+
+__all__ = [
+ "DeepseekV3PreTrainedModel",
+ "DeepseekV3Model",
+ "DeepseekV3ForCausalLM",
+]
diff --git a/docs/transformers/build/lib/transformers/models/deformable_detr/__init__.py b/docs/transformers/build/lib/transformers/models/deformable_detr/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..16a1959c30ff5474ed82a6b0a2e22896514d6a44
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deformable_detr/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_deformable_detr import *
+ from .feature_extraction_deformable_detr import *
+ from .image_processing_deformable_detr import *
+ from .image_processing_deformable_detr_fast import *
+ from .modeling_deformable_detr import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/deformable_detr/configuration_deformable_detr.py b/docs/transformers/build/lib/transformers/models/deformable_detr/configuration_deformable_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..05bd0f906a110820c36f12fb75b73c1fd0927006
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deformable_detr/configuration_deformable_detr.py
@@ -0,0 +1,282 @@
+# coding=utf-8
+# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Deformable DETR model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import verify_backbone_config_arguments
+from ..auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+
+class DeformableDetrConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`DeformableDetrModel`]. It is used to instantiate
+ a Deformable DETR model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the Deformable DETR
+ [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ use_timm_backbone (`bool`, *optional*, defaults to `True`):
+ Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
+ API.
+ backbone_config (`PretrainedConfig` or `dict`, *optional*):
+ The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which
+ case it will default to `ResNetConfig()`.
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ num_queries (`int`, *optional*, defaults to 300):
+ Number of object queries, i.e. detection slots. This is the maximal number of objects
+ [`DeformableDetrModel`] can detect in a single image. In case `two_stage` is set to `True`, we use
+ `two_stage_num_proposals` instead.
+ d_model (`int`, *optional*, defaults to 256):
+ Dimension of the layers.
+ encoder_layers (`int`, *optional*, defaults to 6):
+ Number of encoder layers.
+ decoder_layers (`int`, *optional*, defaults to 6):
+ Number of decoder layers.
+ encoder_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ decoder_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer decoder.
+ decoder_ffn_dim (`int`, *optional*, defaults to 1024):
+ Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+ encoder_ffn_dim (`int`, *optional*, defaults to 1024):
+ Dimension of the "intermediate" (often named feed-forward) layer in encoder.
+ activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ activation_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for activations inside the fully connected layer.
+ init_std (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ init_xavier_std (`float`, *optional*, defaults to 1):
+ The scaling factor used for the Xavier initialization gain in the HM Attention map module.
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+ for more details.
+ auxiliary_loss (`bool`, *optional*, defaults to `False`):
+ Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
+ position_embedding_type (`str`, *optional*, defaults to `"sine"`):
+ Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
+ backbone (`str`, *optional*, defaults to `"resnet50"`):
+ Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+ will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+ is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+ use_pretrained_backbone (`bool`, *optional*, defaults to `True`):
+ Whether to use pretrained weights for the backbone.
+ backbone_kwargs (`dict`, *optional*):
+ Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+ e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+ dilation (`bool`, *optional*, defaults to `False`):
+ Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when
+ `use_timm_backbone` = `True`.
+ class_cost (`float`, *optional*, defaults to 1):
+ Relative weight of the classification error in the Hungarian matching cost.
+ bbox_cost (`float`, *optional*, defaults to 5):
+ Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
+ giou_cost (`float`, *optional*, defaults to 2):
+ Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
+ mask_loss_coefficient (`float`, *optional*, defaults to 1):
+ Relative weight of the Focal loss in the panoptic segmentation loss.
+ dice_loss_coefficient (`float`, *optional*, defaults to 1):
+ Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
+ bbox_loss_coefficient (`float`, *optional*, defaults to 5):
+ Relative weight of the L1 bounding box loss in the object detection loss.
+ giou_loss_coefficient (`float`, *optional*, defaults to 2):
+ Relative weight of the generalized IoU loss in the object detection loss.
+ eos_coefficient (`float`, *optional*, defaults to 0.1):
+ Relative classification weight of the 'no-object' class in the object detection loss.
+ num_feature_levels (`int`, *optional*, defaults to 4):
+ The number of input feature levels.
+ encoder_n_points (`int`, *optional*, defaults to 4):
+ The number of sampled keys in each feature level for each attention head in the encoder.
+ decoder_n_points (`int`, *optional*, defaults to 4):
+ The number of sampled keys in each feature level for each attention head in the decoder.
+ two_stage (`bool`, *optional*, defaults to `False`):
+ Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of
+ Deformable DETR, which are further fed into the decoder for iterative bounding box refinement.
+ two_stage_num_proposals (`int`, *optional*, defaults to 300):
+ The number of region proposals to be generated, in case `two_stage` is set to `True`.
+ with_box_refine (`bool`, *optional*, defaults to `False`):
+ Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes
+ based on the predictions from the previous layer.
+ focal_alpha (`float`, *optional*, defaults to 0.25):
+ Alpha parameter in the focal loss.
+ disable_custom_kernels (`bool`, *optional*, defaults to `False`):
+ Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom
+ kernels are not supported by PyTorch ONNX export.
+
+ Examples:
+
+ ```python
+ >>> from transformers import DeformableDetrConfig, DeformableDetrModel
+
+ >>> # Initializing a Deformable DETR SenseTime/deformable-detr style configuration
+ >>> configuration = DeformableDetrConfig()
+
+ >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration
+ >>> model = DeformableDetrModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "deformable_detr"
+ attribute_map = {
+ "hidden_size": "d_model",
+ "num_attention_heads": "encoder_attention_heads",
+ }
+
+ def __init__(
+ self,
+ use_timm_backbone=True,
+ backbone_config=None,
+ num_channels=3,
+ num_queries=300,
+ max_position_embeddings=1024,
+ encoder_layers=6,
+ encoder_ffn_dim=1024,
+ encoder_attention_heads=8,
+ decoder_layers=6,
+ decoder_ffn_dim=1024,
+ decoder_attention_heads=8,
+ encoder_layerdrop=0.0,
+ is_encoder_decoder=True,
+ activation_function="relu",
+ d_model=256,
+ dropout=0.1,
+ attention_dropout=0.0,
+ activation_dropout=0.0,
+ init_std=0.02,
+ init_xavier_std=1.0,
+ return_intermediate=True,
+ auxiliary_loss=False,
+ position_embedding_type="sine",
+ backbone="resnet50",
+ use_pretrained_backbone=True,
+ backbone_kwargs=None,
+ dilation=False,
+ num_feature_levels=4,
+ encoder_n_points=4,
+ decoder_n_points=4,
+ two_stage=False,
+ two_stage_num_proposals=300,
+ with_box_refine=False,
+ class_cost=1,
+ bbox_cost=5,
+ giou_cost=2,
+ mask_loss_coefficient=1,
+ dice_loss_coefficient=1,
+ bbox_loss_coefficient=5,
+ giou_loss_coefficient=2,
+ eos_coefficient=0.1,
+ focal_alpha=0.25,
+ disable_custom_kernels=False,
+ **kwargs,
+ ):
+ # We default to values which were previously hard-coded in the model. This enables configurability of the config
+ # while keeping the default behavior the same.
+ if use_timm_backbone and backbone_kwargs is None:
+ backbone_kwargs = {}
+ if dilation:
+ backbone_kwargs["output_stride"] = 16
+ backbone_kwargs["out_indices"] = [2, 3, 4] if num_feature_levels > 1 else [4]
+ backbone_kwargs["in_chans"] = num_channels
+ # Backwards compatibility
+ elif not use_timm_backbone and backbone in (None, "resnet50"):
+ if backbone_config is None:
+ logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
+ backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
+ elif isinstance(backbone_config, dict):
+ backbone_model_type = backbone_config.get("model_type")
+ config_class = CONFIG_MAPPING[backbone_model_type]
+ backbone_config = config_class.from_dict(backbone_config)
+
+ verify_backbone_config_arguments(
+ use_timm_backbone=use_timm_backbone,
+ use_pretrained_backbone=use_pretrained_backbone,
+ backbone=backbone,
+ backbone_config=backbone_config,
+ backbone_kwargs=backbone_kwargs,
+ )
+
+ self.use_timm_backbone = use_timm_backbone
+ self.backbone_config = backbone_config
+ self.num_channels = num_channels
+ self.num_queries = num_queries
+ self.max_position_embeddings = max_position_embeddings
+ self.d_model = d_model
+ self.encoder_ffn_dim = encoder_ffn_dim
+ self.encoder_layers = encoder_layers
+ self.encoder_attention_heads = encoder_attention_heads
+ self.decoder_ffn_dim = decoder_ffn_dim
+ self.decoder_layers = decoder_layers
+ self.decoder_attention_heads = decoder_attention_heads
+ self.dropout = dropout
+ self.attention_dropout = attention_dropout
+ self.activation_dropout = activation_dropout
+ self.activation_function = activation_function
+ self.init_std = init_std
+ self.init_xavier_std = init_xavier_std
+ self.encoder_layerdrop = encoder_layerdrop
+ self.auxiliary_loss = auxiliary_loss
+ self.position_embedding_type = position_embedding_type
+ self.backbone = backbone
+ self.use_pretrained_backbone = use_pretrained_backbone
+ self.backbone_kwargs = backbone_kwargs
+ self.dilation = dilation
+ # deformable attributes
+ self.num_feature_levels = num_feature_levels
+ self.encoder_n_points = encoder_n_points
+ self.decoder_n_points = decoder_n_points
+ self.two_stage = two_stage
+ self.two_stage_num_proposals = two_stage_num_proposals
+ self.with_box_refine = with_box_refine
+ if two_stage is True and with_box_refine is False:
+ raise ValueError("If two_stage is True, with_box_refine must be True.")
+ # Hungarian matcher
+ self.class_cost = class_cost
+ self.bbox_cost = bbox_cost
+ self.giou_cost = giou_cost
+ # Loss coefficients
+ self.mask_loss_coefficient = mask_loss_coefficient
+ self.dice_loss_coefficient = dice_loss_coefficient
+ self.bbox_loss_coefficient = bbox_loss_coefficient
+ self.giou_loss_coefficient = giou_loss_coefficient
+ self.eos_coefficient = eos_coefficient
+ self.focal_alpha = focal_alpha
+ self.disable_custom_kernels = disable_custom_kernels
+ super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+ @property
+ def num_attention_heads(self) -> int:
+ return self.encoder_attention_heads
+
+ @property
+ def hidden_size(self) -> int:
+ return self.d_model
+
+
+__all__ = ["DeformableDetrConfig"]
diff --git a/docs/transformers/build/lib/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py b/docs/transformers/build/lib/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..c88582eaccfd1ce64ef24341497f247434b9b562
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py
@@ -0,0 +1,236 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert Deformable DETR checkpoints."""
+
+import argparse
+import json
+from pathlib import Path
+
+import requests
+import torch
+from huggingface_hub import hf_hub_download
+from PIL import Image
+
+from transformers import DeformableDetrConfig, DeformableDetrForObjectDetection, DeformableDetrImageProcessor
+from transformers.utils import logging
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+
+def rename_key(orig_key):
+ if "backbone.0.body" in orig_key:
+ orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model")
+ if "transformer" in orig_key:
+ orig_key = orig_key.replace("transformer.", "")
+ if "norm1" in orig_key:
+ if "encoder" in orig_key:
+ orig_key = orig_key.replace("norm1", "self_attn_layer_norm")
+ else:
+ orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm")
+ if "norm2" in orig_key:
+ if "encoder" in orig_key:
+ orig_key = orig_key.replace("norm2", "final_layer_norm")
+ else:
+ orig_key = orig_key.replace("norm2", "self_attn_layer_norm")
+ if "norm3" in orig_key:
+ orig_key = orig_key.replace("norm3", "final_layer_norm")
+ if "linear1" in orig_key:
+ orig_key = orig_key.replace("linear1", "fc1")
+ if "linear2" in orig_key:
+ orig_key = orig_key.replace("linear2", "fc2")
+ if "query_embed" in orig_key:
+ orig_key = orig_key.replace("query_embed", "query_position_embeddings")
+ if "cross_attn" in orig_key:
+ orig_key = orig_key.replace("cross_attn", "encoder_attn")
+
+ return orig_key
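+
+ # Illustrative example (added comment, not in the original script): with the renames above, a key like
+ # "transformer.decoder.layers.0.norm1.weight" becomes "decoder.layers.0.encoder_attn_layer_norm.weight":
+ # the "transformer." prefix is dropped and, outside the encoder, "norm1" maps to the cross-attention layer norm.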
+
+
+def read_in_q_k_v(state_dict):
+ # transformer decoder self-attention layers
+ for i in range(6):
+ # read in weights + bias of input projection layer of self-attention
+ in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight")
+ in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias")
+ # next, add query, keys and values (in that order) to the state dict
+ state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
+ state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
+ state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
+ state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
+ state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
+ state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ im = Image.open(requests.get(url, stream=True).raw)
+
+ return im
+
+
+@torch.no_grad()
+def convert_deformable_detr_checkpoint(
+ checkpoint_path,
+ single_scale,
+ dilation,
+ with_box_refine,
+ two_stage,
+ pytorch_dump_folder_path,
+ push_to_hub,
+):
+ """
+ Copy/paste/tweak model's weights to our Deformable DETR structure.
+ """
+
+ # load default config
+ config = DeformableDetrConfig()
+ # set config attributes
+ if single_scale:
+ config.num_feature_levels = 1
+ config.dilation = dilation
+ config.with_box_refine = with_box_refine
+ config.two_stage = two_stage
+ # set labels
+ config.num_labels = 91
+ repo_id = "huggingface/label-files"
+ filename = "coco-detection-id2label.json"
+ id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text())
+ id2label = {int(k): v for k, v in id2label.items()}
+ config.id2label = id2label
+ config.label2id = {v: k for k, v in id2label.items()}
+
+ # load image processor
+ image_processor = DeformableDetrImageProcessor(format="coco_detection")
+
+ # prepare image
+ img = prepare_img()
+ encoding = image_processor(images=img, return_tensors="pt")
+ pixel_values = encoding["pixel_values"]
+
+ logger.info("Converting model...")
+
+ # load original state dict
+ state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"]
+ # rename keys
+ for key in state_dict.copy().keys():
+ val = state_dict.pop(key)
+ state_dict[rename_key(key)] = val
+ # query, key and value matrices need special treatment
+ read_in_q_k_v(state_dict)
+ # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
+ prefix = "model."
+ for key in state_dict.copy().keys():
+ if not key.startswith("class_embed") and not key.startswith("bbox_embed"):
+ val = state_dict.pop(key)
+ state_dict[prefix + key] = val
+ # finally, create HuggingFace model and load state dict
+ model = DeformableDetrForObjectDetection(config)
+ model.load_state_dict(state_dict)
+ model.eval()
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)
+ # verify our conversion
+ outputs = model(pixel_values.to(device))
+
+ expected_logits = torch.tensor(
+ [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]]
+ )
+ expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]])
+
+ if single_scale:
+ expected_logits = torch.tensor(
+ [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]]
+ )
+ expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]])
+
+ if single_scale and dilation:
+ expected_logits = torch.tensor(
+ [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]]
+ )
+ expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]])
+
+ if with_box_refine:
+ expected_logits = torch.tensor(
+ [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]]
+ )
+ expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]])
+
+ if with_box_refine and two_stage:
+ expected_logits = torch.tensor(
+ [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]]
+ )
+ expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]])
+
+ print("Logits:", outputs.logits[0, :3, :3])
+
+ assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
+ assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)
+
+ print("Everything ok!")
+
+ # Save model and image processor
+ logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
+ Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
+ model.save_pretrained(pytorch_dump_folder_path)
+ image_processor.save_pretrained(pytorch_dump_folder_path)
+
+ # Push to hub
+ if push_to_hub:
+ model_name = "deformable-detr"
+ model_name += "-single-scale" if single_scale else ""
+ model_name += "-dc5" if dilation else ""
+ model_name += "-with-box-refine" if with_box_refine else ""
+ model_name += "-two-stage" if two_stage else ""
+ print("Pushing model to hub...")
+ model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument(
+ "--checkpoint_path",
+ type=str,
+ default="/home/niels/checkpoints/deformable_detr/r50_deformable_detr-checkpoint.pth",
+ help="Path to Pytorch checkpoint (.pth file) you'd like to convert.",
+ )
+ parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.")
+ parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.")
+ parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.")
+ parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.")
+ parser.add_argument(
+ "--pytorch_dump_folder_path",
+ default=None,
+ type=str,
+ required=True,
+ help="Path to the folder to output PyTorch model.",
+ )
+ parser.add_argument(
+ "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+ )
+ args = parser.parse_args()
+ convert_deformable_detr_checkpoint(
+ args.checkpoint_path,
+ args.single_scale,
+ args.dilation,
+ args.with_box_refine,
+ args.two_stage,
+ args.pytorch_dump_folder_path,
+ args.push_to_hub,
+ )
diff --git a/docs/transformers/build/lib/transformers/models/deformable_detr/feature_extraction_deformable_detr.py b/docs/transformers/build/lib/transformers/models/deformable_detr/feature_extraction_deformable_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..e349ca3db0ca92b349752484877ad8dd64d153b1
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deformable_detr/feature_extraction_deformable_detr.py
@@ -0,0 +1,48 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for Deformable DETR."""
+
+import warnings
+
+from ...image_transforms import rgb_to_id as _rgb_to_id
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_deformable_detr import DeformableDetrImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+def rgb_to_id(x):
+ warnings.warn(
+ "rgb_to_id has moved and will not be importable from this module from v5. "
+ "Please import from transformers.image_transforms instead.",
+ FutureWarning,
+ )
+ return _rgb_to_id(x)
+
+
+@requires(backends=("vision",))
+class DeformableDetrFeatureExtractor(DeformableDetrImageProcessor):
+ def __init__(self, *args, **kwargs) -> None:
+ warnings.warn(
+ "The class DeformableDetrFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
+ " Please use DeformableDetrImageProcessor instead.",
+ FutureWarning,
+ )
+ super().__init__(*args, **kwargs)
+
+
+__all__ = ["DeformableDetrFeatureExtractor"]
diff --git a/docs/transformers/build/lib/transformers/models/deformable_detr/image_processing_deformable_detr.py b/docs/transformers/build/lib/transformers/models/deformable_detr/image_processing_deformable_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7ad8a14997807f653e564b79bbec1f48a17837c
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deformable_detr/image_processing_deformable_detr.py
@@ -0,0 +1,1633 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Deformable DETR."""
+
+import io
+import pathlib
+from collections import defaultdict
+from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_processing_utils import BaseImageProcessor, get_size_dict
+from ...image_transforms import (
+ PaddingMode,
+ center_to_corners_format,
+ corners_to_center_format,
+ id_to_rgb,
+ pad,
+ rescale,
+ resize,
+ rgb_to_id,
+ to_channel_dimension_format,
+)
+from ...image_utils import (
+ IMAGENET_DEFAULT_MEAN,
+ IMAGENET_DEFAULT_STD,
+ AnnotationFormat,
+ AnnotationType,
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ get_image_size,
+ infer_channel_dimension_format,
+ is_scaled_image,
+ make_list_of_images,
+ to_numpy_array,
+ valid_images,
+ validate_annotations,
+ validate_kwargs,
+ validate_preprocess_arguments,
+)
+from ...utils import (
+ TensorType,
+ is_flax_available,
+ is_jax_tensor,
+ is_scipy_available,
+ is_tf_available,
+ is_tf_tensor,
+ is_torch_available,
+ is_torch_tensor,
+ is_vision_available,
+ logging,
+)
+from ...utils.import_utils import requires
+
+
+if is_torch_available():
+ import torch
+ from torch import nn
+
+
+if is_vision_available():
+ import PIL
+
+if is_scipy_available():
+ import scipy.special
+ import scipy.stats
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
+def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
+ """
+ Computes the output image size given the input image size and the desired output size.
+
+ Args:
+ image_size (`Tuple[int, int]`):
+ The input image size.
+ size (`int`):
+ The desired output size.
+ max_size (`int`, *optional*):
+ The maximum allowed output size.
+ """
+ height, width = image_size
+ raw_size = None
+ if max_size is not None:
+ min_original_size = float(min((height, width)))
+ max_original_size = float(max((height, width)))
+ if max_original_size / min_original_size * size > max_size:
+ raw_size = max_size * min_original_size / max_original_size
+ size = int(round(raw_size))
+
+ if (height <= width and height == size) or (width <= height and width == size):
+ oh, ow = height, width
+ elif width < height:
+ ow = size
+ if max_size is not None and raw_size is not None:
+ oh = int(raw_size * height / width)
+ else:
+ oh = int(size * height / width)
+ else:
+ oh = size
+ if max_size is not None and raw_size is not None:
+ ow = int(raw_size * width / height)
+ else:
+ ow = int(size * width / height)
+
+ return (oh, ow)
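+
+ # Worked example (illustrative, not part of the original file): for image_size = (480, 640), size = 800 and
+ # max_size = 1333, the scaled longest edge 640 / 480 * 800 ≈ 1067 does not exceed max_size, so the shortest
+ # edge is set to 800 and the function returns (800, int(800 * 640 / 480)) = (800, 1066).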
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
+def get_resize_output_image_size(
+ input_image: np.ndarray,
+ size: Union[int, Tuple[int, int], List[int]],
+ max_size: Optional[int] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+ """
+ Computes the output image size given the input image size and the desired output size. If the desired output size
+ is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
+ image size is computed by keeping the aspect ratio of the input image size.
+
+ Args:
+ input_image (`np.ndarray`):
+ The image to resize.
+ size (`int` or `Tuple[int, int]` or `List[int]`):
+ The desired output size.
+ max_size (`int`, *optional*):
+ The maximum allowed output size.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+ """
+ image_size = get_image_size(input_image, input_data_format)
+ if isinstance(size, (list, tuple)):
+ return size
+
+ return get_size_with_aspect_ratio(image_size, size, max_size)
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
+def get_image_size_for_max_height_width(
+ input_image: np.ndarray,
+ max_height: int,
+ max_width: int,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+ """
+ Computes the output image size given the input image and the maximum allowed height and width, keeping the
+ aspect ratio. Note that even if image_height < max_height and image_width < max_width, the image will still be
+ resized so that at least one of its edges equals max_height or max_width.
+
+ For example:
+ - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+ - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+
+ Args:
+ input_image (`np.ndarray`):
+ The image to resize.
+ max_height (`int`):
+ The maximum allowed height.
+ max_width (`int`):
+ The maximum allowed width.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+ """
+ image_size = get_image_size(input_image, input_data_format)
+ height, width = image_size
+ height_scale = max_height / height
+ width_scale = max_width / width
+ min_scale = min(height_scale, width_scale)
+ new_height = int(height * min_scale)
+ new_width = int(width * min_scale)
+ return new_height, new_width
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
+def get_numpy_to_framework_fn(arr) -> Callable:
+ """
+ Returns a function that converts a numpy array to the framework of the input array.
+
+ Args:
+ arr (`np.ndarray`): The array to convert.
+ """
+ if isinstance(arr, np.ndarray):
+ return np.array
+ if is_tf_available() and is_tf_tensor(arr):
+ import tensorflow as tf
+
+ return tf.convert_to_tensor
+ if is_torch_available() and is_torch_tensor(arr):
+ import torch
+
+ return torch.tensor
+ if is_flax_available() and is_jax_tensor(arr):
+ import jax.numpy as jnp
+
+ return jnp.array
+ raise ValueError(f"Cannot convert arrays of type {type(arr)}")
+
+
+# Copied from transformers.models.detr.image_processing_detr.safe_squeeze
+def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
+ """
+ Squeezes an array, but only if the axis specified has dim 1.
+ """
+ if axis is None:
+ return arr.squeeze()
+
+ try:
+ return arr.squeeze(axis=axis)
+ except ValueError:
+ return arr
+
+
+# Copied from transformers.models.detr.image_processing_detr.normalize_annotation
+def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
+ image_height, image_width = image_size
+ norm_annotation = {}
+ for key, value in annotation.items():
+ if key == "boxes":
+ boxes = value
+ boxes = corners_to_center_format(boxes)
+ boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
+ norm_annotation[key] = boxes
+ else:
+ norm_annotation[key] = value
+ return norm_annotation
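+
+ # Worked example (illustrative, not from the original file): for image_size = (400, 600) (height, width), a
+ # corner-format box [10, 20, 110, 220] becomes center format [60, 120, 100, 200] and, after dividing by
+ # [600, 400, 600, 400], roughly [0.1, 0.3, 0.167, 0.5].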
+
+
+# Copied from transformers.models.detr.image_processing_detr.max_across_indices
+def max_across_indices(values: Iterable[Any]) -> List[Any]:
+ """
+ Return the maximum value across all indices of an iterable of values.
+ """
+ return [max(values_i) for values_i in zip(*values)]
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_max_height_width
+def get_max_height_width(
+ images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> List[int]:
+ """
+ Get the maximum height and width across all images in a batch.
+ """
+ if input_data_format is None:
+ input_data_format = infer_channel_dimension_format(images[0])
+
+ if input_data_format == ChannelDimension.FIRST:
+ _, max_height, max_width = max_across_indices([img.shape for img in images])
+ elif input_data_format == ChannelDimension.LAST:
+ max_height, max_width, _ = max_across_indices([img.shape for img in images])
+ else:
+ raise ValueError(f"Invalid channel dimension format: {input_data_format}")
+ return (max_height, max_width)
+
+
+# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
+def make_pixel_mask(
+ image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> np.ndarray:
+ """
+ Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+
+ Args:
+ image (`np.ndarray`):
+ Image to make the pixel mask for.
+ output_size (`Tuple[int, int]`):
+ Output size of the mask.
+ """
+ input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+ mask = np.zeros(output_size, dtype=np.int64)
+ mask[:input_height, :input_width] = 1
+ return mask
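+
+ # Illustrative example (added comment): an image of size (height, width) = (2, 3) padded to output_size = (3, 4)
+ # yields the mask
+ # [[1, 1, 1, 0],
+ #  [1, 1, 1, 0],
+ #  [0, 0, 0, 0]].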
+
+
+# Copied from transformers.models.detr.image_processing_detr.convert_coco_poly_to_mask
+def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
+ """
+ Convert a COCO polygon annotation to a mask.
+
+ Args:
+ segmentations (`List[List[float]]`):
+ List of polygons, each polygon represented by a list of x-y coordinates.
+ height (`int`):
+ Height of the mask.
+ width (`int`):
+ Width of the mask.
+ """
+ try:
+ from pycocotools import mask as coco_mask
+ except ImportError:
+ raise ImportError("Pycocotools is not installed in your environment.")
+
+ masks = []
+ for polygons in segmentations:
+ rles = coco_mask.frPyObjects(polygons, height, width)
+ mask = coco_mask.decode(rles)
+ if len(mask.shape) < 3:
+ mask = mask[..., None]
+ mask = np.asarray(mask, dtype=np.uint8)
+ mask = np.any(mask, axis=2)
+ masks.append(mask)
+ if masks:
+ masks = np.stack(masks, axis=0)
+ else:
+ masks = np.zeros((0, height, width), dtype=np.uint8)
+
+ return masks
+
+
+# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->DeformableDetr
+def prepare_coco_detection_annotation(
+ image,
+ target,
+ return_segmentation_masks: bool = False,
+ input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+ """
+ Convert the target in COCO format into the format expected by DeformableDetr.
+ """
+ image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+
+ image_id = target["image_id"]
+ image_id = np.asarray([image_id], dtype=np.int64)
+
+ # Get all COCO annotations for the given image.
+ annotations = target["annotations"]
+ annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
+
+ classes = [obj["category_id"] for obj in annotations]
+ classes = np.asarray(classes, dtype=np.int64)
+
+ # for conversion to coco api
+ area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
+ iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64)
+
+ boxes = [obj["bbox"] for obj in annotations]
+ # guard against no boxes via resizing
+ boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
+ boxes[:, 2:] += boxes[:, :2]
+ boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+ boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+ new_target = {}
+ new_target["image_id"] = image_id
+ new_target["class_labels"] = classes[keep]
+ new_target["boxes"] = boxes[keep]
+ new_target["area"] = area[keep]
+ new_target["iscrowd"] = iscrowd[keep]
+ new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
+
+ if annotations and "keypoints" in annotations[0]:
+ keypoints = [obj["keypoints"] for obj in annotations]
+ # Converting the filtered keypoints list to a numpy array
+ keypoints = np.asarray(keypoints, dtype=np.float32)
+ # Apply the keep mask here to filter the relevant annotations
+ keypoints = keypoints[keep]
+ num_keypoints = keypoints.shape[0]
+ keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+ new_target["keypoints"] = keypoints
+
+ if return_segmentation_masks:
+ segmentation_masks = [obj["segmentation"] for obj in annotations]
+ masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
+ new_target["masks"] = masks[keep]
+
+ return new_target
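+
+ # Note (added comment, not in the original file): COCO boxes come in as [x, y, width, height]; the
+ # `boxes[:, 2:] += boxes[:, :2]` step above converts them to corner format, so e.g. [10, 20, 30, 40] becomes
+ # [10, 20, 40, 60] before clipping to the image size and dropping degenerate boxes.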
+
+
+# Copied from transformers.models.detr.image_processing_detr.masks_to_boxes
+def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
+ """
+ Compute the bounding boxes around the provided panoptic segmentation masks.
+
+ Args:
+ masks: masks in format `[number_masks, height, width]`, where `number_masks` is the number of masks
+
+ Returns:
+ boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
+ """
+ if masks.size == 0:
+ return np.zeros((0, 4))
+
+ h, w = masks.shape[-2:]
+ y = np.arange(0, h, dtype=np.float32)
+ x = np.arange(0, w, dtype=np.float32)
+ # see https://github.com/pytorch/pytorch/issues/50276
+ y, x = np.meshgrid(y, x, indexing="ij")
+
+ x_mask = masks * np.expand_dims(x, axis=0)
+ x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
+ x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
+ x_min = x.filled(fill_value=1e8)
+ x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
+
+ y_mask = masks * np.expand_dims(y, axis=0)
+ y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
+ y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
+ y_min = y.filled(fill_value=1e8)
+ y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
+
+ return np.stack([x_min, y_min, x_max, y_max], 1)
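+
+ # Worked example (illustrative): for a single 3x4 mask that is 1 only at rows 1-2 and columns 1-2,
+ # x_min = y_min = 1 and x_max = y_max = 2, so the function returns [[1., 1., 2., 2.]] (xyxy, in pixel indices).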
+
+
+# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->DeformableDetr
+def prepare_coco_panoptic_annotation(
+ image: np.ndarray,
+ target: Dict,
+ masks_path: Union[str, pathlib.Path],
+ return_masks: bool = True,
+ input_data_format: Union[ChannelDimension, str] = None,
+) -> Dict:
+ """
+ Prepare a coco panoptic annotation for DeformableDetr.
+ """
+ image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+ annotation_path = pathlib.Path(masks_path) / target["file_name"]
+
+ new_target = {}
+ new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
+ new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
+ new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
+
+ if "segments_info" in target:
+ masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
+ masks = rgb_to_id(masks)
+
+ ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
+ masks = masks == ids[:, None, None]
+ masks = masks.astype(np.uint8)
+ if return_masks:
+ new_target["masks"] = masks
+ new_target["boxes"] = masks_to_boxes(masks)
+ new_target["class_labels"] = np.array(
+ [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
+ )
+ new_target["iscrowd"] = np.asarray(
+ [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
+ )
+ new_target["area"] = np.asarray(
+ [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
+ )
+
+ return new_target
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_segmentation_image
+def get_segmentation_image(
+ masks: np.ndarray, input_size: Tuple, target_size: Tuple, stuff_equiv_classes, deduplicate=False
+):
+ h, w = input_size
+ final_h, final_w = target_size
+
+ m_id = scipy.special.softmax(masks.transpose(0, 1), -1)
+
+ if m_id.shape[-1] == 0:
+ # We didn't detect any mask :(
+ m_id = np.zeros((h, w), dtype=np.int64)
+ else:
+ m_id = m_id.argmax(-1).reshape(h, w)
+
+ if deduplicate:
+ # Merge the masks corresponding to the same stuff class
+ for equiv in stuff_equiv_classes.values():
+ for eq_id in equiv:
+ m_id[m_id == eq_id] = equiv[0]
+
+ seg_img = id_to_rgb(m_id)
+ seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST)
+ return seg_img
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_mask_area
+def get_mask_area(seg_img: np.ndarray, target_size: Tuple[int, int], n_classes: int) -> np.ndarray:
+ final_h, final_w = target_size
+ np_seg_img = seg_img.astype(np.uint8)
+ np_seg_img = np_seg_img.reshape(final_h, final_w, 3)
+ m_id = rgb_to_id(np_seg_img)
+ area = [(m_id == i).sum() for i in range(n_classes)]
+ return area
+
+
+# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities
+def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ probs = scipy.special.softmax(logits, axis=-1)
+ labels = probs.argmax(-1, keepdims=True)
+ scores = np.take_along_axis(probs, labels, axis=-1)
+ scores, labels = scores.squeeze(-1), labels.squeeze(-1)
+ return scores, labels
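+
+ # Illustrative example (added comment): for logits [[1.0, 2.0, 0.5]] the softmax gives probabilities of roughly
+ # [0.23, 0.63, 0.14], so the function returns scores ≈ [0.63] and labels = [1].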
+
+
+# Copied from transformers.models.detr.image_processing_detr.post_process_panoptic_sample
+def post_process_panoptic_sample(
+ out_logits: np.ndarray,
+ masks: np.ndarray,
+ boxes: np.ndarray,
+ processed_size: Tuple[int, int],
+ target_size: Tuple[int, int],
+ is_thing_map: Dict,
+ threshold=0.85,
+) -> Dict:
+ """
+ Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample.
+
+ Args:
+ out_logits (`torch.Tensor`):
+ The logits for this sample.
+ masks (`torch.Tensor`):
+ The predicted segmentation masks for this sample.
+ boxes (`torch.Tensor`):
+ The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y,
+ width, height)` with values in `[0, 1]`, relative to the size of the image (disregarding padding).
+ processed_size (`Tuple[int, int]`):
+ The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size
+ after data augmentation but before batching.
+ target_size (`Tuple[int, int]`):
+ The target size of the image, `(height, width)` corresponding to the requested final size of the
+ prediction.
+ is_thing_map (`Dict`):
+ A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not.
+ threshold (`float`, *optional*, defaults to 0.85):
+ The threshold used to binarize the segmentation masks.
+ """
+ # we filter empty queries and detections below the threshold
+ scores, labels = score_labels_from_class_probabilities(out_logits)
+ keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold)
+
+ cur_scores = scores[keep]
+ cur_classes = labels[keep]
+ cur_boxes = center_to_corners_format(boxes[keep])
+
+ if len(cur_boxes) != len(cur_classes):
+ raise ValueError("Not as many boxes as there are classes")
+
+ cur_masks = masks[keep]
+ cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR)
+ cur_masks = safe_squeeze(cur_masks, 1)
+ b, h, w = cur_masks.shape
+
+ # It may be that we have several predicted masks for the same stuff class.
+ # In the following, we track the list of mask ids for each stuff class (they are merged later on)
+ cur_masks = cur_masks.reshape(b, -1)
+ stuff_equiv_classes = defaultdict(list)
+ for k, label in enumerate(cur_classes):
+ if not is_thing_map[label]:
+ stuff_equiv_classes[label].append(k)
+
+ seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True)
+ area = get_mask_area(cur_masks, processed_size, n_classes=len(cur_scores))
+
+ # We filter out any mask that is too small
+ if cur_classes.size > 0:
+ # We now filter out empty masks as long as we find some
+ filtered_small = np.array([a <= 4 for a in area], dtype=bool)
+ while filtered_small.any():
+ cur_masks = cur_masks[~filtered_small]
+ cur_scores = cur_scores[~filtered_small]
+ cur_classes = cur_classes[~filtered_small]
+ seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True)
+ area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))
+ filtered_small = np.array([a <= 4 for a in area], dtype=bool)
+ else:
+ cur_classes = np.ones((1, 1), dtype=np.int64)
+
+ segments_info = [
+ {"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a}
+ for i, (cat, a) in enumerate(zip(cur_classes, area))
+ ]
+ del cur_classes
+
+ with io.BytesIO() as out:
+ PIL.Image.fromarray(seg_img).save(out, format="PNG")
+ predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
+
+ return predictions
+
+
+# Copied from transformers.models.detr.image_processing_detr.resize_annotation
+def resize_annotation(
+ annotation: Dict[str, Any],
+ orig_size: Tuple[int, int],
+ target_size: Tuple[int, int],
+ threshold: float = 0.5,
+ resample: PILImageResampling = PILImageResampling.NEAREST,
+):
+ """
+ Resizes an annotation to a target size.
+
+ Args:
+ annotation (`Dict[str, Any]`):
+ The annotation dictionary.
+ orig_size (`Tuple[int, int]`):
+ The original size of the input image.
+ target_size (`Tuple[int, int]`):
+ The target size of the image, as returned by the preprocessing `resize` step.
+ threshold (`float`, *optional*, defaults to 0.5):
+ The threshold used to binarize the segmentation masks.
+ resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
+ The resampling filter to use when resizing the masks.
+ """
+ ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
+ ratio_height, ratio_width = ratios
+
+ new_annotation = {}
+ new_annotation["size"] = target_size
+
+ for key, value in annotation.items():
+ if key == "boxes":
+ boxes = value
+ scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
+ new_annotation["boxes"] = scaled_boxes
+ elif key == "area":
+ area = value
+ scaled_area = area * (ratio_width * ratio_height)
+ new_annotation["area"] = scaled_area
+ elif key == "masks":
+ masks = value[:, None]
+ masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
+ masks = masks.astype(np.float32)
+ masks = masks[:, 0] > threshold
+ new_annotation["masks"] = masks
+ elif key == "size":
+ new_annotation["size"] = target_size
+ else:
+ new_annotation[key] = value
+
+ return new_annotation
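+
+ # Worked example (illustrative, not in the original file): resizing from orig_size (100, 200) to
+ # target_size (200, 400) gives ratios (2.0, 2.0), so a box [10, 20, 30, 40] is scaled to [20, 40, 60, 80]
+ # and areas are multiplied by 4; masks are resized and re-binarized with `threshold`.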
+
+
+# Copied from transformers.models.detr.image_processing_detr.binary_mask_to_rle
+def binary_mask_to_rle(mask):
+ """
+ Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.
+
+ Args:
+ mask (`torch.Tensor` or `numpy.array`):
+ A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
+ segment_id or class_id.
+ Returns:
+ `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
+ format.
+ """
+ if is_torch_tensor(mask):
+ mask = mask.numpy()
+
+ pixels = mask.flatten()
+ pixels = np.concatenate([[0], pixels, [0]])
+ runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
+ runs[1::2] -= runs[::2]
+ return list(runs)
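+
+ # Worked example (illustrative): a mask [[0, 1, 1], [0, 1, 0]] flattens to [0, 1, 1, 0, 1, 0], which encodes as
+ # [2, 2, 5, 1]: a run of 2 foreground pixels starting at (1-based) position 2, then a run of 1 starting at
+ # position 5.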
+
+
+# Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle
+def convert_segmentation_to_rle(segmentation):
+ """
+ Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format.
+
+ Args:
+ segmentation (`torch.Tensor` or `numpy.array`):
+ A segmentation map of shape `(height, width)` where each value denotes a segment or class id.
+ Returns:
+ `List[List]`: A list of lists, where each list is the run-length encoding of a segment / class id.
+ """
+ segment_ids = torch.unique(segmentation)
+
+ run_length_encodings = []
+ for idx in segment_ids:
+ mask = torch.where(segmentation == idx, 1, 0)
+ rle = binary_mask_to_rle(mask)
+ run_length_encodings.append(rle)
+
+ return run_length_encodings
+
+
+# Copied from transformers.models.detr.image_processing_detr.remove_low_and_no_objects
+def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
+ """
+ Filters out queries whose predicted class is the "no object" class or whose score is below
+ `object_mask_threshold`, returning the corresponding entries of `masks`, `scores` and `labels`.
+
+ Args:
+ masks (`torch.Tensor`):
+ A tensor of shape `(num_queries, height, width)`.
+ scores (`torch.Tensor`):
+ A tensor of shape `(num_queries)`.
+ labels (`torch.Tensor`):
+ A tensor of shape `(num_queries)`.
+ object_mask_threshold (`float`):
+ A number between 0 and 1 used to binarize the masks.
+ Raises:
+ `ValueError`: Raised when the first dimension doesn't match in all input tensors.
+ Returns:
+ `Tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region
+ < `object_mask_threshold`.
+ """
+ if not (masks.shape[0] == scores.shape[0] == labels.shape[0]):
+ raise ValueError("mask, scores and labels must have the same shape!")
+
+ to_keep = labels.ne(num_labels) & (scores > object_mask_threshold)
+
+ return masks[to_keep], scores[to_keep], labels[to_keep]
+
+
+# Copied from transformers.models.detr.image_processing_detr.check_segment_validity
+def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
+ # Get the mask associated with the k class
+ mask_k = mask_labels == k
+ mask_k_area = mask_k.sum()
+
+ # Compute the area of all the stuff in query k
+ original_area = (mask_probs[k] >= mask_threshold).sum()
+ mask_exists = mask_k_area > 0 and original_area > 0
+
+ # Eliminate disconnected tiny segments
+ if mask_exists:
+ area_ratio = mask_k_area / original_area
+ if not area_ratio.item() > overlap_mask_area_threshold:
+ mask_exists = False
+
+ return mask_exists, mask_k
+
+
+# Copied from transformers.models.detr.image_processing_detr.compute_segments
+def compute_segments(
+ mask_probs,
+ pred_scores,
+ pred_labels,
+ mask_threshold: float = 0.5,
+ overlap_mask_area_threshold: float = 0.8,
+ label_ids_to_fuse: Optional[Set[int]] = None,
+ target_size: Tuple[int, int] = None,
+):
+ height = mask_probs.shape[1] if target_size is None else target_size[0]
+ width = mask_probs.shape[2] if target_size is None else target_size[1]
+
+ segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
+ segments: List[Dict] = []
+
+ if target_size is not None:
+ mask_probs = nn.functional.interpolate(
+ mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
+ )[0]
+
+ current_segment_id = 0
+
+ # Weigh each mask by its prediction score
+ mask_probs *= pred_scores.view(-1, 1, 1)
+ mask_labels = mask_probs.argmax(0) # [height, width]
+
+ # Keep track of instances of each class
+ stuff_memory_list: Dict[str, int] = {}
+ for k in range(pred_labels.shape[0]):
+ pred_class = pred_labels[k].item()
+ should_fuse = pred_class in label_ids_to_fuse
+
+ # Check if mask exists and large enough to be a segment
+ mask_exists, mask_k = check_segment_validity(
+ mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
+ )
+
+ if mask_exists:
+ if pred_class in stuff_memory_list:
+ current_segment_id = stuff_memory_list[pred_class]
+ else:
+ current_segment_id += 1
+
+ # Add current object segment to final segmentation map
+ segmentation[mask_k] = current_segment_id
+ segment_score = round(pred_scores[k].item(), 6)
+ segments.append(
+ {
+ "id": current_segment_id,
+ "label_id": pred_class,
+ "was_fused": should_fuse,
+ "score": segment_score,
+ }
+ )
+ if should_fuse:
+ stuff_memory_list[pred_class] = current_segment_id
+
+ return segmentation, segments
+
+
+@requires(backends=("torch", "vision"))
+class DeformableDetrImageProcessor(BaseImageProcessor):
+ r"""
+ Constructs a Deformable DETR image processor.
+
+ Args:
+ format (`str`, *optional*, defaults to `"coco_detection"`):
+ Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
+ overridden by the `do_resize` parameter in the `preprocess` method.
+ size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
+ Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+ in the `preprocess` method. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+ Resampling filter to use if resizing the image.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+ `do_rescale` parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+ `preprocess` method.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
+ `preprocess` method.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
+ Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
+ channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
+ Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
+ for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ do_convert_annotations (`bool`, *optional*, defaults to `True`):
+ Controls whether to convert the annotations to the format expected by the DETR model. Converts the
+ bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+ Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+ do_pad (`bool`, *optional*, defaults to `True`):
+ Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+ method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+ If `pad_size` is provided, the image will be padded to the specified dimensions.
+ Otherwise, the image will be padded to the maximum height and width of the batch.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+
+ model_input_names = ["pixel_values", "pixel_mask"]
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
+ def __init__(
+ self,
+ format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
+ do_resize: bool = True,
+ size: Dict[str, int] = None,
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = True,
+ image_mean: Union[float, List[float]] = None,
+ image_std: Union[float, List[float]] = None,
+ do_convert_annotations: Optional[bool] = None,
+ do_pad: bool = True,
+ pad_size: Optional[Dict[str, int]] = None,
+ **kwargs,
+ ) -> None:
+ if "pad_and_return_pixel_mask" in kwargs:
+ do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+ if "max_size" in kwargs:
+ logger.warning_once(
+ "The `max_size` parameter is deprecated and will be removed in v4.26. "
+ "Please specify in `size['longest_edge'] instead`.",
+ )
+ max_size = kwargs.pop("max_size")
+ else:
+ max_size = None if size is None else 1333
+
+ size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
+ size = get_size_dict(size, max_size=max_size, default_to_square=False)
+
+ # Backwards compatibility
+ if do_convert_annotations is None:
+ do_convert_annotations = do_normalize
+
+ super().__init__(**kwargs)
+ self.format = format
+ self.do_resize = do_resize
+ self.size = size
+ self.resample = resample
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_normalize = do_normalize
+ self.do_convert_annotations = do_convert_annotations
+ self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+ self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+ self.do_pad = do_pad
+ self.pad_size = pad_size
+ self._valid_processor_keys = [
+ "images",
+ "annotations",
+ "return_segmentation_masks",
+ "masks_path",
+ "do_resize",
+ "size",
+ "resample",
+ "do_rescale",
+ "rescale_factor",
+ "do_normalize",
+ "do_convert_annotations",
+ "image_mean",
+ "image_std",
+ "do_pad",
+ "pad_size",
+ "format",
+ "return_tensors",
+ "data_format",
+ "input_data_format",
+ ]
+
+ @classmethod
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->DeformableDetr
+ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
+ """
+ Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
+ created using from_dict and kwargs e.g. `DeformableDetrImageProcessor.from_pretrained(checkpoint, size=600,
+ max_size=800)`
+ """
+ image_processor_dict = image_processor_dict.copy()
+ if "max_size" in kwargs:
+ image_processor_dict["max_size"] = kwargs.pop("max_size")
+ if "pad_and_return_pixel_mask" in kwargs:
+ image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
+ return super().from_dict(image_processor_dict, **kwargs)
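+
+ # Illustrative usage (hedged sketch, not part of the original file): with the defaults above the processor
+ # resizes the shortest edge to 800 (longest edge capped at 1333), rescales by 1/255, normalizes with the
+ # ImageNet mean/std and pads the batch, e.g.
+ # processor = DeformableDetrImageProcessor()
+ # inputs = processor(images=image, annotations=coco_annotation, return_tensors="pt")
+ # which returns "pixel_values" and "pixel_mask" (and typically "labels" when annotations are given).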
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DeformableDetr
+ def prepare_annotation(
+ self,
+ image: np.ndarray,
+ target: Dict,
+ format: Optional[AnnotationFormat] = None,
+ return_segmentation_masks: Optional[bool] = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> Dict:
+ """
+ Prepare an annotation for feeding into DeformableDetr model.
+ """
+ format = format if format is not None else self.format
+
+ if format == AnnotationFormat.COCO_DETECTION:
+ return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
+ target = prepare_coco_detection_annotation(
+ image, target, return_segmentation_masks, input_data_format=input_data_format
+ )
+ elif format == AnnotationFormat.COCO_PANOPTIC:
+ return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
+ target = prepare_coco_panoptic_annotation(
+ image,
+ target,
+ masks_path=masks_path,
+ return_masks=return_segmentation_masks,
+ input_data_format=input_data_format,
+ )
+ else:
+ raise ValueError(f"Format {format} is not supported.")
+ return target
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize
+ def resize(
+ self,
+ image: np.ndarray,
+ size: Dict[str, int],
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
+ data_format: Optional[ChannelDimension] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
+ int, smaller edge of the image will be matched to this number.
+
+ Args:
+ image (`np.ndarray`):
+ Image to resize.
+ size (`Dict[str, int]`):
+ Size of the image's `(height, width)` dimensions after resizing. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+ Resampling filter to use if resizing the image.
+ data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
+ image is used.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred.
+ """
+ if "max_size" in kwargs:
+ logger.warning_once(
+ "The `max_size` parameter is deprecated and will be removed in v4.26. "
+ "Please specify in `size['longest_edge'] instead`.",
+ )
+ max_size = kwargs.pop("max_size")
+ else:
+ max_size = None
+ size = get_size_dict(size, max_size=max_size, default_to_square=False)
+ if "shortest_edge" in size and "longest_edge" in size:
+ new_size = get_resize_output_image_size(
+ image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
+ )
+ elif "max_height" in size and "max_width" in size:
+ new_size = get_image_size_for_max_height_width(
+ image, size["max_height"], size["max_width"], input_data_format=input_data_format
+ )
+ elif "height" in size and "width" in size:
+ new_size = (size["height"], size["width"])
+ else:
+ raise ValueError(
+ "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+ f" {size.keys()}."
+ )
+ image = resize(
+ image,
+ size=new_size,
+ resample=resample,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ **kwargs,
+ )
+ return image
+
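+    # Illustrative sketch of the three accepted `size` formats for `resize` above (assuming `img` is an
+    # `np.ndarray` and `processor` an instance of this class):
+    #
+    #     out = processor.resize(img, size={"height": 480, "width": 640})                 # exact size
+    #     out = processor.resize(img, size={"shortest_edge": 800, "longest_edge": 1333})  # aspect-preserving
+    #     out = processor.resize(img, size={"max_height": 800, "max_width": 800})         # bounded height/width
+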
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation
+ def resize_annotation(
+ self,
+ annotation,
+ orig_size,
+ size,
+ resample: PILImageResampling = PILImageResampling.NEAREST,
+ ) -> Dict:
+ """
+        Resize the annotation to match the resized image. If size is an int, the smaller edge of the mask will be
+        matched to this number.
+ """
+ return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
+ def rescale(
+ self,
+ image: np.ndarray,
+ rescale_factor: float,
+ data_format: Optional[Union[str, ChannelDimension]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> np.ndarray:
+ """
+ Rescale the image by the given factor. image = image * rescale_factor.
+
+ Args:
+ image (`np.ndarray`):
+ Image to rescale.
+ rescale_factor (`float`):
+ The value to use for rescaling.
+ data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
+ image is used. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ input_data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format for the input image. If unset, is inferred from the input image. Can be
+ one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ """
+ return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation
+ def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
+ """
+ Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
+ `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
+ """
+ return normalize_annotation(annotation, image_size=image_size)
+
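+    # Worked example for `normalize_annotation` above: on a 100 x 200 (height x width) image, the corner-format
+    # box [x_min, y_min, x_max, y_max] = [10, 20, 50, 60] becomes center format [30, 40, 40, 40] and, after
+    # dividing by [width, height, width, height] = [200, 100, 200, 100], the relative box [0.15, 0.4, 0.2, 0.4].
+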
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
+ def _update_annotation_for_padded_image(
+ self,
+ annotation: Dict,
+ input_image_size: Tuple[int, int],
+ output_image_size: Tuple[int, int],
+ padding,
+ update_bboxes,
+ ) -> Dict:
+ """
+ Update the annotation for a padded image.
+ """
+ new_annotation = {}
+ new_annotation["size"] = output_image_size
+
+ for key, value in annotation.items():
+ if key == "masks":
+ masks = value
+ masks = pad(
+ masks,
+ padding,
+ mode=PaddingMode.CONSTANT,
+ constant_values=0,
+ input_data_format=ChannelDimension.FIRST,
+ )
+ masks = safe_squeeze(masks, 1)
+ new_annotation["masks"] = masks
+ elif key == "boxes" and update_bboxes:
+ boxes = value
+ boxes *= np.asarray(
+ [
+ input_image_size[1] / output_image_size[1],
+ input_image_size[0] / output_image_size[0],
+ input_image_size[1] / output_image_size[1],
+ input_image_size[0] / output_image_size[0],
+ ]
+ )
+ new_annotation["boxes"] = boxes
+ elif key == "size":
+ new_annotation["size"] = output_image_size
+ else:
+ new_annotation[key] = value
+ return new_annotation
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
+ def _pad_image(
+ self,
+ image: np.ndarray,
+ output_size: Tuple[int, int],
+ annotation: Optional[Dict[str, Any]] = None,
+ constant_values: Union[float, Iterable[float]] = 0,
+ data_format: Optional[ChannelDimension] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ update_bboxes: bool = True,
+ ) -> np.ndarray:
+ """
+ Pad an image with zeros to the given size.
+ """
+ input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+ output_height, output_width = output_size
+
+ pad_bottom = output_height - input_height
+ pad_right = output_width - input_width
+ padding = ((0, pad_bottom), (0, pad_right))
+ padded_image = pad(
+ image,
+ padding,
+ mode=PaddingMode.CONSTANT,
+ constant_values=constant_values,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ )
+ if annotation is not None:
+ annotation = self._update_annotation_for_padded_image(
+ annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
+ )
+ return padded_image, annotation
+
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
+ def pad(
+ self,
+ images: List[np.ndarray],
+ annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+ constant_values: Union[float, Iterable[float]] = 0,
+ return_pixel_mask: bool = True,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ data_format: Optional[ChannelDimension] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ update_bboxes: bool = True,
+ pad_size: Optional[Dict[str, int]] = None,
+ ) -> BatchFeature:
+ """
+        Pads a batch of images at the bottom and right with zeros, up to the size of the largest height and width in
+        the batch, and optionally returns their corresponding pixel mask.
+
+ Args:
+ images (List[`np.ndarray`]):
+ Images to pad.
+ annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
+ Annotations to transform according to the padding that is applied to the images.
+ constant_values (`float` or `Iterable[float]`, *optional*):
+ The value to use for the padding if `mode` is `"constant"`.
+ return_pixel_mask (`bool`, *optional*, defaults to `True`):
+ Whether to return a pixel mask.
+ return_tensors (`str` or `TensorType`, *optional*):
+ The type of tensors to return. Can be one of:
+ - Unset: Return a list of `np.ndarray`.
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format of the image. If not provided, it will be the same as the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred.
+ update_bboxes (`bool`, *optional*, defaults to `True`):
+ Whether to update the bounding boxes in the annotations to match the padded images. If the
+                bounding boxes have not been converted to relative coordinates and `(center_x, center_y, width, height)`
+ format, the bounding boxes will not be updated.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+ pad_size = pad_size if pad_size is not None else self.pad_size
+ if pad_size is not None:
+ padded_size = (pad_size["height"], pad_size["width"])
+ else:
+ padded_size = get_max_height_width(images, input_data_format=input_data_format)
+
+ annotation_list = annotations if annotations is not None else [None] * len(images)
+ padded_images = []
+ padded_annotations = []
+ for image, annotation in zip(images, annotation_list):
+ padded_image, padded_annotation = self._pad_image(
+ image,
+ padded_size,
+ annotation,
+ constant_values=constant_values,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ update_bboxes=update_bboxes,
+ )
+ padded_images.append(padded_image)
+ padded_annotations.append(padded_annotation)
+
+ data = {"pixel_values": padded_images}
+
+ if return_pixel_mask:
+ masks = [
+ make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
+ for image in images
+ ]
+ data["pixel_mask"] = masks
+
+ encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+
+ if annotations is not None:
+ encoded_inputs["labels"] = [
+ BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
+ ]
+
+ return encoded_inputs
+
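+    # Usage sketch for `pad` above (illustrative; assumes `processor` is an instance of this class and
+    # `images` is a list of channels-first `np.ndarray`s that were already resized, rescaled and normalized):
+    #
+    #     batch = processor.pad(images, return_pixel_mask=True, return_tensors="pt",
+    #                           pad_size={"height": 800, "width": 1333})
+    #     batch["pixel_values"]  # (batch_size, num_channels, 800, 1333)
+    #     batch["pixel_mask"]    # (batch_size, 800, 1333); 1 marks real pixels, 0 marks padding
+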
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
+ def preprocess(
+ self,
+ images: ImageInput,
+ annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+ return_segmentation_masks: Optional[bool] = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ do_resize: Optional[bool] = None,
+ size: Optional[Dict[str, int]] = None,
+ resample=None, # PILImageResampling
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[Union[int, float]] = None,
+ do_normalize: Optional[bool] = None,
+ do_convert_annotations: Optional[bool] = None,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ do_pad: Optional[bool] = None,
+ format: Optional[Union[str, AnnotationFormat]] = None,
+ return_tensors: Optional[Union[TensorType, str]] = None,
+ data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ pad_size: Optional[Dict[str, int]] = None,
+ **kwargs,
+ ) -> BatchFeature:
+ """
+ Preprocess an image or a batch of images so that it can be used by the model.
+
+ Args:
+ images (`ImageInput`):
+ Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
+ from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+ annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
+ List of annotations associated with the image or batch of images. If annotation is for object
+ detection, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
+ dictionary. An image can have no annotations, in which case the list should be empty.
+ If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
+ An image can have no segments, in which case the list should be empty.
+ - "file_name" (`str`): The file name of the image.
+ return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks):
+ Whether to return segmentation masks.
+ masks_path (`str` or `pathlib.Path`, *optional*):
+ Path to the directory containing the segmentation masks.
+ do_resize (`bool`, *optional*, defaults to self.do_resize):
+ Whether to resize the image.
+ size (`Dict[str, int]`, *optional*, defaults to self.size):
+ Size of the image's `(height, width)` dimensions after resizing. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling`, *optional*, defaults to self.resample):
+ Resampling filter to use when resizing the image.
+ do_rescale (`bool`, *optional*, defaults to self.do_rescale):
+ Whether to rescale the image.
+ rescale_factor (`float`, *optional*, defaults to self.rescale_factor):
+ Rescale factor to use when rescaling the image.
+ do_normalize (`bool`, *optional*, defaults to self.do_normalize):
+ Whether to normalize the image.
+ do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+ Whether to convert the annotations to the format expected by the model. Converts the bounding
+ boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+ and in relative coordinates.
+ image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
+ Mean to use when normalizing the image.
+ image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
+ Standard deviation to use when normalizing the image.
+ do_pad (`bool`, *optional*, defaults to self.do_pad):
+ Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+ the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+ dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
+ format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
+ Format of the annotations.
+ return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
+ Type of tensors to return. If `None`, will return the list of images.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+ if "pad_and_return_pixel_mask" in kwargs:
+ logger.warning_once(
+ "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
+ "use `do_pad` instead."
+ )
+ do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+ if "max_size" in kwargs:
+ logger.warning_once(
+ "The `max_size` argument is deprecated and will be removed in a future version, use"
+ " `size['longest_edge']` instead."
+ )
+ size = kwargs.pop("max_size")
+
+ do_resize = self.do_resize if do_resize is None else do_resize
+ size = self.size if size is None else size
+ size = get_size_dict(size=size, default_to_square=False)
+ resample = self.resample if resample is None else resample
+ do_rescale = self.do_rescale if do_rescale is None else do_rescale
+ rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
+ do_normalize = self.do_normalize if do_normalize is None else do_normalize
+ image_mean = self.image_mean if image_mean is None else image_mean
+ image_std = self.image_std if image_std is None else image_std
+ do_convert_annotations = (
+ self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
+ )
+ do_pad = self.do_pad if do_pad is None else do_pad
+ pad_size = self.pad_size if pad_size is None else pad_size
+ format = self.format if format is None else format
+
+ images = make_list_of_images(images)
+
+ if not valid_images(images):
+ raise ValueError(
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+ "torch.Tensor, tf.Tensor or jax.ndarray."
+ )
+ validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+        # Here, the pad() method pads to the maximum (width, height) in the batch, so it does not need to be validated.
+ validate_preprocess_arguments(
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ )
+
+ if annotations is not None and isinstance(annotations, dict):
+ annotations = [annotations]
+
+ if annotations is not None and len(images) != len(annotations):
+ raise ValueError(
+ f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+ )
+
+ format = AnnotationFormat(format)
+ if annotations is not None:
+ validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+ if (
+ masks_path is not None
+ and format == AnnotationFormat.COCO_PANOPTIC
+ and not isinstance(masks_path, (pathlib.Path, str))
+ ):
+ raise ValueError(
+ "The path to the directory containing the mask PNG files should be provided as a"
+ f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
+ )
+
+ # All transformations expect numpy arrays
+ images = [to_numpy_array(image) for image in images]
+
+ if do_rescale and is_scaled_image(images[0]):
+ logger.warning_once(
+ "It looks like you are trying to rescale already rescaled images. If the input"
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+ )
+
+ if input_data_format is None:
+ # We assume that all images have the same channel dimension format.
+ input_data_format = infer_channel_dimension_format(images[0])
+
+ # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+ if annotations is not None:
+ prepared_images = []
+ prepared_annotations = []
+ for image, target in zip(images, annotations):
+ target = self.prepare_annotation(
+ image,
+ target,
+ format,
+ return_segmentation_masks=return_segmentation_masks,
+ masks_path=masks_path,
+ input_data_format=input_data_format,
+ )
+ prepared_images.append(image)
+ prepared_annotations.append(target)
+ images = prepared_images
+ annotations = prepared_annotations
+ del prepared_images, prepared_annotations
+
+ # transformations
+ if do_resize:
+ if annotations is not None:
+ resized_images, resized_annotations = [], []
+ for image, target in zip(images, annotations):
+ orig_size = get_image_size(image, input_data_format)
+ resized_image = self.resize(
+ image, size=size, resample=resample, input_data_format=input_data_format
+ )
+ resized_annotation = self.resize_annotation(
+ target, orig_size, get_image_size(resized_image, input_data_format)
+ )
+ resized_images.append(resized_image)
+ resized_annotations.append(resized_annotation)
+ images = resized_images
+ annotations = resized_annotations
+ del resized_images, resized_annotations
+ else:
+ images = [
+ self.resize(image, size=size, resample=resample, input_data_format=input_data_format)
+ for image in images
+ ]
+
+ if do_rescale:
+ images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
+
+ if do_normalize:
+ images = [
+ self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
+ ]
+
+ if do_convert_annotations and annotations is not None:
+ annotations = [
+ self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+ for annotation, image in zip(annotations, images)
+ ]
+
+ if do_pad:
+ # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+ encoded_inputs = self.pad(
+ images,
+ annotations=annotations,
+ return_pixel_mask=True,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ update_bboxes=do_convert_annotations,
+ return_tensors=return_tensors,
+ pad_size=pad_size,
+ )
+ else:
+ images = [
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+ for image in images
+ ]
+ encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+ if annotations is not None:
+ encoded_inputs["labels"] = [
+ BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+ ]
+
+ return encoded_inputs
+
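+    # End-to-end usage sketch for `preprocess` above (illustrative; assumes `processor` is an instance of this
+    # class, `image` a PIL image, and the annotation follows the COCO detection layout documented above):
+    #
+    #     annotation = {"image_id": 0, "annotations": [
+    #         {"bbox": [10, 20, 40, 40], "category_id": 1, "area": 1600, "iscrowd": 0}]}
+    #     inputs = processor(images=image, annotations=annotation, return_tensors="pt")
+    #     # inputs holds "pixel_values", "pixel_mask" and "labels" (one BatchFeature per image).
+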
+ # POSTPROCESSING METHODS - TODO: add support for other frameworks
+ def post_process(self, outputs, target_sizes):
+ """
+ Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
+ top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+ Args:
+ outputs ([`DeformableDetrObjectDetectionOutput`]):
+ Raw outputs of the model.
+ target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+ Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
+ original image size (before any data augmentation). For visualization, this should be the image size
+                after data augmentation, but before padding.
+ Returns:
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+ in the batch as predicted by the model.
+ """
+ logger.warning_once(
+ "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
+ " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
+ )
+
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+ if len(out_logits) != len(target_sizes):
+ raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+ if target_sizes.shape[1] != 2:
+ raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+ prob = out_logits.sigmoid()
+ topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
+ scores = topk_values
+ topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
+ labels = topk_indexes % out_logits.shape[2]
+ boxes = center_to_corners_format(out_bbox)
+ boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+ # and from relative [0, 1] to absolute [0, height] coordinates
+ img_h, img_w = target_sizes.unbind(1)
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
+ boxes = boxes * scale_fct[:, None, :]
+
+ results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
+
+ return results
+
+ def post_process_object_detection(
+ self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
+ ):
+ """
+ Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
+ top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+ Args:
+ outputs ([`DetrObjectDetectionOutput`]):
+ Raw outputs of the model.
+ threshold (`float`, *optional*):
+ Score threshold to keep object detection predictions.
+ target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+ Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+ (height, width) of each image in the batch. If left to None, predictions will not be resized.
+ top_k (`int`, *optional*, defaults to 100):
+ Keep only top k bounding boxes before filtering by thresholding.
+
+ Returns:
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+ in the batch as predicted by the model.
+ """
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+ if target_sizes is not None:
+ if len(out_logits) != len(target_sizes):
+ raise ValueError(
+ "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+ )
+
+ prob = out_logits.sigmoid()
+ prob = prob.view(out_logits.shape[0], -1)
+ k_value = min(top_k, prob.size(1))
+ topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
+ scores = topk_values
+ topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
+ labels = topk_indexes % out_logits.shape[2]
+ boxes = center_to_corners_format(out_bbox)
+ boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+ # and from relative [0, 1] to absolute [0, height] coordinates
+ if target_sizes is not None:
+ if isinstance(target_sizes, List):
+ img_h = torch.Tensor([i[0] for i in target_sizes])
+ img_w = torch.Tensor([i[1] for i in target_sizes])
+ else:
+ img_h, img_w = target_sizes.unbind(1)
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+ boxes = boxes * scale_fct[:, None, :]
+
+ results = []
+ for s, l, b in zip(scores, labels, boxes):
+ score = s[s > threshold]
+ label = l[s > threshold]
+ box = b[s > threshold]
+ results.append({"scores": score, "labels": label, "boxes": box})
+
+ return results
+
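+# Usage sketch for the post-processing above (illustrative; `processor` is an instance of the image processor,
+# `outputs` are raw `DeformableDetrForObjectDetection` outputs and `image` a PIL image):
+#
+#     target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
+#     results = processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
+#     # results["scores"], results["labels"] and results["boxes"] hold the detections kept for the image.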
+
+__all__ = ["DeformableDetrImageProcessor"]
diff --git a/docs/transformers/build/lib/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py b/docs/transformers/build/lib/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a820e07f72f650e7a490f2566a65bc0ea6447a0
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py
@@ -0,0 +1,845 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/deformable_detr/modular_deformable_detr.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_deformable_detr.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import pathlib
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from ...image_processing_utils import BatchFeature, get_size_dict
+from ...image_processing_utils_fast import (
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
+ BaseImageProcessorFast,
+ DefaultFastImageProcessorKwargs,
+ SizeDict,
+ get_image_size_for_max_height_width,
+ get_max_height_width,
+ safe_squeeze,
+)
+from ...image_transforms import center_to_corners_format, corners_to_center_format
+from ...image_utils import (
+ IMAGENET_DEFAULT_MEAN,
+ IMAGENET_DEFAULT_STD,
+ AnnotationFormat,
+ AnnotationType,
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ get_image_size,
+ validate_annotations,
+)
+from ...processing_utils import Unpack
+from ...utils import (
+ TensorType,
+ add_start_docstrings,
+ is_torch_available,
+ is_torchvision_available,
+ is_torchvision_v2_available,
+ logging,
+)
+from ...utils.import_utils import requires
+from .image_processing_deformable_detr import get_size_with_aspect_ratio
+
+
+if is_torch_available():
+ import torch
+
+
+if is_torchvision_v2_available():
+ from torchvision.io import read_image
+ from torchvision.transforms.v2 import functional as F
+elif is_torchvision_available():
+ from torchvision.io import read_image
+ from torchvision.transforms import functional as F
+
+
+logger = logging.get_logger(__name__)
+
+
+class DeformableDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+ format: Optional[Union[str, AnnotationFormat]]
+ do_convert_annotations: Optional[bool]
+ do_pad: Optional[bool]
+ pad_size: Optional[Dict[str, int]]
+ return_segmentation_masks: Optional[bool]
+
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+# inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L33
+def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
+ """
+ Convert a COCO polygon annotation to a mask.
+
+ Args:
+ segmentations (`List[List[float]]`):
+ List of polygons, each polygon represented by a list of x-y coordinates.
+ height (`int`):
+ Height of the mask.
+        width (`int`):
+            Width of the mask.
+        device (`torch.device`):
+            Device on which to create the mask tensors.
+    """
+ try:
+ from pycocotools import mask as coco_mask
+ except ImportError:
+ raise ImportError("Pycocotools is not installed in your environment.")
+
+ masks = []
+ for polygons in segmentations:
+ rles = coco_mask.frPyObjects(polygons, height, width)
+ mask = coco_mask.decode(rles)
+ if len(mask.shape) < 3:
+ mask = mask[..., None]
+ mask = torch.as_tensor(mask, dtype=torch.uint8, device=device)
+ mask = torch.any(mask, axis=2)
+ masks.append(mask)
+ if masks:
+ masks = torch.stack(masks, axis=0)
+ else:
+ masks = torch.zeros((0, height, width), dtype=torch.uint8, device=device)
+
+ return masks
+
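+# Illustrative call of `convert_coco_poly_to_mask` above (assumes pycocotools is installed and polygons are
+# COCO-style flat [x0, y0, x1, y1, ...] lists):
+#
+#     segs = [[[10.0, 10.0, 60.0, 10.0, 60.0, 40.0, 10.0, 40.0]]]  # one object with one polygon
+#     masks = convert_coco_poly_to_mask(segs, height=100, width=100, device=torch.device("cpu"))
+#     masks.shape  # (1, 100, 100)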
+
+# inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L50
+def prepare_coco_detection_annotation(
+ image,
+ target,
+ return_segmentation_masks: bool = False,
+ input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+ """
+ Convert the target in COCO format into the format expected by DEFORMABLE_DETR.
+ """
+ image_height, image_width = image.size()[-2:]
+
+ image_id = target["image_id"]
+ image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device)
+
+ # Get all COCO annotations for the given image.
+ annotations = target["annotations"]
+ classes = []
+ area = []
+ boxes = []
+ keypoints = []
+ for obj in annotations:
+ if "iscrowd" not in obj or obj["iscrowd"] == 0:
+ classes.append(obj["category_id"])
+ area.append(obj["area"])
+ boxes.append(obj["bbox"])
+ if "keypoints" in obj:
+ keypoints.append(obj["keypoints"])
+
+ classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device)
+ area = torch.as_tensor(area, dtype=torch.float32, device=image.device)
+ iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device)
+ # guard against no boxes via resizing
+ boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4)
+ boxes[:, 2:] += boxes[:, :2]
+ boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+ boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+ new_target = {
+ "image_id": image_id,
+ "class_labels": classes[keep],
+ "boxes": boxes[keep],
+ "area": area[keep],
+ "iscrowd": iscrowd[keep],
+ "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device),
+ }
+
+ if keypoints:
+ keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device)
+ # Apply the keep mask here to filter the relevant annotations
+ keypoints = keypoints[keep]
+ num_keypoints = keypoints.shape[0]
+ keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+ new_target["keypoints"] = keypoints
+
+ if return_segmentation_masks:
+ segmentation_masks = [obj["segmentation"] for obj in annotations]
+ masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=image.device)
+ new_target["masks"] = masks[keep]
+
+ return new_target
+
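+# Sketch of the output of `prepare_coco_detection_annotation` above (illustrative): for an image tensor of
+# shape (3, H, W) and a COCO-style `target`, the returned dict holds "class_labels" (num_objects,),
+# "boxes" (num_objects, 4) in corner format clipped to the image, "area", "iscrowd", "orig_size" and,
+# optionally, "keypoints" and "masks", all created on the image's device.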
+
+def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
+ """
+ Compute the bounding boxes around the provided panoptic segmentation masks.
+
+ Args:
+ masks: masks in format `[number_masks, height, width]` where N is the number of masks
+
+ Returns:
+ boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
+ """
+ if masks.numel() == 0:
+ return torch.zeros((0, 4), device=masks.device)
+
+ h, w = masks.shape[-2:]
+ y = torch.arange(0, h, dtype=torch.float32, device=masks.device)
+ x = torch.arange(0, w, dtype=torch.float32, device=masks.device)
+ # see https://github.com/pytorch/pytorch/issues/50276
+ y, x = torch.meshgrid(y, x, indexing="ij")
+
+ x_mask = masks * torch.unsqueeze(x, 0)
+ x_max = x_mask.view(x_mask.shape[0], -1).max(-1)[0]
+ x_min = (
+ torch.where(masks, x.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0]
+ )
+
+ y_mask = masks * torch.unsqueeze(y, 0)
+ y_max = y_mask.view(y_mask.shape[0], -1).max(-1)[0]
+ y_min = (
+ torch.where(masks, y.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0]
+ )
+
+ return torch.stack([x_min, y_min, x_max, y_max], 1)
+
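+# Worked example for `masks_to_boxes` above: for a single 10 x 10 boolean mask that is True on rows 2-4 and
+# columns 3-6, the returned xyxy box is [3., 2., 6., 4.] (coordinates of the extreme True pixels).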
+
+# 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
+# Copyright (c) 2018, Alexander Kirillov
+# All rights reserved.
+def rgb_to_id(color):
+ """
+ Converts RGB color to unique ID.
+ """
+ if isinstance(color, torch.Tensor) and len(color.shape) == 3:
+ if color.dtype == torch.uint8:
+ color = color.to(torch.int32)
+ return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2]
+ return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
+
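+# Worked example for `rgb_to_id` above: the color (r, g, b) = (10, 20, 30) maps to
+# 10 + 256 * 20 + 256 * 256 * 30 = 1971210, the unique segment id encoded in COCO panoptic PNGs.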
+
+def prepare_coco_panoptic_annotation(
+ image: torch.Tensor,
+ target: Dict,
+ masks_path: Union[str, pathlib.Path],
+ return_masks: bool = True,
+ input_data_format: Union[ChannelDimension, str] = None,
+) -> Dict:
+ """
+ Prepare a coco panoptic annotation for DEFORMABLE_DETR.
+ """
+ image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+ annotation_path = pathlib.Path(masks_path) / target["file_name"]
+
+ new_target = {}
+ new_target["image_id"] = torch.as_tensor(
+ [target["image_id"] if "image_id" in target else target["id"]], dtype=torch.int64, device=image.device
+ )
+ new_target["size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device)
+ new_target["orig_size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device)
+
+ if "segments_info" in target:
+ masks = read_image(annotation_path).permute(1, 2, 0).to(dtype=torch.int32, device=image.device)
+ masks = rgb_to_id(masks)
+
+ ids = torch.as_tensor([segment_info["id"] for segment_info in target["segments_info"]], device=image.device)
+ masks = masks == ids[:, None, None]
+ masks = masks.to(torch.bool)
+ if return_masks:
+ new_target["masks"] = masks
+ new_target["boxes"] = masks_to_boxes(masks)
+ new_target["class_labels"] = torch.as_tensor(
+ [segment_info["category_id"] for segment_info in target["segments_info"]],
+ dtype=torch.int64,
+ device=image.device,
+ )
+ new_target["iscrowd"] = torch.as_tensor(
+ [segment_info["iscrowd"] for segment_info in target["segments_info"]],
+ dtype=torch.int64,
+ device=image.device,
+ )
+ new_target["area"] = torch.as_tensor(
+ [segment_info["area"] for segment_info in target["segments_info"]],
+ dtype=torch.float32,
+ device=image.device,
+ )
+
+ return new_target
+
+
+@add_start_docstrings(
+ "Constructs a fast DeformableDetr image processor.",
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
+ """
+ format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
+ Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+ do_convert_annotations (`bool`, *optional*, defaults to `True`):
+ Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the
+ bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+ Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+ do_pad (`bool`, *optional*, defaults to `True`):
+ Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+ method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+ If `pad_size` is provided, the image will be padded to the specified dimensions.
+ Otherwise, the image will be padded to the maximum height and width of the batch.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+ Whether to return segmentation masks.
+ """,
+)
+@requires(backends=("torchvision", "torch"))
+class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
+ resample = PILImageResampling.BILINEAR
+ image_mean = IMAGENET_DEFAULT_MEAN
+ image_std = IMAGENET_DEFAULT_STD
+ format = AnnotationFormat.COCO_DETECTION
+ do_resize = True
+ do_rescale = True
+ do_normalize = True
+ do_pad = True
+ size = {"shortest_edge": 800, "longest_edge": 1333}
+ default_to_square = False
+ model_input_names = ["pixel_values", "pixel_mask"]
+ valid_kwargs = DeformableDetrFastImageProcessorKwargs
+
+ def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs]) -> None:
+ if "pad_and_return_pixel_mask" in kwargs:
+ kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
+
+ size = kwargs.pop("size", None)
+ if "max_size" in kwargs:
+ logger.warning_once(
+ "The `max_size` parameter is deprecated and will be removed in v4.26. "
+ "Please specify in `size['longest_edge'] instead`.",
+ )
+ max_size = kwargs.pop("max_size")
+ else:
+ max_size = None if size is None else 1333
+
+ size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
+ self.size = get_size_dict(size, max_size=max_size, default_to_square=False)
+
+ # Backwards compatibility
+ do_convert_annotations = kwargs.get("do_convert_annotations", None)
+ do_normalize = kwargs.get("do_normalize", None)
+ if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
+ self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
+
+ super().__init__(**kwargs)
+
+ @classmethod
+ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
+ """
+ Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
+ created using from_dict and kwargs e.g. `DeformableDetrImageProcessorFast.from_pretrained(checkpoint, size=600,
+ max_size=800)`
+ """
+ image_processor_dict = image_processor_dict.copy()
+ if "max_size" in kwargs:
+ image_processor_dict["max_size"] = kwargs.pop("max_size")
+ if "pad_and_return_pixel_mask" in kwargs:
+ image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
+ return super().from_dict(image_processor_dict, **kwargs)
+
+ def prepare_annotation(
+ self,
+ image: torch.Tensor,
+ target: Dict,
+ format: Optional[AnnotationFormat] = None,
+ return_segmentation_masks: Optional[bool] = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> Dict:
+ """
+ Prepare an annotation for feeding into DEFORMABLE_DETR model.
+ """
+ format = format if format is not None else self.format
+
+ if format == AnnotationFormat.COCO_DETECTION:
+ return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
+ target = prepare_coco_detection_annotation(
+ image, target, return_segmentation_masks, input_data_format=input_data_format
+ )
+ elif format == AnnotationFormat.COCO_PANOPTIC:
+ return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
+ target = prepare_coco_panoptic_annotation(
+ image,
+ target,
+ masks_path=masks_path,
+ return_masks=return_segmentation_masks,
+ input_data_format=input_data_format,
+ )
+ else:
+ raise ValueError(f"Format {format} is not supported.")
+ return target
+
+ def resize(
+ self,
+ image: torch.Tensor,
+ size: SizeDict,
+ interpolation: "F.InterpolationMode" = None,
+ **kwargs,
+ ) -> torch.Tensor:
+ """
+        Resize the image to the given size. Size can be `min_size` (scalar) or a `(height, width)` tuple. If size is
+        an int, the smaller edge of the image will be matched to this number.
+
+ Args:
+ image (`torch.Tensor`):
+ Image to resize.
+ size (`SizeDict`):
+ Size of the image's `(height, width)` dimensions after resizing. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
+ Resampling filter to use if resizing the image.
+ """
+ interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
+ if size.shortest_edge and size.longest_edge:
+ # Resize the image so that the shortest edge or the longest edge is of the given size
+ # while maintaining the aspect ratio of the original image.
+ new_size = get_size_with_aspect_ratio(
+ image.size()[-2:],
+ size["shortest_edge"],
+ size["longest_edge"],
+ )
+ elif size.max_height and size.max_width:
+ new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"])
+ elif size.height and size.width:
+ new_size = (size["height"], size["width"])
+ else:
+ raise ValueError(
+ "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+ f" {size.keys()}."
+ )
+
+ image = F.resize(
+ image,
+ size=new_size,
+ interpolation=interpolation,
+ **kwargs,
+ )
+ return image
+
+ def resize_annotation(
+ self,
+ annotation: Dict[str, Any],
+ orig_size: Tuple[int, int],
+ target_size: Tuple[int, int],
+ threshold: float = 0.5,
+ interpolation: "F.InterpolationMode" = None,
+ ):
+ """
+ Resizes an annotation to a target size.
+
+ Args:
+ annotation (`Dict[str, Any]`):
+ The annotation dictionary.
+ orig_size (`Tuple[int, int]`):
+ The original size of the input image.
+ target_size (`Tuple[int, int]`):
+ The target size of the image, as returned by the preprocessing `resize` step.
+ threshold (`float`, *optional*, defaults to 0.5):
+ The threshold used to binarize the segmentation masks.
+            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.NEAREST`):
+                The resampling filter to use when resizing the masks.
+ """
+ interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST
+ ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
+
+ new_annotation = {}
+ new_annotation["size"] = target_size
+
+ for key, value in annotation.items():
+ if key == "boxes":
+ boxes = value
+ scaled_boxes = boxes * torch.as_tensor(
+ [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device
+ )
+ new_annotation["boxes"] = scaled_boxes
+ elif key == "area":
+ area = value
+ scaled_area = area * (ratio_width * ratio_height)
+ new_annotation["area"] = scaled_area
+ elif key == "masks":
+ masks = value[:, None]
+ masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks]
+ masks = torch.stack(masks).to(torch.float32)
+ masks = masks[:, 0] > threshold
+ new_annotation["masks"] = masks
+ elif key == "size":
+ new_annotation["size"] = target_size
+ else:
+ new_annotation[key] = value
+
+ return new_annotation
+
+ def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
+ image_height, image_width = image_size
+ norm_annotation = {}
+ for key, value in annotation.items():
+ if key == "boxes":
+ boxes = value
+ boxes = corners_to_center_format(boxes)
+ boxes /= torch.as_tensor(
+ [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device
+ )
+ norm_annotation[key] = boxes
+ else:
+ norm_annotation[key] = value
+ return norm_annotation
+
+ def _update_annotation_for_padded_image(
+ self,
+ annotation: Dict,
+ input_image_size: Tuple[int, int],
+ output_image_size: Tuple[int, int],
+ padding,
+ update_bboxes,
+ ) -> Dict:
+ """
+ Update the annotation for a padded image.
+ """
+ new_annotation = {}
+ new_annotation["size"] = output_image_size
+ ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size))
+
+ for key, value in annotation.items():
+ if key == "masks":
+ masks = value
+ masks = F.pad(
+ masks,
+ padding,
+ fill=0,
+ )
+ masks = safe_squeeze(masks, 1)
+ new_annotation["masks"] = masks
+ elif key == "boxes" and update_bboxes:
+ boxes = value
+ boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device)
+ new_annotation["boxes"] = boxes
+ elif key == "size":
+ new_annotation["size"] = output_image_size
+ else:
+ new_annotation[key] = value
+ return new_annotation
+
+ def pad(
+ self,
+ image: torch.Tensor,
+ padded_size: Tuple[int, int],
+ annotation: Optional[Dict[str, Any]] = None,
+ update_bboxes: bool = True,
+ fill: int = 0,
+ ):
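+        """
+        Pad a single image (and, if provided, its annotation) on the bottom and right with `fill` up to
+        `padded_size`, returning the padded image, a pixel mask marking the valid (non-padded) region, and the
+        updated annotation.
+        """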
+ original_size = image.size()[-2:]
+ padding_bottom = padded_size[0] - original_size[0]
+ padding_right = padded_size[1] - original_size[1]
+ if padding_bottom < 0 or padding_right < 0:
+ raise ValueError(
+ f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
+ f"original size. Got padded size: {padded_size}, original size: {original_size}."
+ )
+ if original_size != padded_size:
+ padding = [0, 0, padding_right, padding_bottom]
+ image = F.pad(image, padding, fill=fill)
+ if annotation is not None:
+ annotation = self._update_annotation_for_padded_image(
+ annotation, original_size, padded_size, padding, update_bboxes
+ )
+
+ # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+ pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device)
+ pixel_mask[: original_size[0], : original_size[1]] = 1
+
+ return image, pixel_mask, annotation
+
+ @add_start_docstrings(
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
+ """
+ annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
+ List of annotations associated with the image or batch of images. If annotation is for object
+ detection, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
+ dictionary. An image can have no annotations, in which case the list should be empty.
+ If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
+ An image can have no segments, in which case the list should be empty.
+ - "file_name" (`str`): The file name of the image.
+ format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
+ Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+ do_convert_annotations (`bool`, *optional*, defaults to `True`):
+ Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the
+ bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+ Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+ do_pad (`bool`, *optional*, defaults to `True`):
+ Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+ method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+ If `pad_size` is provided, the image will be padded to the specified dimensions.
+ Otherwise, the image will be padded to the maximum height and width of the batch.
+ pad_size (`Dict[str, int]`, *optional*):
+ The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+ Whether to return segmentation masks.
+ masks_path (`str` or `pathlib.Path`, *optional*):
+ Path to the directory containing the segmentation masks.
+ """,
+ )
+ def preprocess(
+ self,
+ images: ImageInput,
+ annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs],
+ ) -> BatchFeature:
+ if "pad_and_return_pixel_mask" in kwargs:
+ kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
+ logger.warning_once(
+ "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
+ "use `do_pad` instead."
+ )
+
+ if "max_size" in kwargs:
+ logger.warning_once(
+ "The `max_size` argument is deprecated and will be removed in a future version, use"
+ " `size['longest_edge']` instead."
+ )
+ kwargs["size"] = kwargs.pop("max_size")
+
+ return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)
+
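+    # Usage sketch for the fast processor (illustrative; "path/to/checkpoint" is a placeholder and `image`
+    # a PIL image, numpy array or torch tensor):
+    #
+    #     processor = DeformableDetrImageProcessorFast.from_pretrained("path/to/checkpoint")
+    #     inputs = processor(images=image, return_tensors="pt")
+    #     # inputs holds batched "pixel_values" and "pixel_mask" tensors ready for the model.
+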
+ def _preprocess(
+ self,
+ images: List["torch.Tensor"],
+ annotations: Optional[Union[AnnotationType, List[AnnotationType]]],
+ return_segmentation_masks: bool,
+ masks_path: Optional[Union[str, pathlib.Path]],
+ do_resize: bool,
+ size: SizeDict,
+ interpolation: Optional["F.InterpolationMode"],
+ do_center_crop: bool,
+ crop_size: SizeDict,
+ do_rescale: bool,
+ rescale_factor: float,
+ do_normalize: bool,
+ do_convert_annotations: bool,
+ image_mean: Optional[Union[float, List[float]]],
+ image_std: Optional[Union[float, List[float]]],
+ do_pad: bool,
+ pad_size: Optional[Dict[str, int]],
+ format: Optional[Union[str, AnnotationFormat]],
+ return_tensors: Optional[Union[str, TensorType]],
+ ) -> BatchFeature:
+ """
+ Preprocess an image or a batch of images so that it can be used by the model.
+ """
+ if annotations is not None and isinstance(annotations, dict):
+ annotations = [annotations]
+
+ if annotations is not None and len(images) != len(annotations):
+ raise ValueError(
+ f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+ )
+
+ format = AnnotationFormat(format)
+ if annotations is not None:
+ validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+ if (
+ masks_path is not None
+ and format == AnnotationFormat.COCO_PANOPTIC
+ and not isinstance(masks_path, (pathlib.Path, str))
+ ):
+ raise ValueError(
+ "The path to the directory containing the mask PNG files should be provided as a"
+ f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
+ )
+
+ data = {}
+
+ processed_images = []
+ processed_annotations = []
+ pixel_masks = [] # Initialize pixel_masks here
+ for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+ # prepare (COCO annotations as a list of Dict -> DEFORMABLE_DETR target as a single Dict per image)
+ if annotations is not None:
+ annotation = self.prepare_annotation(
+ image,
+ annotation,
+ format,
+ return_segmentation_masks=return_segmentation_masks,
+ masks_path=masks_path,
+ input_data_format=ChannelDimension.FIRST,
+ )
+
+ if do_resize:
+ resized_image = self.resize(image, size=size, interpolation=interpolation)
+ if annotations is not None:
+ annotation = self.resize_annotation(
+ annotation,
+ orig_size=image.size()[-2:],
+ target_size=resized_image.size()[-2:],
+ )
+ image = resized_image
+ # Fused rescale and normalize
+ image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
+ if do_convert_annotations and annotations is not None:
+ annotation = self.normalize_annotation(annotation, get_image_size(image, ChannelDimension.FIRST))
+
+ processed_images.append(image)
+ processed_annotations.append(annotation)
+ images = processed_images
+ annotations = processed_annotations if annotations is not None else None
+
+ if do_pad:
+ # depends on all resized image shapes so we need another loop
+ if pad_size is not None:
+ padded_size = (pad_size["height"], pad_size["width"])
+ else:
+ padded_size = get_max_height_width(images)
+
+ padded_images = []
+ padded_annotations = []
+ for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+ # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+ if padded_size == image.size()[-2:]:
+ padded_images.append(image)
+ pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
+ padded_annotations.append(annotation)
+ continue
+ image, pixel_mask, annotation = self.pad(
+ image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
+ )
+ padded_images.append(image)
+ padded_annotations.append(annotation)
+ pixel_masks.append(pixel_mask)
+ images = padded_images
+ annotations = padded_annotations if annotations is not None else None
+ data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})
+
+ data.update({"pixel_values": torch.stack(images, dim=0)})
+ encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
+ if annotations is not None:
+ encoded_inputs["labels"] = [
+ BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+ ]
+ return encoded_inputs
+
+ def post_process(self, outputs, target_sizes):
+ """
+ Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
+ top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+ Args:
+ outputs ([`DeformableDetrObjectDetectionOutput`]):
+ Raw outputs of the model.
+ target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+ Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
+ original image size (before any data augmentation). For visualization, this should be the image size
+                after data augmentation, but before padding.
+ Returns:
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+ in the batch as predicted by the model.
+ """
+ logger.warning_once(
+ "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
+ " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
+ )
+
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+ if len(out_logits) != len(target_sizes):
+ raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+ if target_sizes.shape[1] != 2:
+ raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+ prob = out_logits.sigmoid()
+ topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
+ scores = topk_values
+ topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
+ labels = topk_indexes % out_logits.shape[2]
+ boxes = center_to_corners_format(out_bbox)
+ boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+ # and from relative [0, 1] to absolute [0, height] coordinates
+ img_h, img_w = target_sizes.unbind(1)
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
+ boxes = boxes * scale_fct[:, None, :]
+
+ results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
+
+ return results
+
+ def post_process_object_detection(
+ self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
+ ):
+ """
+ Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
+ top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+ Args:
+ outputs ([`DeformableDetrObjectDetectionOutput`]):
+ Raw outputs of the model.
+ threshold (`float`, *optional*, defaults to 0.5):
+ Score threshold to keep object detection predictions.
+ target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+ Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+ (height, width) of each image in the batch. If left to None, predictions will not be resized.
+ top_k (`int`, *optional*, defaults to 100):
+ Keep only top k bounding boxes before filtering by thresholding.
+
+ Returns:
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+ in the batch as predicted by the model.
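+
+ Example (a minimal sketch, assuming the `SenseTime/deformable-detr` checkpoint and a local image path):
+
+ ```python
+ >>> import torch
+ >>> from PIL import Image
+ >>> from transformers import AutoImageProcessor, DeformableDetrForObjectDetection
+
+ >>> processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")
+ >>> model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr")
+
+ >>> image = Image.open("path/to/image.jpg")  # placeholder path
+ >>> inputs = processor(images=image, return_tensors="pt")
+ >>> with torch.no_grad():
+ ...     outputs = model(**inputs)
+
+ >>> # `target_sizes` expects (height, width) per image; PIL's `image.size` is (width, height)
+ >>> target_sizes = torch.tensor([image.size[::-1]])
+ >>> results = processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)
+ >>> # each entry in `results` is a dict with "scores", "labels" and "boxes"
+ ```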
+ """
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+ if target_sizes is not None:
+ if len(out_logits) != len(target_sizes):
+ raise ValueError(
+ "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+ )
+
+ prob = out_logits.sigmoid()
+ prob = prob.view(out_logits.shape[0], -1)
+ k_value = min(top_k, prob.size(1))
+ topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
+ scores = topk_values
+ topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
+ labels = topk_indexes % out_logits.shape[2]
+ boxes = center_to_corners_format(out_bbox)
+ boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+ # and from relative [0, 1] to absolute [0, height] coordinates
+ if target_sizes is not None:
+ if isinstance(target_sizes, List):
+ img_h = torch.Tensor([i[0] for i in target_sizes])
+ img_w = torch.Tensor([i[1] for i in target_sizes])
+ else:
+ img_h, img_w = target_sizes.unbind(1)
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+ boxes = boxes * scale_fct[:, None, :]
+
+ results = []
+ for s, l, b in zip(scores, labels, boxes):
+ score = s[s > threshold]
+ label = l[s > threshold]
+ box = b[s > threshold]
+ results.append({"scores": score, "labels": label, "boxes": box})
+
+ return results
+
+
+__all__ = ["DeformableDetrImageProcessorFast"]
diff --git a/docs/transformers/build/lib/transformers/models/deformable_detr/modeling_deformable_detr.py b/docs/transformers/build/lib/transformers/models/deformable_detr/modeling_deformable_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e177dde1a424d719cb6e4ef530df92209afb7f1
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deformable_detr/modeling_deformable_detr.py
@@ -0,0 +1,2020 @@
+# coding=utf-8
+# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Deformable DETR model."""
+
+import copy
+import math
+import warnings
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...integrations import use_kernel_forward_from_hub
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_outputs import BaseModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import meshgrid
+from ...utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ is_timm_available,
+ logging,
+ replace_return_docstrings,
+ requires_backends,
+)
+from ...utils.backbone_utils import load_backbone
+from .configuration_deformable_detr import DeformableDetrConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_timm_available():
+ from timm import create_model
+
+
+_CONFIG_FOR_DOC = "DeformableDetrConfig"
+_CHECKPOINT_FOR_DOC = "SenseTime/deformable-detr"
+
+
+@use_kernel_forward_from_hub("MultiScaleDeformableAttention")
+class MultiScaleDeformableAttention(nn.Module):
+ def forward(
+ self,
+ value: Tensor,
+ value_spatial_shapes: Tensor,
+ value_spatial_shapes_list: List[Tuple],
+ level_start_index: Tensor,
+ sampling_locations: Tensor,
+ attention_weights: Tensor,
+ im2col_step: int,
+ ):
+ batch_size, _, num_heads, hidden_dim = value.shape
+ _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+ value_list = value.split([height * width for height, width in value_spatial_shapes_list], dim=1)
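+ # grid_sample expects sampling coordinates in [-1, 1], so map the normalized [0, 1] locations accordingly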
+ sampling_grids = 2 * sampling_locations - 1
+ sampling_value_list = []
+ for level_id, (height, width) in enumerate(value_spatial_shapes_list):
+ # batch_size, height*width, num_heads, hidden_dim
+ # -> batch_size, height*width, num_heads*hidden_dim
+ # -> batch_size, num_heads*hidden_dim, height*width
+ # -> batch_size*num_heads, hidden_dim, height, width
+ value_l_ = (
+ value_list[level_id]
+ .flatten(2)
+ .transpose(1, 2)
+ .reshape(batch_size * num_heads, hidden_dim, height, width)
+ )
+ # batch_size, num_queries, num_heads, num_points, 2
+ # -> batch_size, num_heads, num_queries, num_points, 2
+ # -> batch_size*num_heads, num_queries, num_points, 2
+ sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
+ # batch_size*num_heads, hidden_dim, num_queries, num_points
+ sampling_value_l_ = nn.functional.grid_sample(
+ value_l_,
+ sampling_grid_l_,
+ mode="bilinear",
+ padding_mode="zeros",
+ align_corners=False,
+ )
+ sampling_value_list.append(sampling_value_l_)
+ # (batch_size, num_queries, num_heads, num_levels, num_points)
+ # -> (batch_size, num_heads, num_queries, num_levels, num_points)
+ # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points)
+ attention_weights = attention_weights.transpose(1, 2).reshape(
+ batch_size * num_heads, 1, num_queries, num_levels * num_points
+ )
+ output = (
+ (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
+ .sum(-1)
+ .view(batch_size, num_heads * hidden_dim, num_queries)
+ )
+ return output.transpose(1, 2).contiguous()
+
+
+@dataclass
+class DeformableDetrDecoderOutput(ModelOutput):
+ """
+ Base class for outputs of the DeformableDetrDecoder. This class adds two attributes to
+ BaseModelOutputWithCrossAttentions, namely:
+ - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
+ - a stacked tensor of intermediate reference points.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+ Stacked intermediate hidden states (output of each layer of the decoder).
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
+ Stacked intermediate reference points (reference points of each layer of the decoder).
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
+ used to compute the weighted average in the cross-attention heads.
+ """
+
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ intermediate_hidden_states: Optional[torch.FloatTensor] = None
+ intermediate_reference_points: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class DeformableDetrModelOutput(ModelOutput):
+ """
+ Base class for outputs of the Deformable DETR encoder-decoder model.
+
+ Args:
+ init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Initial reference points sent through the Transformer decoder.
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+ Stacked intermediate hidden states (output of each layer of the decoder).
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+ Stacked intermediate reference points (reference points of each layer of the decoder).
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
+ plus the initial embedding outputs.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
+ num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
+ average in the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+ layer plus the initial embedding outputs.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+ Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+ picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+ foreground and background).
+ enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Logits of predicted bounding boxes coordinates in the first stage.
+ """
+
+ init_reference_points: Optional[torch.FloatTensor] = None
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ intermediate_hidden_states: Optional[torch.FloatTensor] = None
+ intermediate_reference_points: Optional[torch.FloatTensor] = None
+ decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+ encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ enc_outputs_class: Optional[torch.FloatTensor] = None
+ enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class DeformableDetrObjectDetectionOutput(ModelOutput):
+ """
+ Output type of [`DeformableDetrForObjectDetection`].
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
+ Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+ bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+ scale-invariant IoU loss.
+ loss_dict (`Dict`, *optional*):
+ A dictionary containing the individual losses. Useful for logging.
+ logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+ Classification logits (including no-object) for all queries.
+ pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+ values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+ possible padding). You can use [`~DeformableDetrProcessor.post_process_object_detection`] to retrieve the
+ unnormalized bounding boxes.
+ auxiliary_outputs (`list[Dict]`, *optional*):
+ Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+ and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+ `pred_boxes`) for each decoder layer.
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
+ plus the initial embedding outputs.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
+ num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
+ average in the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+ layer plus the initial embedding outputs.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4,
+ 4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average
+ in the self-attention heads.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+ Stacked intermediate hidden states (output of each layer of the decoder).
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+ Stacked intermediate reference points (reference points of each layer of the decoder).
+ init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Initial reference points sent through the Transformer decoder.
+ enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+ picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+ foreground and background).
+ enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Logits of predicted bounding boxes coordinates in the first stage.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ loss_dict: Optional[Dict] = None
+ logits: Optional[torch.FloatTensor] = None
+ pred_boxes: Optional[torch.FloatTensor] = None
+ auxiliary_outputs: Optional[List[Dict]] = None
+ init_reference_points: Optional[torch.FloatTensor] = None
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ intermediate_hidden_states: Optional[torch.FloatTensor] = None
+ intermediate_reference_points: Optional[torch.FloatTensor] = None
+ decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+ encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ enc_outputs_class: Optional[torch.FloatTensor] = None
+ enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+
+
+def _get_clones(module, N):
+ return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
+
+
+def inverse_sigmoid(x, eps=1e-5):
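+ # Numerically stable inverse of the sigmoid (logit): clamp the input to [0, 1] and keep both the
+ # numerator and denominator at least `eps` away from zero before taking the log.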
+ x = x.clamp(min=0, max=1)
+ x1 = x.clamp(min=eps)
+ x2 = (1 - x).clamp(min=eps)
+ return torch.log(x1 / x2)
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->DeformableDetr
+class DeformableDetrFrozenBatchNorm2d(nn.Module):
+ """
+ BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+ Copy-paste from torchvision.misc.ops with added eps before rsqrt, without which any other models than
+ torchvision.models.resnet[18,34,50,101] produce nans.
+ """
+
+ def __init__(self, n):
+ super().__init__()
+ self.register_buffer("weight", torch.ones(n))
+ self.register_buffer("bias", torch.zeros(n))
+ self.register_buffer("running_mean", torch.zeros(n))
+ self.register_buffer("running_var", torch.ones(n))
+
+ def _load_from_state_dict(
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+ ):
+ num_batches_tracked_key = prefix + "num_batches_tracked"
+ if num_batches_tracked_key in state_dict:
+ del state_dict[num_batches_tracked_key]
+
+ super()._load_from_state_dict(
+ state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+ )
+
+ def forward(self, x):
+ # move reshapes to the beginning
+ # to make it user-friendly
+ weight = self.weight.reshape(1, -1, 1, 1)
+ bias = self.bias.reshape(1, -1, 1, 1)
+ running_var = self.running_var.reshape(1, -1, 1, 1)
+ running_mean = self.running_mean.reshape(1, -1, 1, 1)
+ epsilon = 1e-5
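+ # fold normalization and affine parameters into a single scale and shift:
+ # y = (x - running_mean) / sqrt(running_var + eps) * weight + bias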
+ scale = weight * (running_var + epsilon).rsqrt()
+ bias = bias - running_mean * scale
+ return x * scale + bias
+
+
+# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->DeformableDetr
+def replace_batch_norm(model):
+ r"""
+ Recursively replace all `torch.nn.BatchNorm2d` with `DeformableDetrFrozenBatchNorm2d`.
+
+ Args:
+ model (torch.nn.Module):
+ input model
+ """
+ for name, module in model.named_children():
+ if isinstance(module, nn.BatchNorm2d):
+ new_module = DeformableDetrFrozenBatchNorm2d(module.num_features)
+
+ if not module.weight.device == torch.device("meta"):
+ new_module.weight.data.copy_(module.weight)
+ new_module.bias.data.copy_(module.bias)
+ new_module.running_mean.data.copy_(module.running_mean)
+ new_module.running_var.data.copy_(module.running_var)
+
+ model._modules[name] = new_module
+
+ if len(list(module.children())) > 0:
+ replace_batch_norm(module)
+
+
+class DeformableDetrConvEncoder(nn.Module):
+ """
+ Convolutional backbone, using either the AutoBackbone API or one from the timm library.
+
+ nn.BatchNorm2d layers are replaced by DeformableDetrFrozenBatchNorm2d as defined above.
+
+ """
+
+ def __init__(self, config):
+ super().__init__()
+
+ self.config = config
+
+ # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API
+ if config.use_timm_backbone:
+ # We default to values which were previously hard-coded. This enables configurability from the config
+ # using backbone arguments, while keeping the default behavior the same.
+ requires_backends(self, ["timm"])
+ kwargs = getattr(config, "backbone_kwargs", {})
+ kwargs = {} if kwargs is None else kwargs.copy()
+ out_indices = kwargs.pop("out_indices", (2, 3, 4) if config.num_feature_levels > 1 else (4,))
+ num_channels = kwargs.pop("in_chans", config.num_channels)
+ if config.dilation:
+ kwargs["output_stride"] = kwargs.get("output_stride", 16)
+ backbone = create_model(
+ config.backbone,
+ pretrained=config.use_pretrained_backbone,
+ features_only=True,
+ out_indices=out_indices,
+ in_chans=num_channels,
+ **kwargs,
+ )
+ else:
+ backbone = load_backbone(config)
+
+ # replace batch norm by frozen batch norm
+ with torch.no_grad():
+ replace_batch_norm(backbone)
+ self.model = backbone
+ self.intermediate_channel_sizes = (
+ self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
+ )
+
+ backbone_model_type = None
+ if config.backbone is not None:
+ backbone_model_type = config.backbone
+ elif config.backbone_config is not None:
+ backbone_model_type = config.backbone_config.model_type
+ else:
+ raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
+
+ if "resnet" in backbone_model_type:
+ for name, parameter in self.model.named_parameters():
+ if config.use_timm_backbone:
+ if "layer2" not in name and "layer3" not in name and "layer4" not in name:
+ parameter.requires_grad_(False)
+ else:
+ if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
+ parameter.requires_grad_(False)
+
+ # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->DeformableDetr
+ def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
+ # send pixel_values through the model to get list of feature maps
+ features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
+
+ out = []
+ for feature_map in features:
+ # downsample pixel_mask to match shape of corresponding feature_map
+ mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
+ out.append((feature_map, mask))
+ return out
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->DeformableDetr
+class DeformableDetrConvModel(nn.Module):
+ """
+ This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
+ """
+
+ def __init__(self, conv_encoder, position_embedding):
+ super().__init__()
+ self.conv_encoder = conv_encoder
+ self.position_embedding = position_embedding
+
+ def forward(self, pixel_values, pixel_mask):
+ # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples
+ out = self.conv_encoder(pixel_values, pixel_mask)
+ pos = []
+ for feature_map, mask in out:
+ # position encoding
+ pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype))
+
+ return out, pos
+
+
+class DeformableDetrSinePositionEmbedding(nn.Module):
+ """
+ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
+ need paper, generalized to work on images.
+ """
+
+ def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
+ super().__init__()
+ self.embedding_dim = embedding_dim
+ self.temperature = temperature
+ self.normalize = normalize
+ if scale is not None and normalize is False:
+ raise ValueError("normalize should be True if scale is passed")
+ if scale is None:
+ scale = 2 * math.pi
+ self.scale = scale
+
+ def forward(self, pixel_values, pixel_mask):
+ if pixel_mask is None:
+ raise ValueError("No pixel mask provided")
+ y_embed = pixel_mask.cumsum(1, dtype=pixel_values.dtype)
+ x_embed = pixel_mask.cumsum(2, dtype=pixel_values.dtype)
+ if self.normalize:
+ eps = 1e-6
+ y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
+ x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale
+
+ dim_t = torch.arange(self.embedding_dim, dtype=pixel_values.dtype, device=pixel_values.device)
+ dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)
+
+ pos_x = x_embed[:, :, :, None] / dim_t
+ pos_y = y_embed[:, :, :, None] / dim_t
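+ # interleave sine and cosine over the channel dimension, as in the standard Transformer positional encoding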
+ pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+ pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+ pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+ return pos
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding
+class DeformableDetrLearnedPositionEmbedding(nn.Module):
+ """
+ This module learns positional embeddings up to a fixed maximum size.
+ """
+
+ def __init__(self, embedding_dim=256):
+ super().__init__()
+ self.row_embeddings = nn.Embedding(50, embedding_dim)
+ self.column_embeddings = nn.Embedding(50, embedding_dim)
+
+ def forward(self, pixel_values, pixel_mask=None):
+ height, width = pixel_values.shape[-2:]
+ width_values = torch.arange(width, device=pixel_values.device)
+ height_values = torch.arange(height, device=pixel_values.device)
+ x_emb = self.column_embeddings(width_values)
+ y_emb = self.row_embeddings(height_values)
+ pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
+ pos = pos.permute(2, 0, 1)
+ pos = pos.unsqueeze(0)
+ pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
+ return pos
+
+
+# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->DeformableDetr
+def build_position_encoding(config):
+ n_steps = config.d_model // 2
+ if config.position_embedding_type == "sine":
+ # TODO find a better way of exposing other arguments
+ position_embedding = DeformableDetrSinePositionEmbedding(n_steps, normalize=True)
+ elif config.position_embedding_type == "learned":
+ position_embedding = DeformableDetrLearnedPositionEmbedding(n_steps)
+ else:
+ raise ValueError(f"Not supported {config.position_embedding_type}")
+
+ return position_embedding
+
+
+class DeformableDetrMultiscaleDeformableAttention(nn.Module):
+ """
+ Multiscale deformable attention as proposed in Deformable DETR.
+ """
+
+ def __init__(self, config: DeformableDetrConfig, num_heads: int, n_points: int):
+ super().__init__()
+
+ self.attn = MultiScaleDeformableAttention()
+
+ if config.d_model % num_heads != 0:
+ raise ValueError(
+ f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}"
+ )
+ dim_per_head = config.d_model // num_heads
+ # check if dim_per_head is power of 2
+ if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
+ warnings.warn(
+ "You'd better set embed_dim (d_model) in DeformableDetrMultiscaleDeformableAttention to make the"
+ " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA"
+ " implementation."
+ )
+
+ self.im2col_step = 64
+
+ self.d_model = config.d_model
+ self.n_levels = config.num_feature_levels
+ self.n_heads = num_heads
+ self.n_points = n_points
+
+ self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2)
+ self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points)
+ self.value_proj = nn.Linear(config.d_model, config.d_model)
+ self.output_proj = nn.Linear(config.d_model, config.d_model)
+
+ self.disable_custom_kernels = config.disable_custom_kernels
+
+ def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+ return tensor if position_embeddings is None else tensor + position_embeddings
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ position_embeddings: Optional[torch.Tensor] = None,
+ reference_points=None,
+ spatial_shapes=None,
+ spatial_shapes_list=None,
+ level_start_index=None,
+ output_attentions: bool = False,
+ ):
+ # add position embeddings to the hidden states before projecting to queries and keys
+ if position_embeddings is not None:
+ hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+ batch_size, num_queries, _ = hidden_states.shape
+ batch_size, sequence_length, _ = encoder_hidden_states.shape
+ total_elements = sum(height * width for height, width in spatial_shapes_list)
+ if total_elements != sequence_length:
+ raise ValueError(
+ "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
+ )
+
+ value = self.value_proj(encoder_hidden_states)
+ if attention_mask is not None:
+ # we invert the attention_mask
+ value = value.masked_fill(~attention_mask[..., None], float(0))
+ value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)
+ sampling_offsets = self.sampling_offsets(hidden_states).view(
+ batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2
+ )
+ attention_weights = self.attention_weights(hidden_states).view(
+ batch_size, num_queries, self.n_heads, self.n_levels * self.n_points
+ )
+ attention_weights = F.softmax(attention_weights, -1).view(
+ batch_size, num_queries, self.n_heads, self.n_levels, self.n_points
+ )
+ # batch_size, num_queries, n_heads, n_levels, n_points, 2
+ num_coordinates = reference_points.shape[-1]
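+ # 2 coordinates -> reference point (x, y); 4 coordinates -> reference box (cx, cy, w, h),
+ # in which case the sampling offsets are scaled relative to half the box width/height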
+ if num_coordinates == 2:
+ offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+ sampling_locations = (
+ reference_points[:, :, None, :, None, :]
+ + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+ )
+ elif num_coordinates == 4:
+ sampling_locations = (
+ reference_points[:, :, None, :, None, :2]
+ + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+ )
+ else:
+ raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
+
+ output = self.attn(
+ value,
+ spatial_shapes,
+ spatial_shapes_list,
+ level_start_index,
+ sampling_locations,
+ attention_weights,
+ self.im2col_step,
+ )
+
+ output = self.output_proj(output)
+
+ return output, attention_weights
+
+
+class DeformableDetrMultiheadAttention(nn.Module):
+ """
+ Multi-headed attention from 'Attention Is All You Need' paper.
+
+ Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper).
+ """
+
+ def __init__(
+ self,
+ embed_dim: int,
+ num_heads: int,
+ dropout: float = 0.0,
+ bias: bool = True,
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = embed_dim // num_heads
+ if self.head_dim * num_heads != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+ f" {num_heads})."
+ )
+ self.scaling = self.head_dim**-0.5
+
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
+ return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+ return tensor if position_embeddings is None else tensor + position_embeddings
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_embeddings: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+
+ batch_size, target_len, embed_dim = hidden_states.size()
+ # add position embeddings to the hidden states before projecting to queries and keys
+ hidden_states_original = hidden_states
+ if position_embeddings is not None:
+ hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+ # get queries, keys and values
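+ # queries are pre-scaled by 1/sqrt(head_dim), so the bmm with the keys below directly yields scaled dot-product scores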
+ query_states = self.q_proj(hidden_states) * self.scaling
+ key_states = self._shape(self.k_proj(hidden_states), -1, batch_size)
+ value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size)
+
+ proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
+ query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape)
+ key_states = key_states.view(*proj_shape)
+ value_states = value_states.view(*proj_shape)
+
+ source_len = key_states.size(1)
+
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+ if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
+ raise ValueError(
+ f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ # expand attention_mask
+ if attention_mask is not None:
+ # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+ attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
+
+ if attention_mask is not None:
+ if attention_mask.size() != (batch_size, 1, target_len, source_len):
+ raise ValueError(
+ f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is"
+ f" {attention_mask.size()}"
+ )
+ attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask
+ attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len)
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+ if output_attentions:
+ # this operation is a bit awkward, but it's required to
+ # make sure that attn_weights keeps its gradient.
+ # In order to do so, attn_weights have to be reshaped
+ # twice and have to be reused in the following
+ attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len)
+ attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len)
+ else:
+ attn_weights_reshaped = None
+
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+ attn_output = torch.bmm(attn_probs, value_states)
+
+ if attn_output.size() != (
+ batch_size * self.num_heads,
+ target_len,
+ self.head_dim,
+ ):
+ raise ValueError(
+ f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim)
+ attn_output = attn_output.transpose(1, 2)
+ attn_output = attn_output.reshape(batch_size, target_len, embed_dim)
+
+ attn_output = self.out_proj(attn_output)
+
+ return attn_output, attn_weights_reshaped
+
+
+class DeformableDetrEncoderLayer(nn.Module):
+ def __init__(self, config: DeformableDetrConfig):
+ super().__init__()
+ self.embed_dim = config.d_model
+ self.self_attn = DeformableDetrMultiscaleDeformableAttention(
+ config,
+ num_heads=config.encoder_attention_heads,
+ n_points=config.encoder_n_points,
+ )
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.activation_function]
+ self.activation_dropout = config.activation_dropout
+ self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+ self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ position_embeddings: Optional[torch.Tensor] = None,
+ reference_points=None,
+ spatial_shapes=None,
+ spatial_shapes_list=None,
+ level_start_index=None,
+ output_attentions: bool = False,
+ ):
+ """
+ Args:
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Input to the layer.
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+ Attention mask.
+ position_embeddings (`torch.FloatTensor`, *optional*):
+ Position embeddings, to be added to `hidden_states`.
+ reference_points (`torch.FloatTensor`, *optional*):
+ Reference points.
+ spatial_shapes (`torch.LongTensor`, *optional*):
+ Spatial shapes of the backbone feature maps.
+ level_start_index (`torch.LongTensor`, *optional*):
+ Level start index.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+ residual = hidden_states
+
+ # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps.
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ encoder_hidden_states=hidden_states,
+ encoder_attention_mask=attention_mask,
+ position_embeddings=position_embeddings,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ residual = hidden_states
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+
+ hidden_states = self.fc2(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ hidden_states = residual + hidden_states
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ if self.training:
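+ # if activations overflowed to inf/nan (e.g. in float16 training), clamp them to a large finite value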
+ if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+ hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
+
+class DeformableDetrDecoderLayer(nn.Module):
+ def __init__(self, config: DeformableDetrConfig):
+ super().__init__()
+ self.embed_dim = config.d_model
+
+ # self-attention
+ self.self_attn = DeformableDetrMultiheadAttention(
+ embed_dim=self.embed_dim,
+ num_heads=config.decoder_attention_heads,
+ dropout=config.attention_dropout,
+ )
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.activation_function]
+ self.activation_dropout = config.activation_dropout
+
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+ # cross-attention
+ self.encoder_attn = DeformableDetrMultiscaleDeformableAttention(
+ config,
+ num_heads=config.decoder_attention_heads,
+ n_points=config.decoder_n_points,
+ )
+ self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+ # feedforward neural networks
+ self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+ self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_embeddings: Optional[torch.Tensor] = None,
+ reference_points=None,
+ spatial_shapes=None,
+ spatial_shapes_list=None,
+ level_start_index=None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = False,
+ ):
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`):
+ Input to the layer of shape `(batch, seq_len, embed_dim)`.
+ position_embeddings (`torch.FloatTensor`, *optional*):
+ Position embeddings that are added to the queries and keys in the self-attention layer.
+ reference_points (`torch.FloatTensor`, *optional*):
+ Reference points.
+ spatial_shapes (`torch.LongTensor`, *optional*):
+ Spatial shapes.
+ level_start_index (`torch.LongTensor`, *optional*):
+ Level start index.
+ encoder_hidden_states (`torch.FloatTensor`):
+ Cross-attention input to the layer of shape `(batch, seq_len, embed_dim)`.
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+ values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+ residual = hidden_states
+
+ # Self Attention
+ hidden_states, self_attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ position_embeddings=position_embeddings,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ second_residual = hidden_states
+
+ # Cross-Attention
+ cross_attn_weights = None
+ hidden_states, cross_attn_weights = self.encoder_attn(
+ hidden_states=hidden_states,
+ attention_mask=encoder_attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ position_embeddings=position_embeddings,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = second_residual + hidden_states
+
+ hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+ hidden_states = self.fc2(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights, cross_attn_weights)
+
+ return outputs
+
+
+class DeformableDetrPreTrainedModel(PreTrainedModel):
+ config_class = DeformableDetrConfig
+ base_model_prefix = "model"
+ main_input_name = "pixel_values"
+ supports_gradient_checkpointing = True
+ _no_split_modules = [
+ r"DeformableDetrConvEncoder",
+ r"DeformableDetrEncoderLayer",
+ r"DeformableDetrDecoderLayer",
+ ]
+
+ def _init_weights(self, module):
+ std = self.config.init_std
+
+ if isinstance(module, DeformableDetrLearnedPositionEmbedding):
+ nn.init.uniform_(module.row_embeddings.weight)
+ nn.init.uniform_(module.column_embeddings.weight)
+ elif isinstance(module, DeformableDetrMultiscaleDeformableAttention):
+ nn.init.constant_(module.sampling_offsets.weight.data, 0.0)
+ default_dtype = torch.get_default_dtype()
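+ # initialize the sampling-offset bias so that each head starts looking in a distinct direction,
+ # with the i-th sampled point placed at radius i + 1 along that direction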
+ thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * (
+ 2.0 * math.pi / module.n_heads
+ )
+ grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+ grid_init = (
+ (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
+ .view(module.n_heads, 1, 1, 2)
+ .repeat(1, module.n_levels, module.n_points, 1)
+ )
+ for i in range(module.n_points):
+ grid_init[:, :, i, :] *= i + 1
+ with torch.no_grad():
+ module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+ nn.init.constant_(module.attention_weights.weight.data, 0.0)
+ nn.init.constant_(module.attention_weights.bias.data, 0.0)
+ nn.init.xavier_uniform_(module.value_proj.weight.data)
+ nn.init.constant_(module.value_proj.bias.data, 0.0)
+ nn.init.xavier_uniform_(module.output_proj.weight.data)
+ nn.init.constant_(module.output_proj.bias.data, 0.0)
+ elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ if hasattr(module, "reference_points") and not self.config.two_stage:
+ nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0)
+ nn.init.constant_(module.reference_points.bias.data, 0.0)
+ if hasattr(module, "level_embed"):
+ nn.init.normal_(module.level_embed)
+
+
+DEFORMABLE_DETR_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`DeformableDetrConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DEFORMABLE_DETR_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it.
+
+ Pixel values can be obtained using [`AutoImageProcessor`]. See [`DeformableDetrImageProcessor.__call__`]
+ for details.
+
+ pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+ Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
+
+ - 1 for pixels that are real (i.e. **not masked**),
+ - 0 for pixels that are padding (i.e. **masked**).
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
+ Not used by default. Can be used to mask object queries.
+ encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+ can choose to directly pass a flattened representation of an image.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+ Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+ embedded representation.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class DeformableDetrEncoder(DeformableDetrPreTrainedModel):
+ """
+ Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a
+ [`DeformableDetrEncoderLayer`].
+
+ The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers.
+
+ Args:
+ config: DeformableDetrConfig
+ """
+
+ def __init__(self, config: DeformableDetrConfig):
+ super().__init__(config)
+ self.gradient_checkpointing = False
+
+ self.dropout = config.dropout
+ self.layers = nn.ModuleList([DeformableDetrEncoderLayer(config) for _ in range(config.encoder_layers)])
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @staticmethod
+ def get_reference_points(spatial_shapes, valid_ratios, device):
+ """
+ Get reference points for each feature map. Used in the encoder.
+
+ Args:
+ spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+ Spatial shapes of each feature map.
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
+ Valid ratios of each feature map.
+ device (`torch.device`):
+ Device on which to create the tensors.
+ Returns:
+ `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
+ """
+ reference_points_list = []
+ for level, (height, width) in enumerate(spatial_shapes):
+ ref_y, ref_x = meshgrid(
+ torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device),
+ torch.linspace(0.5, width - 0.5, width, dtype=valid_ratios.dtype, device=device),
+ indexing="ij",
+ )
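+ # the 0.5 offsets place reference points at pixel centers; dividing by the valid height/width
+ # below normalizes them to [0, 1] within the unpadded region of each feature map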
+ # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36
+ ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
+ ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
+ ref = torch.stack((ref_x, ref_y), -1)
+ reference_points_list.append(ref)
+ reference_points = torch.cat(reference_points_list, 1)
+ reference_points = reference_points[:, :, None] * valid_ratios[:, None]
+ return reference_points
+
+ def forward(
+ self,
+ inputs_embeds=None,
+ attention_mask=None,
+ position_embeddings=None,
+ spatial_shapes=None,
+ spatial_shapes_list=None,
+ level_start_index=None,
+ valid_ratios=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
+ - 1 for pixel features that are real (i.e. **not masked**),
+ - 0 for pixel features that are padding (i.e. **masked**).
+ [What are attention masks?](../glossary#attention-mask)
+ position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Position embeddings that are added to the queries and keys in each self-attention layer.
+ spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+ Spatial shapes of each feature map.
+ level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`):
+ Starting index of each feature map.
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
+ Ratio of valid area in each feature level.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ hidden_states = inputs_embeds
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ spatial_shapes_tuple = tuple(spatial_shapes_list)
+ reference_points = self.get_reference_points(spatial_shapes_tuple, valid_ratios, device=inputs_embeds.device)
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+ for i, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ encoder_layer.__call__,
+ hidden_states,
+ attention_mask,
+ position_embeddings,
+ reference_points,
+ spatial_shapes,
+ spatial_shapes_list,
+ level_start_index,
+ output_attentions,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ position_embeddings=position_embeddings,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=encoder_states,
+ attentions=all_attentions,
+ )
+
+
+class DeformableDetrDecoder(DeformableDetrPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DeformableDetrDecoderLayer`].
+
+ The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
+
+ Some tweaks for Deformable DETR:
+
+ - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass.
+ - it also returns a stack of intermediate outputs and reference points from all decoding layers.
+
+ Args:
+ config: DeformableDetrConfig
+ """
+
+ def __init__(self, config: DeformableDetrConfig):
+ super().__init__(config)
+
+ self.dropout = config.dropout
+ self.layers = nn.ModuleList([DeformableDetrDecoderLayer(config) for _ in range(config.decoder_layers)])
+ self.gradient_checkpointing = False
+
+ # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
+ self.bbox_embed = None
+ self.class_embed = None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ inputs_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ position_embeddings=None,
+ reference_points=None,
+ spatial_shapes=None,
+ spatial_shapes_list=None,
+ level_start_index=None,
+ valid_ratios=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+ The query embeddings that are passed into the decoder.
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+ of the decoder.
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+ in `[0, 1]`:
+ - 1 for pixels that are real (i.e. **not masked**),
+ - 0 for pixels that are padding (i.e. **masked**).
+ position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+ Position embeddings that are added to the queries and keys in each self-attention layer.
+            reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` if `as_two_stage` else `(batch_size, num_queries, 2)`, *optional*):
+                Reference points in range `[0, 1]`, top-left (0, 0), bottom-right (1, 1), including padding area.
+            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of the feature maps.
+ level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+ Indexes for the start of each feature level. In range `[0, sequence_length]`.
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+ Ratio of valid area in each feature level.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if inputs_embeds is not None:
+ hidden_states = inputs_embeds
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+ intermediate = ()
+ intermediate_reference_points = ()
+
+ for idx, decoder_layer in enumerate(self.layers):
+ num_coordinates = reference_points.shape[-1]
+ if num_coordinates == 4:
+ reference_points_input = (
+ reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+ )
+            elif num_coordinates == 2:
+                reference_points_input = reference_points[:, :, None] * valid_ratios[:, None]
+            else:
+                raise ValueError("Reference points' last dimension must be of size 2 or 4")
+
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ decoder_layer.__call__,
+ hidden_states,
+ position_embeddings,
+ reference_points_input,
+ spatial_shapes,
+ spatial_shapes_list,
+ level_start_index,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ output_attentions,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ position_embeddings=position_embeddings,
+ encoder_hidden_states=encoder_hidden_states,
+ reference_points=reference_points_input,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ encoder_attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ # hack implementation for iterative bounding box refinement
+ if self.bbox_embed is not None:
+ tmp = self.bbox_embed[idx](hidden_states)
+ num_coordinates = reference_points.shape[-1]
+ if num_coordinates == 4:
+ new_reference_points = tmp + inverse_sigmoid(reference_points)
+ new_reference_points = new_reference_points.sigmoid()
+ elif num_coordinates == 2:
+ new_reference_points = tmp
+ new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points)
+ new_reference_points = new_reference_points.sigmoid()
+ else:
+ raise ValueError(
+ f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}"
+ )
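+                # detach: gradients are not propagated through the refined reference points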
+ reference_points = new_reference_points.detach()
+
+ intermediate += (hidden_states,)
+ intermediate_reference_points += (reference_points,)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ if encoder_hidden_states is not None:
+ all_cross_attentions += (layer_outputs[2],)
+
+ # Keep batch_size as first dimension
+ intermediate = torch.stack(intermediate, dim=1)
+ intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ intermediate,
+ intermediate_reference_points,
+ all_hidden_states,
+ all_self_attns,
+ all_cross_attentions,
+ ]
+ if v is not None
+ )
+ return DeformableDetrDecoderOutput(
+ last_hidden_state=hidden_states,
+ intermediate_hidden_states=intermediate,
+ intermediate_reference_points=intermediate_reference_points,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ The bare Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
+ hidden-states without any specific head on top.
+ """,
+ DEFORMABLE_DETR_START_DOCSTRING,
+)
+class DeformableDetrModel(DeformableDetrPreTrainedModel):
+ def __init__(self, config: DeformableDetrConfig):
+ super().__init__(config)
+
+ # Create backbone + positional encoding
+ backbone = DeformableDetrConvEncoder(config)
+ position_embeddings = build_position_encoding(config)
+ self.backbone = DeformableDetrConvModel(backbone, position_embeddings)
+
+ # Create input projection layers
+ if config.num_feature_levels > 1:
+ num_backbone_outs = len(backbone.intermediate_channel_sizes)
+ input_proj_list = []
+            for level in range(num_backbone_outs):
+                in_channels = backbone.intermediate_channel_sizes[level]
+ input_proj_list.append(
+ nn.Sequential(
+ nn.Conv2d(in_channels, config.d_model, kernel_size=1),
+ nn.GroupNorm(32, config.d_model),
+ )
+ )
+ for _ in range(config.num_feature_levels - num_backbone_outs):
+ input_proj_list.append(
+ nn.Sequential(
+ nn.Conv2d(
+ in_channels,
+ config.d_model,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ ),
+ nn.GroupNorm(32, config.d_model),
+ )
+ )
+ in_channels = config.d_model
+ self.input_proj = nn.ModuleList(input_proj_list)
+ else:
+ self.input_proj = nn.ModuleList(
+ [
+ nn.Sequential(
+ nn.Conv2d(
+ backbone.intermediate_channel_sizes[-1],
+ config.d_model,
+ kernel_size=1,
+ ),
+ nn.GroupNorm(32, config.d_model),
+ )
+ ]
+ )
+
+ if not config.two_stage:
+ self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2)
+
+ self.encoder = DeformableDetrEncoder(config)
+ self.decoder = DeformableDetrDecoder(config)
+
+ self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model))
+
+ if config.two_stage:
+ self.enc_output = nn.Linear(config.d_model, config.d_model)
+ self.enc_output_norm = nn.LayerNorm(config.d_model)
+ self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2)
+ self.pos_trans_norm = nn.LayerNorm(config.d_model * 2)
+ else:
+ self.reference_points = nn.Linear(config.d_model, 2)
+
+ self.post_init()
+
+ def get_encoder(self):
+ return self.encoder
+
+ def get_decoder(self):
+ return self.decoder
+
+ def freeze_backbone(self):
+ for name, param in self.backbone.conv_encoder.model.named_parameters():
+ param.requires_grad_(False)
+
+ def unfreeze_backbone(self):
+ for name, param in self.backbone.conv_encoder.model.named_parameters():
+ param.requires_grad_(True)
+
+ def get_valid_ratio(self, mask, dtype=torch.float32):
+ """Get the valid ratio of all feature maps."""
+
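+        # `mask` is 1 for valid (unpadded) pixels, so summing the first column/row gives the
+        # unpadded height/width of each image inside the padded feature map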
+ _, height, width = mask.shape
+ valid_height = torch.sum(mask[:, :, 0], 1)
+ valid_width = torch.sum(mask[:, 0, :], 1)
+ valid_ratio_height = valid_height.to(dtype) / height
+ valid_ratio_width = valid_width.to(dtype) / width
+ valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
+ return valid_ratio
+
+ def get_proposal_pos_embed(self, proposals):
+ """Get the position embedding of the proposals."""
+
+ num_pos_feats = self.config.d_model // 2
+ temperature = 10000
+ scale = 2 * math.pi
+
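+        # standard sinusoidal position embedding (as in "Attention Is All You Need"), applied
+        # independently to each of the 4 normalized box coordinates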
+ dim_t = torch.arange(num_pos_feats, dtype=proposals.dtype, device=proposals.device)
+ dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
+ # batch_size, num_queries, 4
+ proposals = proposals.sigmoid() * scale
+ # batch_size, num_queries, 4, 128
+ pos = proposals[:, :, :, None] / dim_t
+ # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512
+ pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
+ return pos
+
+ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
+ """Generate the encoder output proposals from encoded enc_output.
+
+ Args:
+ enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder.
+ padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`.
+ spatial_shapes (List[Tuple[int, int]]): Spatial shapes of the feature maps.
+
+ Returns:
+            `tuple(torch.FloatTensor)`: A tuple of object query features and normalized box proposals:
+            - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to
+              directly predict a bounding box (without the need of a decoder).
+ - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse
+ sigmoid.
+ """
+ batch_size = enc_output.shape[0]
+ proposals = []
+ _cur = 0
+ for level, (height, width) in enumerate(spatial_shapes):
+ mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1)
+ valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
+ valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
+
+ grid_y, grid_x = meshgrid(
+ torch.linspace(
+ 0,
+ height - 1,
+ height,
+ dtype=enc_output.dtype,
+ device=enc_output.device,
+ ),
+ torch.linspace(
+ 0,
+ width - 1,
+ width,
+ dtype=enc_output.dtype,
+ device=enc_output.device,
+ ),
+ indexing="ij",
+ )
+ grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
+
+ scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2)
+ grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale
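+            # initial proposal width/height is 0.05 (in normalized coordinates) at the finest
+            # level, doubling at every coarser level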
+            width_height = torch.ones_like(grid) * 0.05 * (2.0**level)
+            proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4)
+ proposals.append(proposal)
+ _cur += height * width
+ output_proposals = torch.cat(proposals, 1)
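+        # a proposal is only valid if its (normalized) center and size lie strictly inside (0.01, 0.99)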
+ output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
+ output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid
+ output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf"))
+ output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf"))
+
+ # assign each pixel as an object query
+ object_query = enc_output
+ object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0))
+ object_query = object_query.masked_fill(~output_proposals_valid, float(0))
+ object_query = self.enc_output_norm(self.enc_output(object_query))
+ return object_query, output_proposals
+
+ @add_start_docstrings_to_model_forward(DEFORMABLE_DETR_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=DeformableDetrModelOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ pixel_mask: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_outputs: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.FloatTensor], DeformableDetrModelOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, DeformableDetrModel
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")
+ >>> model = DeformableDetrModel.from_pretrained("SenseTime/deformable-detr")
+
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+
+ >>> last_hidden_states = outputs.last_hidden_state
+ >>> list(last_hidden_states.shape)
+ [1, 300, 256]
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ batch_size, num_channels, height, width = pixel_values.shape
+ device = pixel_values.device
+
+ if pixel_mask is None:
+            pixel_mask = torch.ones((batch_size, height, width), dtype=torch.long, device=device)
+
+ # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper)
+        # First, send pixel_values + pixel_mask through the backbone to obtain the features,
+        # which is a list of tuples
+ features, position_embeddings_list = self.backbone(pixel_values, pixel_mask)
+
+ # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+ sources = []
+ masks = []
+ for level, (source, mask) in enumerate(features):
+ sources.append(self.input_proj[level](source))
+ masks.append(mask)
+ if mask is None:
+ raise ValueError("No attention mask was provided")
+
+ # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage
+ if self.config.num_feature_levels > len(sources):
+ _len_sources = len(sources)
+ for level in range(_len_sources, self.config.num_feature_levels):
+ if level == _len_sources:
+ source = self.input_proj[level](features[-1][0])
+ else:
+ source = self.input_proj[level](sources[-1])
+ mask = nn.functional.interpolate(pixel_mask[None].to(pixel_values.dtype), size=source.shape[-2:]).to(
+ torch.bool
+ )[0]
+ pos_l = self.backbone.position_embedding(source, mask).to(source.dtype)
+ sources.append(source)
+ masks.append(mask)
+ position_embeddings_list.append(pos_l)
+
+ # Create queries
+ query_embeds = None
+ if not self.config.two_stage:
+ query_embeds = self.query_position_embeddings.weight
+
+ # Prepare encoder inputs (by flattening)
+ source_flatten = []
+ mask_flatten = []
+ lvl_pos_embed_flatten = []
+ spatial_shapes_list = []
+ for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)):
+ batch_size, num_channels, height, width = source.shape
+ spatial_shape = (height, width)
+ spatial_shapes_list.append(spatial_shape)
+ source = source.flatten(2).transpose(1, 2)
+ mask = mask.flatten(1)
+ pos_embed = pos_embed.flatten(2).transpose(1, 2)
+ lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1)
+ lvl_pos_embed_flatten.append(lvl_pos_embed)
+ source_flatten.append(source)
+ mask_flatten.append(mask)
+ source_flatten = torch.cat(source_flatten, 1)
+ mask_flatten = torch.cat(mask_flatten, 1)
+ lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+ spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device)
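+        # start offset of each level in the flattened sequence: [0, H_0*W_0, H_0*W_0 + H_1*W_1, ...]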
+ level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+ valid_ratios = torch.stack([self.get_valid_ratio(m, dtype=source_flatten.dtype) for m in masks], 1)
+
+        # Fourth, send source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder
+ # Also provide spatial_shapes, level_start_index and valid_ratios
+ if encoder_outputs is None:
+ encoder_outputs = self.encoder(
+ inputs_embeds=source_flatten,
+ attention_mask=mask_flatten,
+ position_embeddings=lvl_pos_embed_flatten,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ valid_ratios=valid_ratios,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+ encoder_outputs = BaseModelOutput(
+ last_hidden_state=encoder_outputs[0],
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+ )
+
+ # Fifth, prepare decoder inputs
+ batch_size, _, num_channels = encoder_outputs[0].shape
+ enc_outputs_class = None
+ enc_outputs_coord_logits = None
+ if self.config.two_stage:
+ object_query_embedding, output_proposals = self.gen_encoder_output_proposals(
+ encoder_outputs[0], ~mask_flatten, spatial_shapes_list
+ )
+
+ # hack implementation for two-stage Deformable DETR
+ # apply a detection head to each pixel (A.4 in paper)
+ # linear projection for bounding box binary classification (i.e. foreground and background)
+ enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding)
+ # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch)
+ delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding)
+ enc_outputs_coord_logits = delta_bbox + output_proposals
+
+ # only keep top scoring `config.two_stage_num_proposals` proposals
+ topk = self.config.two_stage_num_proposals
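+            # the foreground (first class) logit of each position serves as its objectness score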
+ topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
+ topk_coords_logits = torch.gather(
+ enc_outputs_coord_logits,
+ 1,
+ topk_proposals.unsqueeze(-1).repeat(1, 1, 4),
+ )
+
+ topk_coords_logits = topk_coords_logits.detach()
+ reference_points = topk_coords_logits.sigmoid()
+ init_reference_points = reference_points
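+            # derive both the query position embeddings and the query features (the targets) from the
+            # proposal coordinates; the d_model * 2 features are split into two d_model halves below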
+ pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits)))
+ query_embed, target = torch.split(pos_trans_out, num_channels, dim=2)
+ else:
+ query_embed, target = torch.split(query_embeds, num_channels, dim=1)
+ query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1)
+ target = target.unsqueeze(0).expand(batch_size, -1, -1)
+ reference_points = self.reference_points(query_embed).sigmoid()
+ init_reference_points = reference_points
+
+ decoder_outputs = self.decoder(
+ inputs_embeds=target,
+ position_embeddings=query_embed,
+ encoder_hidden_states=encoder_outputs[0],
+ encoder_attention_mask=mask_flatten,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ valid_ratios=valid_ratios,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ if not return_dict:
+ enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None)
+ tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs
+
+ return tuple_outputs
+
+ return DeformableDetrModelOutput(
+ init_reference_points=init_reference_points,
+ last_hidden_state=decoder_outputs.last_hidden_state,
+ intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+ intermediate_reference_points=decoder_outputs.intermediate_reference_points,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_attentions=decoder_outputs.attentions,
+ cross_attentions=decoder_outputs.cross_attentions,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ enc_outputs_class=enc_outputs_class,
+ enc_outputs_coord_logits=enc_outputs_coord_logits,
+ )
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead
+class DeformableDetrMLPPredictionHead(nn.Module):
+ """
+ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
+ height and width of a bounding box w.r.t. an image.
+
+ Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+
+ """
+
+ def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+ super().__init__()
+ self.num_layers = num_layers
+ h = [hidden_dim] * (num_layers - 1)
+ self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+ def forward(self, x):
+ for i, layer in enumerate(self.layers):
+ x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+ return x
+
+
+@add_start_docstrings(
+ """
+ Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on
+ top, for tasks such as COCO detection.
+ """,
+ DEFORMABLE_DETR_START_DOCSTRING,
+)
+class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel):
+ # When using clones, all layers > 0 will be clones, but layer 0 *is* required
+ _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"]
+ # We can't initialize the model on meta device as some weights are modified during the initialization
+ _no_split_modules = None
+
+ def __init__(self, config: DeformableDetrConfig):
+ super().__init__(config)
+
+ # Deformable DETR encoder-decoder model
+ self.model = DeformableDetrModel(config)
+ # Detection heads on top
+ self.class_embed = nn.Linear(config.d_model, config.num_labels)
+ self.bbox_embed = DeformableDetrMLPPredictionHead(
+ input_dim=config.d_model,
+ hidden_dim=config.d_model,
+ output_dim=4,
+ num_layers=3,
+ )
+
+ # if two-stage, the last class_embed and bbox_embed is for region proposal generation
+ num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers
+ if config.with_box_refine:
+ self.class_embed = _get_clones(self.class_embed, num_pred)
+ self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
+ # hack implementation for iterative bounding box refinement
+ self.model.decoder.bbox_embed = self.bbox_embed
+ else:
+ self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)])
+ self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)])
+ self.model.decoder.bbox_embed = None
+ if config.two_stage:
+ # hack implementation for two-stage
+ self.model.decoder.class_embed = self.class_embed
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DEFORMABLE_DETR_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=DeformableDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ pixel_mask: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_outputs: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[List[dict]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.FloatTensor], DeformableDetrObjectDetectionOutput]:
+ r"""
+ labels (`List[Dict]` of len `(batch_size,)`, *optional*):
+ Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+ following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+ in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, DeformableDetrForObjectDetection
+ >>> from PIL import Image
+        >>> import requests
+        >>> import torch
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")
+ >>> model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr")
+
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+ >>> target_sizes = torch.tensor([image.size[::-1]])
+ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
+ ... 0
+ ... ]
+ >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+ ... box = [round(i, 2) for i in box.tolist()]
+ ... print(
+ ... f"Detected {model.config.id2label[label.item()]} with confidence "
+ ... f"{round(score.item(), 3)} at location {box}"
+ ... )
+ Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78]
+ Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25]
+ Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25]
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # First, send images through the DETR base model to obtain encoder + decoder outputs
+ outputs = self.model(
+ pixel_values,
+ pixel_mask=pixel_mask,
+ decoder_attention_mask=decoder_attention_mask,
+ encoder_outputs=encoder_outputs,
+ inputs_embeds=inputs_embeds,
+ decoder_inputs_embeds=decoder_inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2]
+ init_reference = outputs.init_reference_points if return_dict else outputs[0]
+ inter_references = outputs.intermediate_reference_points if return_dict else outputs[3]
+
+ # class logits + predicted bounding boxes
+ outputs_classes = []
+ outputs_coords = []
+
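+        # every decoder layer yields its own class/box predictions (used for auxiliary losses);
+        # boxes are predicted as deltas on top of the inverse-sigmoid reference points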
+ for level in range(hidden_states.shape[1]):
+ if level == 0:
+ reference = init_reference
+ else:
+ reference = inter_references[:, level - 1]
+ reference = inverse_sigmoid(reference)
+ outputs_class = self.class_embed[level](hidden_states[:, level])
+ delta_bbox = self.bbox_embed[level](hidden_states[:, level])
+ if reference.shape[-1] == 4:
+ outputs_coord_logits = delta_bbox + reference
+ elif reference.shape[-1] == 2:
+ delta_bbox[..., :2] += reference
+ outputs_coord_logits = delta_bbox
+ else:
+ raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}")
+ outputs_coord = outputs_coord_logits.sigmoid()
+ outputs_classes.append(outputs_class)
+ outputs_coords.append(outputs_coord)
+ outputs_class = torch.stack(outputs_classes)
+ outputs_coord = torch.stack(outputs_coords)
+
+ logits = outputs_class[-1]
+ pred_boxes = outputs_coord[-1]
+
+ loss, loss_dict, auxiliary_outputs = None, None, None
+ if labels is not None:
+ loss, loss_dict, auxiliary_outputs = self.loss_function(
+ logits,
+ labels,
+ self.device,
+ pred_boxes,
+ self.config,
+ outputs_class,
+ outputs_coord,
+ )
+ if not return_dict:
+ if auxiliary_outputs is not None:
+ output = (logits, pred_boxes) + auxiliary_outputs + outputs
+ else:
+ output = (logits, pred_boxes) + outputs
+ tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output
+
+ return tuple_outputs
+
+ dict_outputs = DeformableDetrObjectDetectionOutput(
+ loss=loss,
+ loss_dict=loss_dict,
+ logits=logits,
+ pred_boxes=pred_boxes,
+ auxiliary_outputs=auxiliary_outputs,
+ last_hidden_state=outputs.last_hidden_state,
+ decoder_hidden_states=outputs.decoder_hidden_states,
+ decoder_attentions=outputs.decoder_attentions,
+ cross_attentions=outputs.cross_attentions,
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+ encoder_hidden_states=outputs.encoder_hidden_states,
+ encoder_attentions=outputs.encoder_attentions,
+ intermediate_hidden_states=outputs.intermediate_hidden_states,
+ intermediate_reference_points=outputs.intermediate_reference_points,
+ init_reference_points=outputs.init_reference_points,
+ enc_outputs_class=outputs.enc_outputs_class,
+ enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
+ )
+
+ return dict_outputs
+
+
+__all__ = [
+ "DeformableDetrForObjectDetection",
+ "DeformableDetrModel",
+ "DeformableDetrPreTrainedModel",
+]
diff --git a/docs/transformers/build/lib/transformers/models/deformable_detr/modular_deformable_detr.py b/docs/transformers/build/lib/transformers/models/deformable_detr/modular_deformable_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dfb6048c3431b151ae1cf47157abf7e917c536c
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deformable_detr/modular_deformable_detr.py
@@ -0,0 +1,144 @@
+from typing import List, Tuple, Union
+
+from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast
+
+from ...image_transforms import center_to_corners_format
+from ...utils import (
+ TensorType,
+ is_torch_available,
+ logging,
+)
+
+
+if is_torch_available():
+ import torch
+
+
+logger = logging.get_logger(__name__)
+
+
+class DeformableDetrImageProcessorFast(DetrImageProcessorFast):
+ def post_process(self, outputs, target_sizes):
+ """
+ Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
+ top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+ Args:
+ outputs ([`DeformableDetrObjectDetectionOutput`]):
+ Raw outputs of the model.
+ target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+ Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
+                original image size (before any data augmentation). For visualization, this should be the image size
+                after data augmentation, but before padding.
+ Returns:
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+ in the batch as predicted by the model.
+ """
+ logger.warning_once(
+ "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
+ " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
+ )
+
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+ if len(out_logits) != len(target_sizes):
+ raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+ if target_sizes.shape[1] != 2:
+ raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+ prob = out_logits.sigmoid()
+ topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
+ scores = topk_values
+ topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
+ labels = topk_indexes % out_logits.shape[2]
+ boxes = center_to_corners_format(out_bbox)
+ boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+ # and from relative [0, 1] to absolute [0, height] coordinates
+ img_h, img_w = target_sizes.unbind(1)
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
+ boxes = boxes * scale_fct[:, None, :]
+
+ results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
+
+ return results
+
+ def post_process_object_detection(
+ self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
+ ):
+ """
+ Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
+ top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+ Args:
+            outputs ([`DeformableDetrObjectDetectionOutput`]):
+ Raw outputs of the model.
+ threshold (`float`, *optional*):
+ Score threshold to keep object detection predictions.
+ target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+ Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+ (height, width) of each image in the batch. If left to None, predictions will not be resized.
+ top_k (`int`, *optional*, defaults to 100):
+ Keep only top k bounding boxes before filtering by thresholding.
+
+ Returns:
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+ in the batch as predicted by the model.
+ """
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+ if target_sizes is not None:
+ if len(out_logits) != len(target_sizes):
+ raise ValueError(
+ "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+ )
+
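+        # top-k is taken over the flattened (num_queries * num_classes) scores, so the same box
+        # can be selected more than once with different labels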
+ prob = out_logits.sigmoid()
+ prob = prob.view(out_logits.shape[0], -1)
+ k_value = min(top_k, prob.size(1))
+ topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
+ scores = topk_values
+ topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
+ labels = topk_indexes % out_logits.shape[2]
+ boxes = center_to_corners_format(out_bbox)
+ boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+ # and from relative [0, 1] to absolute [0, height] coordinates
+ if target_sizes is not None:
+            if isinstance(target_sizes, list):
+ img_h = torch.Tensor([i[0] for i in target_sizes])
+ img_w = torch.Tensor([i[1] for i in target_sizes])
+ else:
+ img_h, img_w = target_sizes.unbind(1)
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+ boxes = boxes * scale_fct[:, None, :]
+
+ results = []
+        for score, label, box in zip(scores, labels, boxes):
+            keep = score > threshold
+            results.append({"scores": score[keep], "labels": label[keep], "boxes": box[keep]})
+
+ return results
+
+    def post_process_segmentation(self, *args, **kwargs):
+        raise NotImplementedError("Segmentation post-processing is not implemented for Deformable DETR yet.")
+
+    def post_process_instance(self, *args, **kwargs):
+        raise NotImplementedError("Instance post-processing is not implemented for Deformable DETR yet.")
+
+    def post_process_panoptic(self, *args, **kwargs):
+        raise NotImplementedError("Panoptic post-processing is not implemented for Deformable DETR yet.")
+
+    def post_process_instance_segmentation(self, *args, **kwargs):
+        raise NotImplementedError("Instance segmentation post-processing is not implemented for Deformable DETR yet.")
+
+    def post_process_semantic_segmentation(self, *args, **kwargs):
+        raise NotImplementedError("Semantic segmentation post-processing is not implemented for Deformable DETR yet.")
+
+    def post_process_panoptic_segmentation(self, *args, **kwargs):
+        raise NotImplementedError("Panoptic segmentation post-processing is not implemented for Deformable DETR yet.")
+
+
+__all__ = ["DeformableDetrImageProcessorFast"]
diff --git a/docs/transformers/build/lib/transformers/models/deit/__init__.py b/docs/transformers/build/lib/transformers/models/deit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..98236a86d7a1e8b4ff16b53fb3ff37befbf1d7ac
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deit/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_deit import *
+ from .feature_extraction_deit import *
+ from .image_processing_deit import *
+ from .image_processing_deit_fast import *
+ from .modeling_deit import *
+ from .modeling_tf_deit import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/deit/configuration_deit.py b/docs/transformers/build/lib/transformers/models/deit/configuration_deit.py
new file mode 100644
index 0000000000000000000000000000000000000000..63d772067c5fd3e13ab0051b377751a873c7f03b
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deit/configuration_deit.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2021 Facebook AI Research (FAIR) and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DeiT model configuration"""
+
+from collections import OrderedDict
+from typing import Mapping
+
+from packaging import version
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DeiTConfig(PretrainedConfig):
+ r"""
+    This is the configuration class to store the configuration of a [`DeiTModel`]. It is used to instantiate a DeiT
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the DeiT
+ [facebook/deit-base-distilled-patch16-224](https://huggingface.co/facebook/deit-base-distilled-patch16-224)
+ architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 16):
+ The size (resolution) of each patch.
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ qkv_bias (`bool`, *optional*, defaults to `True`):
+ Whether to add a bias to the queries, keys and values.
+ encoder_stride (`int`, *optional*, defaults to 16):
+ Factor to increase the spatial resolution by in the decoder head for masked image modeling.
+ pooler_output_size (`int`, *optional*):
+ Dimensionality of the pooler layer. If None, defaults to `hidden_size`.
+ pooler_act (`str`, *optional*, defaults to `"tanh"`):
+ The activation function to be used by the pooler. Keys of ACT2FN are supported for Flax and
+ Pytorch, and elements of https://www.tensorflow.org/api_docs/python/tf/keras/activations are
+ supported for Tensorflow.
+
+ Example:
+
+ ```python
+ >>> from transformers import DeiTConfig, DeiTModel
+
+ >>> # Initializing a DeiT deit-base-distilled-patch16-224 style configuration
+ >>> configuration = DeiTConfig()
+
+ >>> # Initializing a model (with random weights) from the deit-base-distilled-patch16-224 style configuration
+ >>> model = DeiTModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "deit"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.0,
+ attention_probs_dropout_prob=0.0,
+ initializer_range=0.02,
+ layer_norm_eps=1e-12,
+ image_size=224,
+ patch_size=16,
+ num_channels=3,
+ qkv_bias=True,
+ encoder_stride=16,
+ pooler_output_size=None,
+ pooler_act="tanh",
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.qkv_bias = qkv_bias
+ self.encoder_stride = encoder_stride
+ self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size
+ self.pooler_act = pooler_act
+
+
+class DeiTOnnxConfig(OnnxConfig):
+ torch_onnx_minimum_version = version.parse("1.11")
+
+ @property
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
+ return OrderedDict(
+ [
+ ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+ ]
+ )
+
+ @property
+ def atol_for_validation(self) -> float:
+ return 1e-4
+
+
+__all__ = ["DeiTConfig", "DeiTOnnxConfig"]
diff --git a/docs/transformers/build/lib/transformers/models/deit/convert_deit_timm_to_pytorch.py b/docs/transformers/build/lib/transformers/models/deit/convert_deit_timm_to_pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7bf3e7a12e8ac307c730c61ed35e1630edf7637
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deit/convert_deit_timm_to_pytorch.py
@@ -0,0 +1,218 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert DeiT distilled checkpoints from the timm library."""
+
+import argparse
+import json
+from pathlib import Path
+
+import requests
+import timm
+import torch
+from huggingface_hub import hf_hub_download
+from PIL import Image
+
+from transformers import DeiTConfig, DeiTForImageClassificationWithTeacher, DeiTImageProcessor
+from transformers.utils import logging
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+
+# here we list all keys to be renamed (original name on the left, our name on the right)
+def create_rename_keys(config, base_model=False):
+ rename_keys = []
+ for i in range(config.num_hidden_layers):
+ # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms
+ rename_keys.append((f"blocks.{i}.norm1.weight", f"deit.encoder.layer.{i}.layernorm_before.weight"))
+ rename_keys.append((f"blocks.{i}.norm1.bias", f"deit.encoder.layer.{i}.layernorm_before.bias"))
+ rename_keys.append((f"blocks.{i}.attn.proj.weight", f"deit.encoder.layer.{i}.attention.output.dense.weight"))
+ rename_keys.append((f"blocks.{i}.attn.proj.bias", f"deit.encoder.layer.{i}.attention.output.dense.bias"))
+ rename_keys.append((f"blocks.{i}.norm2.weight", f"deit.encoder.layer.{i}.layernorm_after.weight"))
+ rename_keys.append((f"blocks.{i}.norm2.bias", f"deit.encoder.layer.{i}.layernorm_after.bias"))
+ rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"deit.encoder.layer.{i}.intermediate.dense.weight"))
+ rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"deit.encoder.layer.{i}.intermediate.dense.bias"))
+ rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"deit.encoder.layer.{i}.output.dense.weight"))
+ rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"deit.encoder.layer.{i}.output.dense.bias"))
+
+ # projection layer + position embeddings
+ rename_keys.extend(
+ [
+ ("cls_token", "deit.embeddings.cls_token"),
+ ("dist_token", "deit.embeddings.distillation_token"),
+ ("patch_embed.proj.weight", "deit.embeddings.patch_embeddings.projection.weight"),
+ ("patch_embed.proj.bias", "deit.embeddings.patch_embeddings.projection.bias"),
+ ("pos_embed", "deit.embeddings.position_embeddings"),
+ ]
+ )
+
+ if base_model:
+ # layernorm + pooler
+ rename_keys.extend(
+ [
+ ("norm.weight", "layernorm.weight"),
+ ("norm.bias", "layernorm.bias"),
+ ("pre_logits.fc.weight", "pooler.dense.weight"),
+ ("pre_logits.fc.bias", "pooler.dense.bias"),
+ ]
+ )
+
+ # if just the base model, we should remove "deit" from all keys that start with "deit"
+ rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("deit") else pair for pair in rename_keys]
+ else:
+ # layernorm + classification heads
+ rename_keys.extend(
+ [
+ ("norm.weight", "deit.layernorm.weight"),
+ ("norm.bias", "deit.layernorm.bias"),
+ ("head.weight", "cls_classifier.weight"),
+ ("head.bias", "cls_classifier.bias"),
+ ("head_dist.weight", "distillation_classifier.weight"),
+ ("head_dist.bias", "distillation_classifier.bias"),
+ ]
+ )
+
+ return rename_keys
+
+
+# we split up the matrix of each encoder layer into queries, keys and values
+def read_in_q_k_v(state_dict, config, base_model=False):
+ for i in range(config.num_hidden_layers):
+ if base_model:
+ prefix = ""
+ else:
+ prefix = "deit."
+ # read in weights + bias of input projection layer (in timm, this is a single matrix + bias)
+ in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight")
+ in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias")
+ # next, add query, keys and values (in that order) to the state dict
+ state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[
+ : config.hidden_size, :
+ ]
+ state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size]
+ state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
+ config.hidden_size : config.hidden_size * 2, :
+ ]
+ state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[
+ config.hidden_size : config.hidden_size * 2
+ ]
+ state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
+ -config.hidden_size :, :
+ ]
+ state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :]
+
+
+def rename_key(dct, old, new):
+ val = dct.pop(old)
+ dct[new] = val
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ im = Image.open(requests.get(url, stream=True).raw)
+ return im
+
+
+@torch.no_grad()
+def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path):
+ """
+ Copy/paste/tweak model's weights to our DeiT structure.
+ """
+
+ # define default DeiT configuration
+ config = DeiTConfig()
+ # all deit models have fine-tuned heads
+ base_model = False
+ # dataset (fine-tuned on ImageNet 2012), patch_size and image_size
+ config.num_labels = 1000
+ repo_id = "huggingface/label-files"
+ filename = "imagenet-1k-id2label.json"
+ id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
+ id2label = {int(k): v for k, v in id2label.items()}
+ config.id2label = id2label
+ config.label2id = {v: k for k, v in id2label.items()}
+ config.patch_size = int(deit_name[-6:-4])
+ config.image_size = int(deit_name[-3:])
+ # size of the architecture
+ if deit_name[9:].startswith("tiny"):
+ config.hidden_size = 192
+ config.intermediate_size = 768
+ config.num_hidden_layers = 12
+ config.num_attention_heads = 3
+ elif deit_name[9:].startswith("small"):
+ config.hidden_size = 384
+ config.intermediate_size = 1536
+ config.num_hidden_layers = 12
+ config.num_attention_heads = 6
+ if deit_name[9:].startswith("base"):
+ pass
+ elif deit_name[4:].startswith("large"):
+ config.hidden_size = 1024
+ config.intermediate_size = 4096
+ config.num_hidden_layers = 24
+ config.num_attention_heads = 16
+
+ # load original model from timm
+ timm_model = timm.create_model(deit_name, pretrained=True)
+ timm_model.eval()
+
+ # load state_dict of original model, remove and rename some keys
+ state_dict = timm_model.state_dict()
+ rename_keys = create_rename_keys(config, base_model)
+ for src, dest in rename_keys:
+ rename_key(state_dict, src, dest)
+ read_in_q_k_v(state_dict, config, base_model)
+
+ # load HuggingFace model
+ model = DeiTForImageClassificationWithTeacher(config).eval()
+ model.load_state_dict(state_dict)
+
+ # Check outputs on an image, prepared by DeiTImageProcessor
+ size = int(
+ (256 / 224) * config.image_size
+ ) # to maintain same ratio w.r.t. 224 images, see https://github.com/facebookresearch/deit/blob/ab5715372db8c6cad5740714b2216d55aeae052e/datasets.py#L103
+ image_processor = DeiTImageProcessor(size=size, crop_size=config.image_size)
+ encoding = image_processor(images=prepare_img(), return_tensors="pt")
+ pixel_values = encoding["pixel_values"]
+ outputs = model(pixel_values)
+
+ timm_logits = timm_model(pixel_values)
+ assert timm_logits.shape == outputs.logits.shape
+ assert torch.allclose(timm_logits, outputs.logits, atol=1e-3)
+
+ Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
+ print(f"Saving model {deit_name} to {pytorch_dump_folder_path}")
+ model.save_pretrained(pytorch_dump_folder_path)
+ print(f"Saving image processor to {pytorch_dump_folder_path}")
+ image_processor.save_pretrained(pytorch_dump_folder_path)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ # Required parameters
+ parser.add_argument(
+ "--deit_name",
+ default="vit_deit_base_distilled_patch16_224",
+ type=str,
+ help="Name of the DeiT timm model you'd like to convert.",
+ )
+ parser.add_argument(
+ "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
+ )
+
+ args = parser.parse_args()
+ convert_deit_checkpoint(args.deit_name, args.pytorch_dump_folder_path)
diff --git a/docs/transformers/build/lib/transformers/models/deit/feature_extraction_deit.py b/docs/transformers/build/lib/transformers/models/deit/feature_extraction_deit.py
new file mode 100644
index 0000000000000000000000000000000000000000..d040fd08395f8e921ec688228d7d5faa8963ab81
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deit/feature_extraction_deit.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for DeiT."""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_deit import DeiTImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class DeiTFeatureExtractor(DeiTImageProcessor):
+ def __init__(self, *args, **kwargs) -> None:
+ warnings.warn(
+ "The class DeiTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
+ " use DeiTImageProcessor instead.",
+ FutureWarning,
+ )
+ super().__init__(*args, **kwargs)
+
+
+__all__ = ["DeiTFeatureExtractor"]
diff --git a/docs/transformers/build/lib/transformers/models/deit/image_processing_deit_fast.py b/docs/transformers/build/lib/transformers/models/deit/image_processing_deit_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..28cd6539df795d230abca5c70d4d8e07922b869c
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deit/image_processing_deit_fast.py
@@ -0,0 +1,44 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for DeiT."""
+
+from ...image_processing_utils_fast import BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast
+from ...image_utils import (
+ IMAGENET_STANDARD_MEAN,
+ IMAGENET_STANDARD_STD,
+ PILImageResampling,
+)
+from ...utils import add_start_docstrings
+
+
+@add_start_docstrings(
+ "Constructs a fast DeiT image processor.",
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
+)
+class DeiTImageProcessorFast(BaseImageProcessorFast):
+ # To be checked against the slow image processor
+ # None values left after checking can be removed
+ resample = PILImageResampling.BICUBIC
+ image_mean = IMAGENET_STANDARD_MEAN
+ image_std = IMAGENET_STANDARD_STD
+ size = {"height": 256, "width": 256}
+ crop_size = {"height": 224, "width": 224}
+ do_resize = True
+ do_center_crop = True
+ do_rescale = True
+ do_normalize = True
+
+
+__all__ = ["DeiTImageProcessorFast"]
diff --git a/docs/transformers/build/lib/transformers/models/deit/modeling_deit.py b/docs/transformers/build/lib/transformers/models/deit/modeling_deit.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb88bca353fc551afe959231476e64c97cd30206
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deit/modeling_deit.py
@@ -0,0 +1,999 @@
+# coding=utf-8
+# Copyright 2021 Facebook AI Research (FAIR), Ross Wightman, The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DeiT model."""
+
+import collections.abc
+from dataclasses import dataclass
+from typing import Callable, Optional, Set, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_outputs import (
+ BaseModelOutput,
+ BaseModelOutputWithPooling,
+ ImageClassifierOutput,
+ MaskedImageModelingOutput,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+ ModelOutput,
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+ torch_int,
+)
+from .configuration_deit import DeiTConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "DeiTConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "facebook/deit-base-distilled-patch16-224"
+_EXPECTED_OUTPUT_SHAPE = [1, 198, 768]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "facebook/deit-base-distilled-patch16-224"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
+
+
+class DeiTEmbeddings(nn.Module):
+ """
+ Construct the CLS token, distillation token, position and patch embeddings. Optionally, also the mask token.
+ """
+
+ def __init__(self, config: DeiTConfig, use_mask_token: bool = False) -> None:
+ super().__init__()
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+ self.distillation_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
+ self.patch_embeddings = DeiTPatchEmbeddings(config)
+ num_patches = self.patch_embeddings.num_patches
+ self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 2, config.hidden_size))
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.patch_size = config.patch_size
+
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+ """
+        This method allows interpolating the pre-trained position encodings so that the model can be used on
+        higher-resolution images. It is also adapted to support torch.jit tracing and the two class-like tokens
+        (class and distillation).
+
+ Adapted from:
+ - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+ - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+ """
+
+ num_patches = embeddings.shape[1] - 2
+ num_positions = self.position_embeddings.shape[1] - 2
+
+ # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+ if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+ return self.position_embeddings
+
+ class_and_dist_pos_embed = self.position_embeddings[:, :2]
+ patch_pos_embed = self.position_embeddings[:, 2:]
+
+ dim = embeddings.shape[-1]
+
+ new_height = height // self.patch_size
+ new_width = width // self.patch_size
+
+ sqrt_num_positions = torch_int(num_positions**0.5)
+ patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed,
+ size=(new_height, new_width),
+ mode="bicubic",
+ align_corners=False,
+ )
+
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+ return torch.cat((class_and_dist_pos_embed, patch_pos_embed), dim=1)
+
+ def forward(
+ self,
+ pixel_values: torch.Tensor,
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
+ interpolate_pos_encoding: bool = False,
+ ) -> torch.Tensor:
+ _, _, height, width = pixel_values.shape
+ embeddings = self.patch_embeddings(pixel_values)
+
+ batch_size, seq_length, _ = embeddings.size()
+
+ if bool_masked_pos is not None:
+ mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
+ # replace the masked visual tokens by mask_tokens
+ mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+ embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+
+ distillation_tokens = self.distillation_token.expand(batch_size, -1, -1)
+
+ embeddings = torch.cat((cls_tokens, distillation_tokens, embeddings), dim=1)
+ position_embedding = self.position_embeddings
+
+ if interpolate_pos_encoding:
+ position_embedding = self.interpolate_pos_encoding(embeddings, height, width)
+
+ embeddings = embeddings + position_embedding
+ embeddings = self.dropout(embeddings)
+ return embeddings
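+
+    # Note on the resulting sequence layout: the embeddings are ordered as
+    # [CLS, distillation, patch_0, ..., patch_{N-1}]. For a 224x224 input with 16x16
+    # patches, N = (224 // 16) ** 2 = 196, so the sequence length is 198, which matches
+    # the documented output shape [1, 198, 768]. With `interpolate_pos_encoding=True`, a
+    # larger input (e.g. 384x384) produces a bigger patch grid and the position
+    # embeddings are bicubically resized to that grid by `interpolate_pos_encoding`.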
+
+
+class DeiTPatchEmbeddings(nn.Module):
+ """
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+ Transformer.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ image_size, patch_size = config.image_size, config.patch_size
+ num_channels, hidden_size = config.num_channels, config.hidden_size
+
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.num_patches = num_patches
+
+ self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+ batch_size, num_channels, height, width = pixel_values.shape
+ if num_channels != self.num_channels:
+ raise ValueError(
+                "Make sure that the channel dimension of the pixel values matches the one set in the configuration."
+ )
+ x = self.projection(pixel_values).flatten(2).transpose(1, 2)
+ return x
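+
+    # Shape walkthrough for the default configuration: `pixel_values` (B, 3, 224, 224)
+    # -> Conv2d(kernel_size=16, stride=16) -> (B, 768, 14, 14) -> flatten(2) ->
+    # (B, 768, 196) -> transpose(1, 2) -> (B, 196, 768), i.e. one hidden vector per
+    # non-overlapping 16x16 patch.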
+
+
+# Copied from transformers.models.vit.modeling_vit.eager_attention_forward
+def eager_attention_forward(
+ module: nn.Module,
+ query: torch.Tensor,
+ key: torch.Tensor,
+ value: torch.Tensor,
+ attention_mask: Optional[torch.Tensor],
+ scaling: float,
+ dropout: float = 0.0,
+ **kwargs,
+):
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+
+ # Normalize the attention scores to probabilities.
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+ # Mask heads if we want to
+ if attention_mask is not None:
+ attn_weights = attn_weights * attention_mask
+
+ attn_output = torch.matmul(attn_weights, value)
+ attn_output = attn_output.transpose(1, 2).contiguous()
+
+ return attn_output, attn_weights
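+
+# Shapes, for reference: `query`/`key`/`value` arrive as (batch, num_heads, seq_len, head_dim),
+# `attn_weights` is (batch, num_heads, seq_len, seq_len), and `attn_output` is returned as
+# (batch, seq_len, num_heads, head_dim) after the final transpose, ready to be reshaped back
+# to (batch, seq_len, all_head_size) by the caller.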
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DeiT
+class DeiTSelfAttention(nn.Module):
+ def __init__(self, config: DeiTConfig) -> None:
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+ f"heads {config.num_attention_heads}."
+ )
+
+ self.config = config
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+ self.dropout_prob = config.attention_probs_dropout_prob
+ self.scaling = self.attention_head_size**-0.5
+ self.is_causal = False
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ query_layer = self.transpose_for_scores(self.query(hidden_states))
+
+ attention_interface: Callable = eager_attention_forward
+ if self.config._attn_implementation != "eager":
+ if self.config._attn_implementation == "sdpa" and output_attentions:
+ logger.warning_once(
+ "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+ 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ else:
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+ context_layer, attention_probs = attention_interface(
+ self,
+ query_layer,
+ key_layer,
+ value_layer,
+ head_mask,
+ is_causal=self.is_causal,
+ scaling=self.scaling,
+ dropout=0.0 if not self.training else self.dropout_prob,
+ )
+
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.reshape(new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ return outputs
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->DeiT
+class DeiTSelfOutput(nn.Module):
+ """
+ The residual connection is defined in DeiTLayer instead of here (as is the case with other models), due to the
+ layernorm applied before each block.
+ """
+
+ def __init__(self, config: DeiTConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ return hidden_states
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->DeiT
+class DeiTAttention(nn.Module):
+ def __init__(self, config: DeiTConfig) -> None:
+ super().__init__()
+ self.attention = DeiTSelfAttention(config)
+ self.output = DeiTSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads: Set[int]) -> None:
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.attention.query = prune_linear_layer(self.attention.query, index)
+ self.attention.key = prune_linear_layer(self.attention.key, index)
+ self.attention.value = prune_linear_layer(self.attention.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+ self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
+ self_outputs = self.attention(hidden_states, head_mask, output_attentions)
+
+ attention_output = self.output(self_outputs[0], hidden_states)
+
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->DeiT
+class DeiTIntermediate(nn.Module):
+ def __init__(self, config: DeiTConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+
+ return hidden_states
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->DeiT
+class DeiTOutput(nn.Module):
+ def __init__(self, config: DeiTConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ hidden_states = hidden_states + input_tensor
+
+ return hidden_states
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->DeiT,VIT->DEIT
+class DeiTLayer(nn.Module):
+ """This corresponds to the Block class in the timm implementation."""
+
+ def __init__(self, config: DeiTConfig) -> None:
+ super().__init__()
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.seq_len_dim = 1
+ self.attention = DeiTAttention(config)
+ self.intermediate = DeiTIntermediate(config)
+ self.output = DeiTOutput(config)
+ self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
+ self_attention_outputs = self.attention(
+ self.layernorm_before(hidden_states), # in DeiT, layernorm is applied before self-attention
+ head_mask,
+ output_attentions=output_attentions,
+ )
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ # first residual connection
+ hidden_states = attention_output + hidden_states
+
+ # in DeiT, layernorm is also applied after self-attention
+ layer_output = self.layernorm_after(hidden_states)
+ layer_output = self.intermediate(layer_output)
+
+ # second residual connection is done here
+ layer_output = self.output(layer_output, hidden_states)
+
+ outputs = (layer_output,) + outputs
+
+ return outputs
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->DeiT
+class DeiTEncoder(nn.Module):
+ def __init__(self, config: DeiTConfig) -> None:
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList([DeiTLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ output_hidden_states: bool = False,
+ return_dict: bool = True,
+ ) -> Union[tuple, BaseModelOutput]:
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ layer_module.__call__,
+ hidden_states,
+ layer_head_mask,
+ output_attentions,
+ )
+ else:
+ layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+class DeiTPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = DeiTConfig
+ base_model_prefix = "deit"
+ main_input_name = "pixel_values"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["DeiTLayer"]
+ _supports_sdpa = True
+ _supports_flash_attn_2 = True
+
+    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm, DeiTEmbeddings]) -> None:
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
+ # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+ # `trunc_normal_cpu` not implemented in `half` issues
+ module.weight.data = nn.init.trunc_normal_(
+ module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+ ).to(module.weight.dtype)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, DeiTEmbeddings):
+ module.cls_token.data.zero_()
+ module.position_embeddings.data.zero_()
+ module.distillation_token.data.zero_()
+ if module.mask_token is not None:
+ module.mask_token.data.zero_()
+
+
+DEIT_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`DeiTConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DEIT_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`DeiTImageProcessor.__call__`] for details.
+
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
+"""
+
+
+@add_start_docstrings(
+ "The bare DeiT Model transformer outputting raw hidden-states without any specific head on top.",
+ DEIT_START_DOCSTRING,
+)
+class DeiTModel(DeiTPreTrainedModel):
+ def __init__(self, config: DeiTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False) -> None:
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = DeiTEmbeddings(config, use_mask_token=use_mask_token)
+ self.encoder = DeiTEncoder(config)
+
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.pooler = DeiTPooler(config) if add_pooling_layer else None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> DeiTPatchEmbeddings:
+ return self.embeddings.patch_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """
+        Prunes heads of the model. `heads_to_prune` is a dict of {layer_num: list of heads to prune in this layer}.
+        See the base class `PreTrainedModel`.
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPooling,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)
+ expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
+ if pixel_values.dtype != expected_dtype:
+ pixel_values = pixel_values.to(expected_dtype)
+
+ embedding_output = self.embeddings(
+ pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
+ )
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = encoder_outputs[0]
+ sequence_output = self.layernorm(sequence_output)
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+ if not return_dict:
+ head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
+ return head_outputs + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTPooler with ViT->DeiT
+class DeiTPooler(nn.Module):
+ def __init__(self, config: DeiTConfig):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.pooler_output_size)
+ self.activation = ACT2FN[config.pooler_act]
+
+ def forward(self, hidden_states):
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+@add_start_docstrings(
+ """DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).
+
+ Note that we provide a script to pre-train this model on custom data in our [examples
+ directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).
+
+ """,
+ DEIT_START_DOCSTRING,
+)
+class DeiTForMaskedImageModeling(DeiTPreTrainedModel):
+ def __init__(self, config: DeiTConfig) -> None:
+ super().__init__(config)
+
+ self.deit = DeiTModel(config, add_pooling_layer=False, use_mask_token=True)
+
+ self.decoder = nn.Sequential(
+ nn.Conv2d(
+ in_channels=config.hidden_size,
+ out_channels=config.encoder_stride**2 * config.num_channels,
+ kernel_size=1,
+ ),
+ nn.PixelShuffle(config.encoder_stride),
+ )
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=MaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ ) -> Union[tuple, MaskedImageModelingOutput]:
+ r"""
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+
+ Returns:
+
+ Examples:
+ ```python
+ >>> from transformers import AutoImageProcessor, DeiTForMaskedImageModeling
+ >>> import torch
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
+ >>> model = DeiTForMaskedImageModeling.from_pretrained("facebook/deit-base-distilled-patch16-224")
+
+ >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
+ >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
+ >>> # create random boolean mask of shape (batch_size, num_patches)
+ >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
+
+ >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
+ >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
+ >>> list(reconstructed_pixel_values.shape)
+ [1, 3, 224, 224]
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deit(
+ pixel_values,
+ bool_masked_pos=bool_masked_pos,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ )
+
+ sequence_output = outputs[0]
+
+ # Reshape to (batch_size, num_channels, height, width)
+ sequence_output = sequence_output[:, 1:-1]
+ batch_size, sequence_length, num_channels = sequence_output.shape
+ height = width = int(sequence_length**0.5)
+ sequence_output = sequence_output.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
+
+ # Reconstruct pixel values
+ reconstructed_pixel_values = self.decoder(sequence_output)
+
+ masked_im_loss = None
+ if bool_masked_pos is not None:
+ size = self.config.image_size // self.config.patch_size
+ bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
+ mask = (
+ bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
+ .repeat_interleave(self.config.patch_size, 2)
+ .unsqueeze(1)
+ .contiguous()
+ )
+ reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
+ masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels
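+            # In words: the boolean patch mask is upsampled to pixel resolution by repeating it
+            # patch_size times along height and width, the L1 reconstruction error is kept only
+            # on masked pixels, and the sum is normalized by the number of masked pixels (plus
+            # 1e-5 for numerical stability) and by the number of channels.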
+
+ if not return_dict:
+ output = (reconstructed_pixel_values,) + outputs[1:]
+ return ((masked_im_loss,) + output) if masked_im_loss is not None else output
+
+ return MaskedImageModelingOutput(
+ loss=masked_im_loss,
+ reconstruction=reconstructed_pixel_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
+ the [CLS] token) e.g. for ImageNet.
+ """,
+ DEIT_START_DOCSTRING,
+)
+class DeiTForImageClassification(DeiTPreTrainedModel):
+ def __init__(self, config: DeiTConfig) -> None:
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.deit = DeiTModel(config, add_pooling_layer=False)
+
+ # Classifier head
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ ) -> Union[tuple, ImageClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, DeiTForImageClassification
+ >>> import torch
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> torch.manual_seed(3) # doctest: +IGNORE_RESULT
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> # note: the checkpoint on the hub was trained as DeiTForImageClassificationWithTeacher,
+        >>> # so the classification head here will be randomly initialized, hence the predictions will be random
+ >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
+ >>> model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")
+
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ >>> # model predicts one of the 1000 ImageNet classes
+ >>> predicted_class_idx = logits.argmax(-1).item()
+ >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+ Predicted class: Polaroid camera, Polaroid Land camera
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deit(
+ pixel_values,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ )
+
+ sequence_output = outputs[0]
+
+ logits = self.classifier(sequence_output[:, 0, :])
+ # we don't use the distillation token
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return ImageClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@dataclass
+class DeiTForImageClassificationWithTeacherOutput(ModelOutput):
+ """
+ Output type of [`DeiTForImageClassificationWithTeacher`].
+
+ Args:
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+ Prediction scores as the average of the cls_logits and distillation logits.
+ cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+ Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
+ class token).
+ distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+ Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
+ distillation token).
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ the self-attention heads.
+ """
+
+ logits: Optional[torch.FloatTensor] = None
+ cls_logits: Optional[torch.FloatTensor] = None
+ distillation_logits: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@add_start_docstrings(
+ """
+ DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
+ the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.
+
+ .. warning::
+
+ This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
+ supported.
+ """,
+ DEIT_START_DOCSTRING,
+)
+class DeiTForImageClassificationWithTeacher(DeiTPreTrainedModel):
+ def __init__(self, config: DeiTConfig) -> None:
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.deit = DeiTModel(config, add_pooling_layer=False)
+
+ # Classifier heads
+ self.cls_classifier = (
+ nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+ )
+ self.distillation_classifier = (
+ nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+ )
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=DeiTForImageClassificationWithTeacherOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ ) -> Union[tuple, DeiTForImageClassificationWithTeacherOutput]:
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deit(
+ pixel_values,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ )
+
+ sequence_output = outputs[0]
+
+ cls_logits = self.cls_classifier(sequence_output[:, 0, :])
+ distillation_logits = self.distillation_classifier(sequence_output[:, 1, :])
+
+ # during inference, return the average of both classifier predictions
+ logits = (cls_logits + distillation_logits) / 2
+
+ if not return_dict:
+ output = (logits, cls_logits, distillation_logits) + outputs[1:]
+ return output
+
+ return DeiTForImageClassificationWithTeacherOutput(
+ logits=logits,
+ cls_logits=cls_logits,
+ distillation_logits=distillation_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = [
+ "DeiTForImageClassification",
+ "DeiTForImageClassificationWithTeacher",
+ "DeiTForMaskedImageModeling",
+ "DeiTModel",
+ "DeiTPreTrainedModel",
+]
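+
+# Illustrative inference sketch (an assumption added for documentation, not part of the
+# upstream module), mirroring the docstring examples above. Requesting
+# `attn_implementation="sdpa"` is valid here because the model sets `_supports_sdpa = True`.
+#
+# >>> import torch
+# >>> import requests
+# >>> from PIL import Image
+# >>> from transformers import AutoImageProcessor, DeiTModel
+# >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+# >>> image = Image.open(requests.get(url, stream=True).raw)
+# >>> processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
+# >>> model = DeiTModel.from_pretrained(
+# ...     "facebook/deit-base-distilled-patch16-224", attn_implementation="sdpa"
+# ... )
+# >>> inputs = processor(images=image, return_tensors="pt")
+# >>> with torch.no_grad():
+# ...     outputs = model(**inputs)
+# >>> list(outputs.last_hidden_state.shape)
+# [1, 198, 768]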
diff --git a/docs/transformers/build/lib/transformers/models/deit/modeling_tf_deit.py b/docs/transformers/build/lib/transformers/models/deit/modeling_tf_deit.py
new file mode 100644
index 0000000000000000000000000000000000000000..49c95268035a98c55e45071080ef748bce1d17ba
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deit/modeling_tf_deit.py
@@ -0,0 +1,1233 @@
+# coding=utf-8
+# Copyright 2022 Facebook AI Research (FAIR) and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TensorFlow DeiT model."""
+
+from __future__ import annotations
+
+import collections.abc
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+ TFBaseModelOutput,
+ TFBaseModelOutputWithPooling,
+ TFImageClassifierOutput,
+ TFMaskedImageModelingOutput,
+)
+from ...modeling_tf_utils import (
+ TFPreTrainedModel,
+ TFSequenceClassificationLoss,
+ get_initializer,
+ keras,
+ keras_serializable,
+ unpack_inputs,
+)
+from ...tf_utils import shape_list, stable_softmax
+from ...utils import (
+ ModelOutput,
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_deit import DeiTConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "DeiTConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "facebook/deit-base-distilled-patch16-224"
+_EXPECTED_OUTPUT_SHAPE = [1, 198, 768]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "facebook/deit-base-distilled-patch16-224"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
+
+
+@dataclass
+class TFDeiTForImageClassificationWithTeacherOutput(ModelOutput):
+ """
+ Output type of [`DeiTForImageClassificationWithTeacher`].
+
+ Args:
+ logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
+ Prediction scores as the average of the cls_logits and distillation logits.
+ cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
+ Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
+ class token).
+ distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
+ Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
+ distillation token).
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+ `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
+ the initial embedding outputs.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ the self-attention heads.
+ """
+
+ logits: Optional[tf.Tensor] = None
+ cls_logits: Optional[tf.Tensor] = None
+ distillation_logits: Optional[tf.Tensor] = None
+ hidden_states: Tuple[tf.Tensor] | None = None
+ attentions: Tuple[tf.Tensor] | None = None
+
+
+class TFDeiTEmbeddings(keras.layers.Layer):
+ """
+ Construct the CLS token, distillation token, position and patch embeddings. Optionally, also the mask token.
+ """
+
+ def __init__(self, config: DeiTConfig, use_mask_token: bool = False, **kwargs) -> None:
+ super().__init__(**kwargs)
+ self.config = config
+ self.use_mask_token = use_mask_token
+ self.patch_embeddings = TFDeiTPatchEmbeddings(config=config, name="patch_embeddings")
+ self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
+
+ def build(self, input_shape=None):
+ self.cls_token = self.add_weight(
+ shape=(1, 1, self.config.hidden_size),
+ initializer=keras.initializers.zeros(),
+ trainable=True,
+ name="cls_token",
+ )
+ self.distillation_token = self.add_weight(
+ shape=(1, 1, self.config.hidden_size),
+ initializer=keras.initializers.zeros(),
+ trainable=True,
+ name="distillation_token",
+ )
+ self.mask_token = None
+ if self.use_mask_token:
+ self.mask_token = self.add_weight(
+ shape=(1, 1, self.config.hidden_size),
+ initializer=keras.initializers.zeros(),
+ trainable=True,
+ name="mask_token",
+ )
+ num_patches = self.patch_embeddings.num_patches
+ self.position_embeddings = self.add_weight(
+ shape=(1, num_patches + 2, self.config.hidden_size),
+ initializer=keras.initializers.zeros(),
+ trainable=True,
+ name="position_embeddings",
+ )
+
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "patch_embeddings", None) is not None:
+ with tf.name_scope(self.patch_embeddings.name):
+ self.patch_embeddings.build(None)
+ if getattr(self, "dropout", None) is not None:
+ with tf.name_scope(self.dropout.name):
+ self.dropout.build(None)
+
+ def interpolate_pos_encoding(self, embeddings: tf.Tensor, height: int, width: int) -> tf.Tensor:
+ num_patches = embeddings.shape[1] - 2
+ num_positions = self.position_embeddings.shape[1] - 2
+
+ if num_patches == num_positions and height == width:
+ return self.position_embeddings
+
+ class_pos_embed = self.position_embeddings[:, 0, :]
+ dist_pos_embed = self.position_embeddings[:, 1, :]
+ patch_pos_embed = self.position_embeddings[:, 2:, :]
+ dim = embeddings.shape[-1]
+ h0 = height // self.config.patch_size
+ w0 = width // self.config.patch_size
+        # we add a small number to avoid floating point error in the interpolation
+        # see discussion at https://github.com/facebookresearch/dino/issues/8
+ h0, w0 = h0 + 0.1, w0 + 0.1
+ patch_pos_embed = tf.reshape(
+ patch_pos_embed, (1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
+ )
+ patch_pos_embed = tf.image.resize(patch_pos_embed, size=(int(h0), int(w0)), method="bicubic")
+ patch_pos_embed = tf.transpose(patch_pos_embed, perm=[0, 2, 3, 1])
+ patch_pos_embed = tf.reshape(patch_pos_embed, (1, -1, dim))
+
+ return tf.concat(
+ [tf.expand_dims(class_pos_embed, axis=0), tf.expand_dims(dist_pos_embed, axis=0), patch_pos_embed], axis=1
+ )
+
+ def call(
+ self,
+ pixel_values: tf.Tensor,
+ bool_masked_pos: tf.Tensor | None = None,
+ training: bool = False,
+ interpolate_pos_encoding: bool = False,
+ ) -> tf.Tensor:
+ _, height, width, _ = pixel_values.shape
+
+ embeddings = self.patch_embeddings(pixel_values)
+ batch_size, seq_length, _ = shape_list(embeddings)
+
+ if bool_masked_pos is not None:
+ mask_tokens = tf.tile(self.mask_token, [batch_size, seq_length, 1])
+ # replace the masked visual tokens by mask_tokens
+ mask = tf.expand_dims(bool_masked_pos, axis=-1)
+ mask = tf.cast(mask, dtype=mask_tokens.dtype)
+ embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+ cls_tokens = tf.repeat(self.cls_token, repeats=batch_size, axis=0)
+ distillation_tokens = tf.repeat(self.distillation_token, repeats=batch_size, axis=0)
+ embeddings = tf.concat((cls_tokens, distillation_tokens, embeddings), axis=1)
+ position_embedding = self.position_embeddings
+ if interpolate_pos_encoding:
+ position_embedding = self.interpolate_pos_encoding(embeddings, height, width)
+
+ embeddings = embeddings + position_embedding
+ embeddings = self.dropout(embeddings, training=training)
+ return embeddings
+
+
+class TFDeiTPatchEmbeddings(keras.layers.Layer):
+ """
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+ Transformer.
+ """
+
+ def __init__(self, config: DeiTConfig, **kwargs) -> None:
+ super().__init__(**kwargs)
+ image_size, patch_size = config.image_size, config.patch_size
+ num_channels, hidden_size = config.num_channels, config.hidden_size
+
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.num_patches = num_patches
+
+ self.projection = keras.layers.Conv2D(
+ hidden_size, kernel_size=patch_size, strides=patch_size, name="projection"
+ )
+
+ def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
+ batch_size, height, width, num_channels = shape_list(pixel_values)
+ if tf.executing_eagerly() and num_channels != self.num_channels:
+ raise ValueError(
+                "Make sure that the channel dimension of the pixel values matches the one set in the configuration."
+ )
+
+ x = self.projection(pixel_values)
+ batch_size, height, width, num_channels = shape_list(x)
+ x = tf.reshape(x, (batch_size, height * width, num_channels))
+ return x
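+
+    # Shape walkthrough for the default configuration: by the time they reach this layer the
+    # pixel values are channels-last, i.e. (B, 224, 224, 3); the Conv2D with kernel = stride =
+    # patch size yields (B, 14, 14, 768) and the reshape flattens the grid to (B, 196, 768).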
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "projection", None) is not None:
+ with tf.name_scope(self.projection.name):
+ self.projection.build([None, None, None, self.num_channels])
+
+
+# Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfAttention with ViT->DeiT
+class TFDeiTSelfAttention(keras.layers.Layer):
+ def __init__(self, config: DeiTConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ if config.hidden_size % config.num_attention_heads != 0:
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+ f"of attention heads ({config.num_attention_heads})"
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+ self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
+
+ self.query = keras.layers.Dense(
+ units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+ )
+ self.key = keras.layers.Dense(
+ units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+ )
+ self.value = keras.layers.Dense(
+ units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+ )
+ self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
+ self.config = config
+
+ def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
+ # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
+ tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
+
+ # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
+ return tf.transpose(tensor, perm=[0, 2, 1, 3])
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ head_mask: tf.Tensor,
+ output_attentions: bool,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ batch_size = shape_list(hidden_states)[0]
+ mixed_query_layer = self.query(inputs=hidden_states)
+ mixed_key_layer = self.key(inputs=hidden_states)
+ mixed_value_layer = self.value(inputs=hidden_states)
+ query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
+ key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
+ value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ # (batch size, num_heads, seq_len_q, seq_len_k)
+ attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+ dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
+ attention_scores = tf.divide(attention_scores, dk)
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = stable_softmax(logits=attention_scores, axis=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(inputs=attention_probs, training=training)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = tf.multiply(attention_probs, head_mask)
+
+ attention_output = tf.matmul(attention_probs, value_layer)
+ attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
+
+ # (batch_size, seq_len_q, all_head_size)
+ attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
+ outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
+
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "query", None) is not None:
+ with tf.name_scope(self.query.name):
+ self.query.build([None, None, self.config.hidden_size])
+ if getattr(self, "key", None) is not None:
+ with tf.name_scope(self.key.name):
+ self.key.build([None, None, self.config.hidden_size])
+ if getattr(self, "value", None) is not None:
+ with tf.name_scope(self.value.name):
+ self.value.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfOutput with ViT->DeiT
+class TFDeiTSelfOutput(keras.layers.Layer):
+ """
+ The residual connection is defined in TFDeiTLayer instead of here (as is the case with other models), due to the
+ layernorm applied before each block.
+ """
+
+ def __init__(self, config: DeiTConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = keras.layers.Dense(
+ units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+ self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.dropout(inputs=hidden_states, training=training)
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.vit.modeling_tf_vit.TFViTAttention with ViT->DeiT
+class TFDeiTAttention(keras.layers.Layer):
+ def __init__(self, config: DeiTConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.self_attention = TFDeiTSelfAttention(config, name="attention")
+ self.dense_output = TFDeiTSelfOutput(config, name="output")
+
+ def prune_heads(self, heads):
+ raise NotImplementedError
+
+ def call(
+ self,
+ input_tensor: tf.Tensor,
+ head_mask: tf.Tensor,
+ output_attentions: bool,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ self_outputs = self.self_attention(
+ hidden_states=input_tensor, head_mask=head_mask, output_attentions=output_attentions, training=training
+ )
+ attention_output = self.dense_output(
+ hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
+ )
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "self_attention", None) is not None:
+ with tf.name_scope(self.self_attention.name):
+ self.self_attention.build(None)
+ if getattr(self, "dense_output", None) is not None:
+ with tf.name_scope(self.dense_output.name):
+ self.dense_output.build(None)
+
+
+# Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->DeiT
+class TFDeiTIntermediate(keras.layers.Layer):
+ def __init__(self, config: DeiTConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = keras.layers.Dense(
+ units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+ else:
+ self.intermediate_act_fn = config.hidden_act
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.vit.modeling_tf_vit.TFViTOutput with ViT->DeiT
+class TFDeiTOutput(keras.layers.Layer):
+ def __init__(self, config: DeiTConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = keras.layers.Dense(
+ units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+ self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_states = self.dense(inputs=hidden_states)
+ hidden_states = self.dropout(inputs=hidden_states, training=training)
+ hidden_states = hidden_states + input_tensor
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.intermediate_size])
+
+
+class TFDeiTLayer(keras.layers.Layer):
+ """This corresponds to the Block class in the timm implementation."""
+
+ def __init__(self, config: DeiTConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.attention = TFDeiTAttention(config, name="attention")
+ self.intermediate = TFDeiTIntermediate(config, name="intermediate")
+ self.deit_output = TFDeiTOutput(config, name="output")
+
+ self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before")
+ self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after")
+ self.config = config
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ head_mask: tf.Tensor,
+ output_attentions: bool,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ attention_outputs = self.attention(
+ # in DeiT, layernorm is applied before self-attention
+ input_tensor=self.layernorm_before(inputs=hidden_states, training=training),
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ training=training,
+ )
+ attention_output = attention_outputs[0]
+
+ # first residual connection
+ hidden_states = attention_output + hidden_states
+
+ # in DeiT, layernorm is also applied after self-attention
+ layer_output = self.layernorm_after(inputs=hidden_states, training=training)
+
+ intermediate_output = self.intermediate(hidden_states=layer_output, training=training)
+
+ # second residual connection is done here
+ layer_output = self.deit_output(
+ hidden_states=intermediate_output, input_tensor=hidden_states, training=training
+ )
+ outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them
+
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "attention", None) is not None:
+ with tf.name_scope(self.attention.name):
+ self.attention.build(None)
+ if getattr(self, "intermediate", None) is not None:
+ with tf.name_scope(self.intermediate.name):
+ self.intermediate.build(None)
+ if getattr(self, "deit_output", None) is not None:
+ with tf.name_scope(self.deit_output.name):
+ self.deit_output.build(None)
+ if getattr(self, "layernorm_before", None) is not None:
+ with tf.name_scope(self.layernorm_before.name):
+ self.layernorm_before.build([None, None, self.config.hidden_size])
+ if getattr(self, "layernorm_after", None) is not None:
+ with tf.name_scope(self.layernorm_after.name):
+ self.layernorm_after.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.vit.modeling_tf_vit.TFViTEncoder with ViT->DeiT
+class TFDeiTEncoder(keras.layers.Layer):
+ def __init__(self, config: DeiTConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.layer = [TFDeiTLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ head_mask: tf.Tensor,
+ output_attentions: bool,
+ output_hidden_states: bool,
+ return_dict: bool,
+ training: bool = False,
+ ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
+ all_hidden_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_outputs = layer_module(
+ hidden_states=hidden_states,
+ head_mask=head_mask[i],
+ output_attentions=output_attentions,
+ training=training,
+ )
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ # Add last layer
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
+
+ return TFBaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "layer", None) is not None:
+ for layer in self.layer:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+
+@keras_serializable
+class TFDeiTMainLayer(keras.layers.Layer):
+ config_class = DeiTConfig
+
+ def __init__(
+ self, config: DeiTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False, **kwargs
+ ) -> None:
+ super().__init__(**kwargs)
+ self.config = config
+
+ self.embeddings = TFDeiTEmbeddings(config, use_mask_token=use_mask_token, name="embeddings")
+ self.encoder = TFDeiTEncoder(config, name="encoder")
+
+ self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+ self.pooler = TFDeiTPooler(config, name="pooler") if add_pooling_layer else None
+
+ def get_input_embeddings(self) -> TFDeiTPatchEmbeddings:
+ return self.embeddings.patch_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ raise NotImplementedError
+
+ def get_head_mask(self, head_mask):
+ if head_mask is not None:
+ raise NotImplementedError
+ else:
+ head_mask = [None] * self.config.num_hidden_layers
+
+ return head_mask
+
+ @unpack_inputs
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ bool_masked_pos: tf.Tensor | None = None,
+ head_mask: tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ training: bool = False,
+ ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor, ...]]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ # TF 2.0 image layers can't use NCHW format when running on CPU.
+ # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
+ pixel_values = tf.transpose(pixel_values, (0, 2, 3, 1))
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask)
+
+ embedding_output = self.embeddings(
+ pixel_values,
+ bool_masked_pos=bool_masked_pos,
+ training=training,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ )
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ sequence_output = encoder_outputs[0]
+ sequence_output = self.layernorm(sequence_output, training=training)
+ pooled_output = self.pooler(sequence_output, training=training) if self.pooler is not None else None
+
+ if not return_dict:
+ head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
+ return head_outputs + encoder_outputs[1:]
+
+ return TFBaseModelOutputWithPooling(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "embeddings", None) is not None:
+ with tf.name_scope(self.embeddings.name):
+ self.embeddings.build(None)
+ if getattr(self, "encoder", None) is not None:
+ with tf.name_scope(self.encoder.name):
+ self.encoder.build(None)
+ if getattr(self, "layernorm", None) is not None:
+ with tf.name_scope(self.layernorm.name):
+ self.layernorm.build([None, None, self.config.hidden_size])
+ if getattr(self, "pooler", None) is not None:
+ with tf.name_scope(self.pooler.name):
+ self.pooler.build(None)
+
+
+# Copied from transformers.models.vit.modeling_tf_vit.TFViTPreTrainedModel with ViT->DeiT all-casing
+class TFDeiTPreTrainedModel(TFPreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = DeiTConfig
+ base_model_prefix = "deit"
+ main_input_name = "pixel_values"
+
+
+DEIT_START_DOCSTRING = r"""
+ This model is a TensorFlow
+ [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular
+ TensorFlow Module and refer to the TensorFlow documentation for all matters related to general usage and behavior.
+
+ Parameters:
+ config ([`DeiTConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DEIT_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`DeiTImageProcessor.__call__`] for details.
+
+ head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare DeiT Model transformer outputting raw hidden-states without any specific head on top.",
+ DEIT_START_DOCSTRING,
+)
+class TFDeiTModel(TFDeiTPreTrainedModel):
+ def __init__(
+ self, config: DeiTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False, **kwargs
+ ) -> None:
+ super().__init__(config, **kwargs)
+
+ self.deit = TFDeiTMainLayer(
+ config, add_pooling_layer=add_pooling_layer, use_mask_token=use_mask_token, name="deit"
+ )
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFBaseModelOutputWithPooling,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ bool_masked_pos: tf.Tensor | None = None,
+ head_mask: tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ training: bool = False,
+ ) -> Union[Tuple, TFBaseModelOutputWithPooling]:
+ outputs = self.deit(
+ pixel_values=pixel_values,
+ bool_masked_pos=bool_masked_pos,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ training=training,
+ )
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deit", None) is not None:
+ with tf.name_scope(self.deit.name):
+ self.deit.build(None)
+
+
+# Copied from transformers.models.vit.modeling_tf_vit.TFViTPooler with ViT->DeiT
+class TFDeiTPooler(keras.layers.Layer):
+ def __init__(self, config: DeiTConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = keras.layers.Dense(
+ units=config.pooler_output_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ activation=config.pooler_act,
+ name="dense",
+ )
+ self.config = config
+
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(inputs=first_token_tensor)
+
+ return pooled_output
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "dense", None) is not None:
+ with tf.name_scope(self.dense.name):
+ self.dense.build([None, None, self.config.hidden_size])
+
+
+class TFDeitPixelShuffle(keras.layers.Layer):
+ """TF layer implementation of torch.nn.PixelShuffle"""
+
+ def __init__(self, upscale_factor: int, **kwargs) -> None:
+ super().__init__(**kwargs)
+ if not isinstance(upscale_factor, int) or upscale_factor < 2:
+ raise ValueError(f"upscale_factor must be an integer value >= 2 got {upscale_factor}")
+ self.upscale_factor = upscale_factor
+
+ def call(self, x: tf.Tensor) -> tf.Tensor:
+ hidden_states = x
+ batch_size, _, _, num_input_channels = shape_list(hidden_states)
+ block_size_squared = self.upscale_factor**2
+ output_depth = int(num_input_channels / block_size_squared)
+ # When the number of output channels >= 2, PyTorch's PixelShuffle and
+ # TF's depth_to_space differ in their output as the order of channels selected for combining
+ # is a permutation of the other c.f.
+ # https://stackoverflow.com/questions/68272502/tf-depth-to-space-not-same-as-torchs-pixelshuffle-when-output-channels-1
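+ # For example, with upscale_factor=2 and 8 input channels (output_depth=2), the permutation
+ # below is [0, 4, 1, 5, 2, 6, 3, 7]: channels are regrouped so that depth_to_space
+ # reproduces PixelShuffle's channel ordering.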
+ permutation = tf.constant(
+ [[i + j * block_size_squared for i in range(block_size_squared) for j in range(output_depth)]]
+ )
+ hidden_states = tf.gather(params=hidden_states, indices=tf.tile(permutation, [batch_size, 1]), batch_dims=-1)
+ hidden_states = tf.nn.depth_to_space(hidden_states, block_size=self.upscale_factor, data_format="NHWC")
+ return hidden_states
+
+
+class TFDeitDecoder(keras.layers.Layer):
+ def __init__(self, config: DeiTConfig, **kwargs) -> None:
+ super().__init__(**kwargs)
+ self.conv2d = keras.layers.Conv2D(
+ filters=config.encoder_stride**2 * config.num_channels, kernel_size=1, name="0"
+ )
+ self.pixel_shuffle = TFDeitPixelShuffle(config.encoder_stride, name="1")
+ self.config = config
+
+ def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_states = inputs
+ hidden_states = self.conv2d(hidden_states)
+ hidden_states = self.pixel_shuffle(hidden_states)
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "conv2d", None) is not None:
+ with tf.name_scope(self.conv2d.name):
+ self.conv2d.build([None, None, None, self.config.hidden_size])
+ if getattr(self, "pixel_shuffle", None) is not None:
+ with tf.name_scope(self.pixel_shuffle.name):
+ self.pixel_shuffle.build(None)
+
+
+@add_start_docstrings(
+ "DeiT Model with a decoder on top for masked image modeling, as proposed in"
+ " [SimMIM](https://arxiv.org/abs/2111.09886).",
+ DEIT_START_DOCSTRING,
+)
+class TFDeiTForMaskedImageModeling(TFDeiTPreTrainedModel):
+ def __init__(self, config: DeiTConfig) -> None:
+ super().__init__(config)
+
+ self.deit = TFDeiTMainLayer(config, add_pooling_layer=False, use_mask_token=True, name="deit")
+ self.decoder = TFDeitDecoder(config, name="decoder")
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TFMaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ bool_masked_pos: tf.Tensor | None = None,
+ head_mask: tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ training: bool = False,
+ ) -> Union[tuple, TFMaskedImageModelingOutput]:
+ r"""
+ bool_masked_pos (`tf.Tensor` of type bool and shape `(batch_size, num_patches)`):
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+
+ Returns:
+
+ Examples:
+ ```python
+ >>> from transformers import AutoImageProcessor, TFDeiTForMaskedImageModeling
+ >>> import tensorflow as tf
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
+ >>> model = TFDeiTForMaskedImageModeling.from_pretrained("facebook/deit-base-distilled-patch16-224")
+
+ >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
+ >>> pixel_values = image_processor(images=image, return_tensors="tf").pixel_values
+ >>> # create random boolean mask of shape (batch_size, num_patches)
+ >>> bool_masked_pos = tf.cast(tf.random.uniform((1, num_patches), minval=0, maxval=2, dtype=tf.int32), tf.bool)
+
+ >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
+ >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
+ >>> list(reconstructed_pixel_values.shape)
+ [1, 3, 224, 224]
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deit(
+ pixel_values,
+ bool_masked_pos=bool_masked_pos,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ training=training,
+ )
+
+ sequence_output = outputs[0]
+
+ # Drop the special tokens and reshape to (batch_size, height, width, num_channels)
+ sequence_output = sequence_output[:, 1:-1]
+ batch_size, sequence_length, num_channels = shape_list(sequence_output)
+ height = width = int(sequence_length**0.5)
+ sequence_output = tf.reshape(sequence_output, (batch_size, height, width, num_channels))
+
+ # Reconstruct pixel values
+ reconstructed_pixel_values = self.decoder(sequence_output, training=training)
+ # TF 2.0 image layers can't use NCHW format when running on CPU, so intermediate layers use NHWC,
+ # including the decoder. We transpose to compute the loss against the pixel values
+ # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
+ reconstructed_pixel_values = tf.transpose(reconstructed_pixel_values, (0, 3, 1, 2))
+
+ masked_im_loss = None
+ if bool_masked_pos is not None:
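+ # bool_masked_pos is a per-patch mask; repeat it patch_size times along height and
+ # width so it lines up with the per-pixel reconstruction loss computed below.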
+ size = self.config.image_size // self.config.patch_size
+ bool_masked_pos = tf.reshape(bool_masked_pos, (-1, size, size))
+ mask = tf.repeat(bool_masked_pos, self.config.patch_size, 1)
+ mask = tf.repeat(mask, self.config.patch_size, 2)
+ mask = tf.expand_dims(mask, 1)
+ mask = tf.cast(mask, tf.float32)
+
+ reconstruction_loss = keras.losses.mean_absolute_error(
+ # Swap axes as metric calculation reduces over the final dimension
+ tf.transpose(pixel_values, (1, 2, 3, 0)),
+ tf.transpose(reconstructed_pixel_values, (1, 2, 3, 0)),
+ )
+ reconstruction_loss = tf.expand_dims(reconstruction_loss, 0)
+ total_loss = tf.reduce_sum(reconstruction_loss * mask)
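+ # the 1e-5 term guards against division by zero when no patches are masked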
+ num_masked_pixels = (tf.reduce_sum(mask) + 1e-5) * self.config.num_channels
+ masked_im_loss = total_loss / num_masked_pixels
+ masked_im_loss = tf.reshape(masked_im_loss, (1,))
+
+ if not return_dict:
+ output = (reconstructed_pixel_values,) + outputs[1:]
+ return ((masked_im_loss,) + output) if masked_im_loss is not None else output
+
+ return TFMaskedImageModelingOutput(
+ loss=masked_im_loss,
+ reconstruction=reconstructed_pixel_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deit", None) is not None:
+ with tf.name_scope(self.deit.name):
+ self.deit.build(None)
+ if getattr(self, "decoder", None) is not None:
+ with tf.name_scope(self.decoder.name):
+ self.decoder.build(None)
+
+
+@add_start_docstrings(
+ """
+ DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
+ the [CLS] token) e.g. for ImageNet.
+ """,
+ DEIT_START_DOCSTRING,
+)
+class TFDeiTForImageClassification(TFDeiTPreTrainedModel, TFSequenceClassificationLoss):
+ def __init__(self, config: DeiTConfig):
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.deit = TFDeiTMainLayer(config, add_pooling_layer=False, name="deit")
+
+ # Classifier head
+ self.classifier = (
+ keras.layers.Dense(config.num_labels, name="classifier")
+ if config.num_labels > 0
+ else keras.layers.Activation("linear", name="classifier")
+ )
+ self.config = config
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TFImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ head_mask: tf.Tensor | None = None,
+ labels: tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ training: bool = False,
+ ) -> Union[tf.Tensor, TFImageClassifierOutput]:
+ r"""
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, TFDeiTForImageClassification
+ >>> import tensorflow as tf
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> tf.keras.utils.set_random_seed(3) # doctest: +IGNORE_RESULT
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> # note: the checkpoint on the hub was trained as a TFDeiTForImageClassificationWithTeacher model,
+ >>> # so the classification head loaded here is randomly initialized, hence the predictions will be random
+ >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
+ >>> model = TFDeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")
+
+ >>> inputs = image_processor(images=image, return_tensors="tf")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ >>> # model predicts one of the 1000 ImageNet classes
+ >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
+ >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
+ Predicted class: little blue heron, Egretta caerulea
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deit(
+ pixel_values,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ training=training,
+ )
+
+ sequence_output = outputs[0]
+
+ logits = self.classifier(sequence_output[:, 0, :])
+ # we don't use the distillation token
+
+ loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFImageClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deit", None) is not None:
+ with tf.name_scope(self.deit.name):
+ self.deit.build(None)
+ if getattr(self, "classifier", None) is not None:
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+ """
+ DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
+ the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.
+
+ .. warning::
+
+ This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
+ supported.
+ """,
+ DEIT_START_DOCSTRING,
+)
+class TFDeiTForImageClassificationWithTeacher(TFDeiTPreTrainedModel):
+ def __init__(self, config: DeiTConfig) -> None:
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.deit = TFDeiTMainLayer(config, add_pooling_layer=False, name="deit")
+
+ # Classifier heads
+ self.cls_classifier = (
+ keras.layers.Dense(config.num_labels, name="cls_classifier")
+ if config.num_labels > 0
+ else keras.layers.Activation("linear", name="cls_classifier")
+ )
+ self.distillation_classifier = (
+ keras.layers.Dense(config.num_labels, name="distillation_classifier")
+ if config.num_labels > 0
+ else keras.layers.Activation("linear", name="distillation_classifier")
+ )
+ self.config = config
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=TFDeiTForImageClassificationWithTeacherOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def call(
+ self,
+ pixel_values: tf.Tensor | None = None,
+ head_mask: tf.Tensor | None = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ training: bool = False,
+ ) -> Union[tuple, TFDeiTForImageClassificationWithTeacherOutput]:
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.deit(
+ pixel_values,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ training=training,
+ )
+
+ sequence_output = outputs[0]
+
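+ # position 0 holds the [CLS] token and position 1 the distillation token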
+ cls_logits = self.cls_classifier(sequence_output[:, 0, :])
+ distillation_logits = self.distillation_classifier(sequence_output[:, 1, :])
+
+ # during inference, return the average of both classifier predictions
+ logits = (cls_logits + distillation_logits) / 2
+
+ if not return_dict:
+ output = (logits, cls_logits, distillation_logits) + outputs[1:]
+ return output
+
+ return TFDeiTForImageClassificationWithTeacherOutput(
+ logits=logits,
+ cls_logits=cls_logits,
+ distillation_logits=distillation_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "deit", None) is not None:
+ with tf.name_scope(self.deit.name):
+ self.deit.build(None)
+ if getattr(self, "cls_classifier", None) is not None:
+ with tf.name_scope(self.cls_classifier.name):
+ self.cls_classifier.build([None, None, self.config.hidden_size])
+ if getattr(self, "distillation_classifier", None) is not None:
+ with tf.name_scope(self.distillation_classifier.name):
+ self.distillation_classifier.build([None, None, self.config.hidden_size])
+
+
+__all__ = [
+ "TFDeiTForImageClassification",
+ "TFDeiTForImageClassificationWithTeacher",
+ "TFDeiTForMaskedImageModeling",
+ "TFDeiTModel",
+ "TFDeiTPreTrainedModel",
+]
diff --git a/docs/transformers/build/lib/transformers/models/deprecated/__init__.py b/docs/transformers/build/lib/transformers/models/deprecated/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e293c354e1e92a431a601da77d7555f2ecfe29ef
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/deprecated/__init__.py
@@ -0,0 +1,49 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .bort import *
+ from .deta import *
+ from .efficientformer import *
+ from .ernie_m import *
+ from .gptsan_japanese import *
+ from .graphormer import *
+ from .jukebox import *
+ from .mctct import *
+ from .mega import *
+ from .mmbt import *
+ from .nat import *
+ from .nezha import *
+ from .open_llama import *
+ from .qdqbert import *
+ from .realm import *
+ from .retribert import *
+ from .speech_to_text_2 import *
+ from .tapex import *
+ from .trajectory_transformer import *
+ from .transfo_xl import *
+ from .tvlt import *
+ from .van import *
+ from .vit_hybrid import *
+ from .xlm_prophetnet import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/filter_duration.py b/filter_duration.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bfbe2c1d3150430d4230481615af30b18cbc893
--- /dev/null
+++ b/filter_duration.py
@@ -0,0 +1,129 @@
+import json
+import torchaudio
+from tqdm import tqdm
+import os
+import sys
+from collections import defaultdict
+
+def filter_audio_duration(jsonl_path, max_duration=90.0):
+ """筛选JSONL文件中时长在指定秒数以下的音频文件"""
+ stats = defaultdict(int)
+ filtered_data = []
+ error_log = []
+
+ # First pass: count the total number of lines (for the progress bar)
+ with open(jsonl_path, 'r') as f:
+ total_lines = sum(1 for _ in f)
+
+ # Second pass: do the actual filtering
+ with open(jsonl_path, 'r') as f:
+ for line_num, line in enumerate(tqdm(f, total=total_lines, desc="Filtering", unit="line")):
+ try:
+ data = json.loads(line.strip())
+ if 'audios' not in data or not data['audios']:
+ stats['no_audio_field'] += 1
+ continue
+
+ valid_audios = []
+ all_audios_valid = True
+
+ for audio_path in data['audios']:
+ # Check that the file exists
+ if not os.path.exists(audio_path):
+ stats['missing'] += 1
+ error_log.append(f"[行{line_num+1}] 缺失文件: {audio_path}")
+ all_audios_valid = False
+ continue
+
+ # Check the file size
+ if os.path.getsize(audio_path) == 0:
+ stats['zero_size'] += 1
+ error_log.append(f"[行{line_num+1}] 空文件: {audio_path}")
+ all_audios_valid = False
+ continue
+
+ # Validate the audio content and duration
+ try:
+ waveform, sr = torchaudio.load(audio_path)
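+ # torchaudio.load returns a (num_channels, num_frames) tensor and the sample rate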
+ if waveform.numel() == 0:
+ stats['empty_audio'] += 1
+ error_log.append(f"[行{line_num+1}] 空音频: {audio_path}")
+ all_audios_valid = False
+ continue
+
+ # Compute the audio duration in seconds
+ duration = waveform.shape[1] / sr
+
+ if duration > max_duration:
+ stats['too_long'] += 1
+ error_log.append(f"[行{line_num+1}] 时长过长({duration:.2f}s): {audio_path}")
+ all_audios_valid = False
+ continue
+ else:
+ stats['valid'] += 1
+ valid_audios.append(audio_path)
+
+ except Exception as e:
+ stats['corrupted'] += 1
+ error_type = str(e).split('(')[0]
+ error_log.append(f"[行{line_num+1}] 损坏文件({error_type}): {audio_path}")
+ all_audios_valid = False
+ continue
+
+ # Keep the sample only if every audio file is valid and within the duration limit
+ if all_audios_valid and valid_audios:
+ # Replace the audios field with the filtered list of audio paths
+ data['audios'] = valid_audios
+ filtered_data.append(data)
+ stats['kept'] += 1
+ else:
+ stats['filtered_out'] += 1
+
+ except json.JSONDecodeError:
+ stats['invalid_json'] += 1
+ error_log.append(f"[行{line_num+1}] 无效JSON格式")
+
+ # Save the filtered data
+ output_path = f"{os.path.splitext(jsonl_path)[0]}_filtered_{max_duration}s.jsonl"
+ with open(output_path, 'w', encoding='utf-8') as f:
+ for data in filtered_data:
+ f.write(json.dumps(data, ensure_ascii=False) + '\n')
+
+ # Print a summary report
+ print("\n===== Filtering report =====")
+ print(f"Max duration limit: {max_duration} seconds")
+ print(f"Total lines: {total_lines}")
+ print(f"Samples kept: {stats['kept']}")
+ print(f"Samples filtered out: {stats['filtered_out']}")
+ print("--- Detailed statistics ---")
+ for k, v in sorted(stats.items()):
+ print(f"{k}: {v}")
+
+ # Save the error log
+ if error_log:
+ log_file = f"{os.path.splitext(jsonl_path)[0]}_duration_filter_errors.log"
+ with open(log_file, 'w', encoding='utf-8') as f:
+ f.write("\n".join(error_log))
+ print(f"\n发现 {len(error_log)} 个问题,已保存到 {log_file}")
+
+ print(f"\n筛选后的数据已保存到: {output_path}")
+
+if __name__ == "__main__":
+ if len(sys.argv) < 2:
+ print("使用方法: python filter_duration.py [max_duration]")
+ print("默认最大时长: 100秒")
+ sys.exit(1)
+
+ if not os.path.exists(sys.argv[1]):
+ print(f"错误: 文件 {sys.argv[1]} 不存在")
+ sys.exit(1)
+
+ max_duration = 90.0
+ if len(sys.argv) >= 3:
+ try:
+ max_duration = float(sys.argv[2])
+ except ValueError:
+ print(f"错误: 无效的时长参数 {sys.argv[2]}")
+ sys.exit(1)
+
+ filter_audio_duration(sys.argv[1], max_duration)
\ No newline at end of file